/*	$NetBSD: nfs_nfsdcache.c,v 1.5 2024/07/05 04:31:52 rin Exp $	*/
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/* __FBSDID("FreeBSD: head/sys/fs/nfsserver/nfs_nfsdcache.c 304026 2016-08-12 22:44:59Z rmacklem "); */
__RCSID("$NetBSD: nfs_nfsdcache.c,v 1.5 2024/07/05 04:31:52 rin Exp $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 * When a request arrives:
 * For all that match key
 * - if RPC# != OR request_size !=
 *   - not a match with this one
 * - if NFSv4 and received on same TCP socket OR
 *   received on a TCP connection created before the
 *   entry was cached
 *   - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 * - calculate checksum on first N bytes of NFS XDR
 * - if checksum !=
 *   - not a match for this one
 * If any of the remaining ones that match has a
 * seqid_refcnt > 0
 *   - not a match (go do RPC, using new cache entry)
 * If one match left
 *   - a hit (reply from cache)
 * else
 *   - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *   else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *   else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 *   else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *      when Openowner/Lockowner is discarded)
 *   else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *        this. Maybe look at the TCP connection
 *        to see if the send_tcp_sequence# is well
 *        past sent reply OR K additional RPCs
 *        replied on same socket OR?)
 *     OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *   - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *            of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *            pages 53-63. San Diego, February 1989.
 *            for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP. For V3, a reply won't be saved when the flood level is
 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case. This level should be set high enough that this almost
 * never happens.
 */
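
/*
 * Entry points: nfsrvd_getcache() looks an incoming request up in the cache
 * (and may answer it from a saved reply), nfsrvd_updatecache() records the
 * reply once the RPC has been serviced, and nfsrc_trimcache() expires old
 * entries.
 */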
#ifndef APPLEKEXT
#include <fs/nfs/common/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

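/*
 * Sysctl handler for vfs.nfsd.tcphighwater. If the new high water mark
 * reaches nfsrc_floodlevel, the flood level is raised to roughly 20% above
 * it so that replies can still be saved.
 */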
static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * and the reverse mapping from generic to Version 2 procedure numbers
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

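/*
 * Hash an xid into one of the NFSRVCACHE_HASHSIZE buckets, folding the
 * high-order byte in so that xids differing only in their upper bits do
 * not all land in the same bucket.
 */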
#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
	(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
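	/*
	 * The socket reference and connection time recorded above are used
	 * by nfsrc_gettcp() to reject an NFSv4 hit when the retry arrives
	 * on the same socket, or on a connection created before the entry
	 * was cached.
	 */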
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
		if (newrp->rc_xid == rp->rc_xid &&
		    newrp->rc_proc == rp->rc_proc &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
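			/*
			 * A retry arrived while the original request is
			 * still being serviced, so just drop it.
			 */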
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
				    M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
				    NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
				atomic_add_int(&nfsrc_tcpsavedreplies, 1);
				if (nfsrc_tcpsavedreplies >
				    nfsstatsv1.srvcache_tcppeak)
					nfsstatsv1.srvcache_tcppeak =
					    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
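	/*
	 * When the TCP sequence number for the end of the reply is known,
	 * record it so that nfsrc_trimcache() can free this entry once the
	 * client has acknowledged receiving the reply.
	 */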
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != NULL)
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
			    M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define	HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

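	/*
	 * Mark entries for this socket whose replies the client has now
	 * acknowledged, so the scans below can free them early; on the
	 * final call the remaining entries are marked RC_NACK and left to
	 * time out.
	 */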
	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

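	/* Only one thread trims the cache at a time; any others just return. */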
	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
