/*	$NetBSD: nfs_nfsdcache.c,v 1.5 2024/07/05 04:31:52 rin Exp $	*/
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/* __FBSDID("FreeBSD: head/sys/fs/nfsserver/nfs_nfsdcache.c 304026 2016-08-12 22:44:59Z rmacklem "); */
__RCSID("$NetBSD: nfs_nfsdcache.c,v 1.5 2024/07/05 04:31:52 rin Exp $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 * 	(A compiled-out sketch of this match test follows this comment.)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are freed a short time (a few minutes)
 * after the reply is sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *		of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *		pages 53-63. San Diego, February 1989.
 *	 for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
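
/*
 * Illustrative sketch (not part of the server, compiled out): a userland
 * model of the TCP match test described above.  The structure and its
 * field names are hypothetical stand-ins for the corresponding fields of
 * struct nfsrvcache; the real test in nfsrc_gettcp() operates on that
 * structure directly.
 */
#if 0
#include <stdint.h>
#include <time.h>

struct drc_example_entry {
	uint32_t xid;		/* RPC transaction id */
	int	 vers;		/* NFS protocol version (2, 3 or 4) */
	int	 proc;		/* RPC procedure number */
	uint64_t sockref;	/* TCP socket the request arrived on */
	time_t	 cachetime;	/* request: TCP connection creation time;
				 * cached entry: time the entry was cached */
	int	 reqlen;	/* length of the XDR request in bytes */
	uint16_t cksum;		/* checksum on first N bytes of the request */
};

/*
 * Return nonzero when a new request may match a cached entry, following
 * the rules above: same xid, version, procedure, request length and
 * checksum; for NFSv4, never on the same TCP socket nor on a connection
 * created before the entry was cached (V2,3 clients might retry on the
 * same socket).
 */
static int
drc_example_match(const struct drc_example_entry *old,
    const struct drc_example_entry *new)
{

	if (new->xid != old->xid || new->vers != old->vers ||
	    new->proc != old->proc || new->reqlen != old->reqlen ||
	    new->cksum != old->cksum)
		return (0);
	if (new->vers == 4 &&
	    (new->sockref == old->sockref || new->cachetime < old->cachetime))
		return (0);
	return (1);
}
#endif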
#ifndef APPLEKEXT
#include <fs/nfs/common/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100
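
/*
 * Illustrative note (compiled out): nfsrc_hash() folds the top byte of
 * the xid into the low-order bits before taking the modulus, so the
 * bucket choice does not depend only on the low bits of the xid.  A
 * standalone equivalent of the macro above, with a hypothetical name:
 */
#if 0
#include <stdint.h>

static unsigned int
drc_example_hash(uint32_t xid, unsigned int hashsize)
{

	/* Same arithmetic as the nfsrc_hash() macro. */
	return ((xid + (xid >> 24)) % hashsize);
}
#endif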

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				nfsstatsv1.srvcache_tcppeak)
				nfsstatsv1.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in-progress cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != NULL)
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wake up anyone waiting for the entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when the nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that have expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}
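
/*
 * Illustrative sketch (compiled out): how the histogram built above is
 * turned into an early-expiry cutoff when the cache is near the high
 * water mark.  Walk the buckets until at least "force" entries are
 * covered, convert that bucket index back into seconds, and expire
 * everything whose timestamp falls below "now" plus that bound.  The
 * helper name is hypothetical; the arithmetic mirrors nfsrc_trimcache().
 */
#if 0
static time_t
drc_example_cutoff(const int histo[HISTSIZE], int force, int tto, time_t now)
{
	int i, k = 0;

	for (i = 0; i < HISTSIZE - 2; i++) {
		k += histo[i];
		if (k > force)
			break;
	}
	k = tto * (i + 1) / HISTSIZE;	/* upper bound of bucket i, in seconds */
	if (k < 1)
		k = 1;
	return (now + k);	/* entries with rc_timestamp < cutoff are freed */
}
#endif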

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}
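
/*
 * Illustrative sketch (compiled out): a flat-buffer analogue of
 * nfsrc_getlenandcksum() above, returning the total length and computing
 * an Internet checksum over at most NFSRVCACHE_CHECKLEN leading bytes.
 * The helper name is hypothetical, and the byte-at-a-time summing is a
 * simplification of what in_cksum() does over an mbuf chain.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static int
drc_example_lenandcksum(const uint8_t *buf, size_t len, uint16_t *cksum)
{
	size_t i, cklen;
	uint32_t sum = 0;

	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	for (i = 0; i + 1 < cklen; i += 2)	/* sum 16-bit big-endian words */
		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
	if (cklen & 1)				/* pad a trailing odd byte */
		sum += (uint32_t)buf[cklen - 1] << 8;
	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	*cksum = (uint16_t)~sum;
	return ((int)len);
}
#endif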

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}