xref: /netbsd-src/sys/nfs/nfs_syscalls.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: nfs_syscalls.c,v 1.153 2009/12/31 20:01:33 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)nfs_syscalls.c	8.5 (Berkeley) 3/30/95
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: nfs_syscalls.c,v 1.153 2009/12/31 20:01:33 christos Exp $");
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/file.h>
44 #include <sys/stat.h>
45 #include <sys/vnode.h>
46 #include <sys/mount.h>
47 #include <sys/proc.h>
48 #include <sys/uio.h>
49 #include <sys/malloc.h>
50 #include <sys/kmem.h>
51 #include <sys/buf.h>
52 #include <sys/mbuf.h>
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <sys/signalvar.h>
56 #include <sys/domain.h>
57 #include <sys/protosw.h>
58 #include <sys/namei.h>
59 #include <sys/syslog.h>
60 #include <sys/filedesc.h>
61 #include <sys/kthread.h>
62 #include <sys/kauth.h>
63 #include <sys/syscallargs.h>
64 
65 #include <netinet/in.h>
66 #include <netinet/tcp.h>
67 #include <nfs/xdr_subs.h>
68 #include <nfs/rpcv2.h>
69 #include <nfs/nfsproto.h>
70 #include <nfs/nfs.h>
71 #include <nfs/nfsm_subs.h>
72 #include <nfs/nfsrvcache.h>
73 #include <nfs/nfsmount.h>
74 #include <nfs/nfsnode.h>
75 #include <nfs/nfsrtt.h>
76 #include <nfs/nfs_var.h>
77 
/*
 * Table of NFS procedure handlers defined elsewhere; nfssvc_nfsd() indexes
 * it with the request's nd_procnum.
 */
extern int32_t (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *,
						struct nfssvc_sock *,
						struct lwp *, struct mbuf **);
/* Consulted by nfssvc_nfsd() when deciding whether to gather V2 writes. */
extern int nfsrvw_procrastinate;
/* Upper bound compared against a socket's ns_numuids in sys_nfssvc(). */
extern int nuidhash_max;

/* Number of nfsds registered on nfsd_head; modified under nfsd_lock. */
static int nfs_numnfsd = 0;
/* Response-time log filled in, one slot at a time, by nfsd_rt(). */
static struct nfsdrt nfsdrt;
/* Taken around updates of the socket and nfsd lists below. */
kmutex_t nfsd_lock;
/* Every nfssvc_sock created by nfsrv_sockalloc(). */
struct nfssvc_sockhead nfssvc_sockhead;
/* Broadcast by nfsrv_init() once SLP_INIT has been cleared. */
kcondvar_t nfsd_initcv;
/* Sockets flagged SLP_G_DOREC that are waiting for an nfsd. */
struct nfssvc_sockhead nfssvc_sockpending;
/* All nfsds currently inside nfssvc_nfsd(). */
struct nfsdhead nfsd_head;
/* nfsds sleeping in nfssvc_nfsd() while waiting for work. */
struct nfsdidlehead nfsd_idle_head;

/* Carries SLP_INIT while nfsrv_init() is (re)building server state. */
int nfssvc_sockhead_flag;
/* Carries NFSD_CHECKSLP, tested by nfsds looking for pending sockets. */
int nfsd_head_flag;

/* The UDP sockets (IPv4 and IPv6) allocated by nfsrv_init(). */
struct nfssvc_sock *nfs_udpsock;
struct nfssvc_sock *nfs_udp6sock;

static struct nfssvc_sock *nfsrv_sockalloc(void);
static void nfsrv_sockfree(struct nfssvc_sock *);
static void nfsd_rt(int, struct nfsrv_descript *, int);
102 
103 /*
104  * NFS server system calls
105  */
106 
107 
108 /*
109  * Nfs server pseudo system call for the nfsd's
110  * Based on the flag value it either:
111  * - adds a socket to the selection list
112  * - remains in the kernel as an nfsd
113  * - remains in the kernel as an nfsiod
114  */
115 int
116 sys_nfssvc(struct lwp *l, const struct sys_nfssvc_args *uap, register_t *retval)
117 {
118 	/* {
119 		syscallarg(int) flag;
120 		syscallarg(void *) argp;
121 	} */
122 	int error;
123 	file_t *fp;
124 	struct mbuf *nam;
125 	struct nfsd_args nfsdarg;
126 	struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs;
127 	struct nfsd *nfsd;
128 	struct nfssvc_sock *slp;
129 	struct nfsuid *nuidp;
130 
131 	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_NFS,
132 	    KAUTH_REQ_NETWORK_NFS_SVC, NULL, NULL, NULL);
133 	if (error)
134 		return (error);
135 
136 	mutex_enter(&nfsd_lock);
137 	while (nfssvc_sockhead_flag & SLP_INIT) {
138 		cv_wait(&nfsd_initcv, &nfsd_lock);
139 	}
140 	mutex_exit(&nfsd_lock);
141 
142 	if (SCARG(uap, flag) & NFSSVC_BIOD) {
143 		/* Dummy implementation of nfsios for 1.4 and earlier. */
144 		error = kpause("nfsbiod", true, 0, NULL);
145 	} else if (SCARG(uap, flag) & NFSSVC_MNTD) {
146 		error = ENOSYS;
147 	} else if (SCARG(uap, flag) & NFSSVC_ADDSOCK) {
148 		error = copyin(SCARG(uap, argp), (void *)&nfsdarg,
149 		    sizeof(nfsdarg));
150 		if (error)
151 			return (error);
152 		/* getsock() will use the descriptor for us */
153 		if ((fp = fd_getfile(nfsdarg.sock)) == NULL)
154 			return (EBADF);
155 		if (fp->f_type != DTYPE_SOCKET) {
156 			fd_putfile(nfsdarg.sock);
157 			return (ENOTSOCK);
158 		}
159 		/*
160 		 * Get the client address for connected sockets.
161 		 */
162 		if (nfsdarg.name == NULL || nfsdarg.namelen == 0)
163 			nam = (struct mbuf *)0;
164 		else {
165 			error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen,
166 				MT_SONAME);
167 			if (error) {
168 				fd_putfile(nfsdarg.sock);
169 				return (error);
170 			}
171 		}
172 		error = nfssvc_addsock(fp, nam);
173 		fd_putfile(nfsdarg.sock);
174 	} else if (SCARG(uap, flag) & NFSSVC_SETEXPORTSLIST) {
175 		struct export_args *args;
176 		struct mountd_exports_list mel;
177 
178 		error = copyin(SCARG(uap, argp), &mel, sizeof(mel));
179 		if (error != 0)
180 			return error;
181 
182 		args = (struct export_args *)malloc(mel.mel_nexports *
183 		    sizeof(struct export_args), M_TEMP, M_WAITOK);
184 		error = copyin(mel.mel_exports, args, mel.mel_nexports *
185 		    sizeof(struct export_args));
186 		if (error != 0) {
187 			free(args, M_TEMP);
188 			return error;
189 		}
190 		mel.mel_exports = args;
191 
192 		error = mountd_set_exports_list(&mel, l, NULL);
193 
194 		free(args, M_TEMP);
195 	} else {
196 		error = copyin(SCARG(uap, argp), (void *)nsd, sizeof (*nsd));
197 		if (error)
198 			return (error);
199 		if ((SCARG(uap, flag) & NFSSVC_AUTHIN) &&
200 		    ((nfsd = nsd->nsd_nfsd)) != NULL &&
201 		    (nfsd->nfsd_slp->ns_flags & SLP_VALID)) {
202 			slp = nfsd->nfsd_slp;
203 
204 			/*
205 			 * First check to see if another nfsd has already
206 			 * added this credential.
207 			 */
208 			LIST_FOREACH(nuidp, NUIDHASH(slp, nsd->nsd_cr.cr_uid),
209 			    nu_hash) {
210 				if (kauth_cred_geteuid(nuidp->nu_cr) ==
211 				    nsd->nsd_cr.cr_uid &&
212 				    (!nfsd->nfsd_nd->nd_nam2 ||
213 				     netaddr_match(NU_NETFAM(nuidp),
214 				     &nuidp->nu_haddr, nfsd->nfsd_nd->nd_nam2)))
215 					break;
216 			}
217 			if (nuidp) {
218 			    kauth_cred_hold(nuidp->nu_cr);
219 			    nfsd->nfsd_nd->nd_cr = nuidp->nu_cr;
220 			    nfsd->nfsd_nd->nd_flag |= ND_KERBFULL;
221 			} else {
222 			    /*
223 			     * Nope, so we will.
224 			     */
225 			    if (slp->ns_numuids < nuidhash_max) {
226 				slp->ns_numuids++;
227 				nuidp = kmem_alloc(sizeof(*nuidp), KM_SLEEP);
228 			    } else
229 				nuidp = (struct nfsuid *)0;
230 			    if ((slp->ns_flags & SLP_VALID) == 0) {
231 				if (nuidp)
232 				    kmem_free(nuidp, sizeof(*nuidp));
233 			    } else {
234 				if (nuidp == (struct nfsuid *)0) {
235 				    nuidp = TAILQ_FIRST(&slp->ns_uidlruhead);
236 				    LIST_REMOVE(nuidp, nu_hash);
237 				    TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp,
238 					nu_lru);
239 				    if (nuidp->nu_flag & NU_NAM)
240 					m_freem(nuidp->nu_nam);
241 			        }
242 				nuidp->nu_flag = 0;
243 				kauth_uucred_to_cred(nuidp->nu_cr,
244 				    &nsd->nsd_cr);
245 				nuidp->nu_timestamp = nsd->nsd_timestamp;
246 				nuidp->nu_expire = time_second + nsd->nsd_ttl;
247 				/*
248 				 * and save the session key in nu_key.
249 				 */
250 				memcpy(nuidp->nu_key, nsd->nsd_key,
251 				    sizeof(nsd->nsd_key));
252 				if (nfsd->nfsd_nd->nd_nam2) {
253 				    struct sockaddr_in *saddr;
254 
255 				    saddr = mtod(nfsd->nfsd_nd->nd_nam2,
256 					 struct sockaddr_in *);
257 				    switch (saddr->sin_family) {
258 				    case AF_INET:
259 					nuidp->nu_flag |= NU_INETADDR;
260 					nuidp->nu_inetaddr =
261 					     saddr->sin_addr.s_addr;
262 					break;
263 				    case AF_INET6:
264 					nuidp->nu_flag |= NU_NAM;
265 					nuidp->nu_nam = m_copym(
266 					    nfsd->nfsd_nd->nd_nam2, 0,
267 					     M_COPYALL, M_WAIT);
268 					break;
269 				    default:
270 					return EAFNOSUPPORT;
271 				    };
272 				}
273 				TAILQ_INSERT_TAIL(&slp->ns_uidlruhead, nuidp,
274 					nu_lru);
275 				LIST_INSERT_HEAD(NUIDHASH(slp, nsd->nsd_uid),
276 					nuidp, nu_hash);
277 				kauth_cred_hold(nuidp->nu_cr);
278 				nfsd->nfsd_nd->nd_cr = nuidp->nu_cr;
279 				nfsd->nfsd_nd->nd_flag |= ND_KERBFULL;
280 			    }
281 			}
282 		}
283 		if ((SCARG(uap, flag) & NFSSVC_AUTHINFAIL) &&
284 		    (nfsd = nsd->nsd_nfsd))
285 			nfsd->nfsd_flag |= NFSD_AUTHFAIL;
286 		error = nfssvc_nfsd(nsd, SCARG(uap, argp), l);
287 	}
288 	if (error == EINTR || error == ERESTART)
289 		error = 0;
290 	return (error);
291 }
292 
293 static struct nfssvc_sock *
294 nfsrv_sockalloc(void)
295 {
296 	struct nfssvc_sock *slp;
297 
298 	slp = kmem_alloc(sizeof(*slp), KM_SLEEP);
299 	memset(slp, 0, sizeof (struct nfssvc_sock));
300 	mutex_init(&slp->ns_lock, MUTEX_DRIVER, IPL_SOFTNET);
301 	mutex_init(&slp->ns_alock, MUTEX_DRIVER, IPL_SOFTNET);
302 	cv_init(&slp->ns_cv, "nfsdsock");
303 	TAILQ_INIT(&slp->ns_uidlruhead);
304 	LIST_INIT(&slp->ns_tq);
305 	SIMPLEQ_INIT(&slp->ns_sendq);
306 	mutex_enter(&nfsd_lock);
307 	TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain);
308 	mutex_exit(&nfsd_lock);
309 
310 	return slp;
311 }
312 
313 static void
314 nfsrv_sockfree(struct nfssvc_sock *slp)
315 {
316 
317 	KASSERT(slp->ns_so == NULL);
318 	KASSERT(slp->ns_fp == NULL);
319 	KASSERT((slp->ns_flags & SLP_VALID) == 0);
320 	mutex_destroy(&slp->ns_lock);
321 	mutex_destroy(&slp->ns_alock);
322 	cv_destroy(&slp->ns_cv);
323 	kmem_free(slp, sizeof(*slp));
324 }
325 
326 /*
327  * Adds a socket to the list for servicing by nfsds.
328  */
329 int
330 nfssvc_addsock(file_t *fp, struct mbuf *mynam)
331 {
332 	int siz;
333 	struct nfssvc_sock *slp;
334 	struct socket *so;
335 	struct nfssvc_sock *tslp;
336 	int error;
337 	int val;
338 
339 	so = (struct socket *)fp->f_data;
340 	tslp = (struct nfssvc_sock *)0;
341 	/*
342 	 * Add it to the list, as required.
343 	 */
344 	if (so->so_proto->pr_protocol == IPPROTO_UDP) {
345 		if (so->so_proto->pr_domain->dom_family == AF_INET6)
346 			tslp = nfs_udp6sock;
347 		else {
348 			tslp = nfs_udpsock;
349 			if (tslp->ns_flags & SLP_VALID) {
350 				m_freem(mynam);
351 				return (EPERM);
352 			}
353 		}
354 	}
355 	if (so->so_type == SOCK_STREAM)
356 		siz = NFS_MAXPACKET + sizeof (u_long);
357 	else
358 		siz = NFS_MAXPACKET;
359 	solock(so);
360 	error = soreserve(so, siz, siz);
361 	sounlock(so);
362 	if (error) {
363 		m_freem(mynam);
364 		return (error);
365 	}
366 
367 	/*
368 	 * Set protocol specific options { for now TCP only } and
369 	 * reserve some space. For datagram sockets, this can get called
370 	 * repeatedly for the same socket, but that isn't harmful.
371 	 */
372 	if (so->so_type == SOCK_STREAM) {
373 		val = 1;
374 		so_setsockopt(NULL, so, SOL_SOCKET, SO_KEEPALIVE, &val,
375 		    sizeof(val));
376 	}
377 	if ((so->so_proto->pr_domain->dom_family == AF_INET ||
378 	    so->so_proto->pr_domain->dom_family == AF_INET6) &&
379 	    so->so_proto->pr_protocol == IPPROTO_TCP) {
380 		val = 1;
381 		so_setsockopt(NULL, so, IPPROTO_TCP, TCP_NODELAY, &val,
382 		    sizeof(val));
383 	}
384 	solock(so);
385 	so->so_rcv.sb_flags &= ~SB_NOINTR;
386 	so->so_rcv.sb_timeo = 0;
387 	so->so_snd.sb_flags &= ~SB_NOINTR;
388 	so->so_snd.sb_timeo = 0;
389 	sounlock(so);
390 	if (tslp) {
391 		slp = tslp;
392 	} else {
393 		slp = nfsrv_sockalloc();
394 	}
395 	slp->ns_so = so;
396 	slp->ns_nam = mynam;
397 	mutex_enter(&fp->f_lock);
398 	fp->f_count++;
399 	mutex_exit(&fp->f_lock);
400 	slp->ns_fp = fp;
401 	slp->ns_flags = SLP_VALID;
402 	slp->ns_aflags = SLP_A_NEEDQ;
403 	slp->ns_gflags = 0;
404 	slp->ns_sflags = 0;
405 	solock(so);
406 	so->so_upcallarg = (void *)slp;
407 	so->so_upcall = nfsrv_soupcall;
408 	so->so_rcv.sb_flags |= SB_UPCALL;
409 	sounlock(so);
410 	nfsrv_wakenfsd(slp);
411 	return (0);
412 }
413 
414 /*
415  * Called by nfssvc() for nfsds. Just loops around servicing rpc requests
416  * until it is killed by a signal.
417  */
418 int
419 nfssvc_nfsd(struct nfsd_srvargs *nsd, void *argp, struct lwp *l)
420 {
421 	struct timeval tv;
422 	struct mbuf *m;
423 	struct nfssvc_sock *slp;
424 	struct nfsd *nfsd = nsd->nsd_nfsd;
425 	struct nfsrv_descript *nd = NULL;
426 	struct mbuf *mreq;
427 	u_quad_t cur_usec;
428 	int error = 0, cacherep, siz, sotype, writes_todo;
429 	struct proc *p = l->l_proc;
430 	bool doreinit;
431 
432 #ifndef nolint
433 	cacherep = RC_DOIT;
434 	writes_todo = 0;
435 #endif
436 	if (nfsd == NULL) {
437 		nsd->nsd_nfsd = nfsd = kmem_alloc(sizeof(*nfsd), KM_SLEEP);
438 		memset(nfsd, 0, sizeof (struct nfsd));
439 		cv_init(&nfsd->nfsd_cv, "nfsd");
440 		nfsd->nfsd_procp = p;
441 		mutex_enter(&nfsd_lock);
442 		while ((nfssvc_sockhead_flag & SLP_INIT) != 0) {
443 			KASSERT(nfs_numnfsd == 0);
444 			cv_wait(&nfsd_initcv, &nfsd_lock);
445 		}
446 		TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain);
447 		nfs_numnfsd++;
448 		mutex_exit(&nfsd_lock);
449 	}
450 	/*
451 	 * Loop getting rpc requests until SIGKILL.
452 	 */
453 	for (;;) {
454 		bool dummy;
455 
456 		if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
457 		    != 0) {
458 			preempt();
459 		}
460 		if (nfsd->nfsd_slp == NULL) {
461 			mutex_enter(&nfsd_lock);
462 			while (nfsd->nfsd_slp == NULL &&
463 			    (nfsd_head_flag & NFSD_CHECKSLP) == 0) {
464 				SLIST_INSERT_HEAD(&nfsd_idle_head, nfsd,
465 				    nfsd_idle);
466 				error = cv_wait_sig(&nfsd->nfsd_cv, &nfsd_lock);
467 				if (error) {
468 					slp = nfsd->nfsd_slp;
469 					nfsd->nfsd_slp = NULL;
470 					if (!slp)
471 						SLIST_REMOVE(&nfsd_idle_head,
472 						    nfsd, nfsd, nfsd_idle);
473 					mutex_exit(&nfsd_lock);
474 					if (slp) {
475 						nfsrv_wakenfsd(slp);
476 						nfsrv_slpderef(slp);
477 					}
478 					goto done;
479 				}
480 			}
481 			if (nfsd->nfsd_slp == NULL &&
482 			    (nfsd_head_flag & NFSD_CHECKSLP) != 0) {
483 				slp = TAILQ_FIRST(&nfssvc_sockpending);
484 				if (slp) {
485 					KASSERT((slp->ns_gflags & SLP_G_DOREC)
486 					    != 0);
487 					TAILQ_REMOVE(&nfssvc_sockpending, slp,
488 					    ns_pending);
489 					slp->ns_gflags &= ~SLP_G_DOREC;
490 					slp->ns_sref++;
491 					nfsd->nfsd_slp = slp;
492 				} else
493 					nfsd_head_flag &= ~NFSD_CHECKSLP;
494 			}
495 			KASSERT(nfsd->nfsd_slp == NULL ||
496 			    nfsd->nfsd_slp->ns_sref > 0);
497 			mutex_exit(&nfsd_lock);
498 			if ((slp = nfsd->nfsd_slp) == NULL)
499 				continue;
500 			if (slp->ns_flags & SLP_VALID) {
501 				bool more;
502 
503 				if (nfsdsock_testbits(slp, SLP_A_NEEDQ)) {
504 					nfsrv_rcv(slp);
505 				}
506 				if (nfsdsock_testbits(slp, SLP_A_DISCONN)) {
507 					nfsrv_zapsock(slp);
508 				}
509 				error = nfsrv_dorec(slp, nfsd, &nd, &more);
510 				getmicrotime(&tv);
511 				cur_usec = (u_quad_t)tv.tv_sec * 1000000 +
512 					(u_quad_t)tv.tv_usec;
513 				writes_todo = 0;
514 				if (error) {
515 					struct nfsrv_descript *nd2;
516 
517 					mutex_enter(&nfsd_lock);
518 					nd2 = LIST_FIRST(&slp->ns_tq);
519 					if (nd2 != NULL &&
520 					    nd2->nd_time <= cur_usec) {
521 						error = 0;
522 						cacherep = RC_DOIT;
523 						writes_todo = 1;
524 					}
525 					mutex_exit(&nfsd_lock);
526 				}
527 				if (error == 0 && more) {
528 					nfsrv_wakenfsd(slp);
529 				}
530 			}
531 		} else {
532 			error = 0;
533 			slp = nfsd->nfsd_slp;
534 		}
535 		KASSERT(slp != NULL);
536 		KASSERT(nfsd->nfsd_slp == slp);
537 		if (error || (slp->ns_flags & SLP_VALID) == 0) {
538 			if (nd) {
539 				nfsdreq_free(nd);
540 				nd = NULL;
541 			}
542 			nfsd->nfsd_slp = NULL;
543 			nfsrv_slpderef(slp);
544 			continue;
545 		}
546 		sotype = slp->ns_so->so_type;
547 		if (nd) {
548 			getmicrotime(&nd->nd_starttime);
549 			if (nd->nd_nam2)
550 				nd->nd_nam = nd->nd_nam2;
551 			else
552 				nd->nd_nam = slp->ns_nam;
553 
554 			/*
555 			 * Check to see if authorization is needed.
556 			 */
557 			if (nfsd->nfsd_flag & NFSD_NEEDAUTH) {
558 				nfsd->nfsd_flag &= ~NFSD_NEEDAUTH;
559 				nsd->nsd_haddr = mtod(nd->nd_nam,
560 				    struct sockaddr_in *)->sin_addr.s_addr;
561 				nsd->nsd_authlen = nfsd->nfsd_authlen;
562 				nsd->nsd_verflen = nfsd->nfsd_verflen;
563 				if (!copyout(nfsd->nfsd_authstr,
564 				    nsd->nsd_authstr, nfsd->nfsd_authlen) &&
565 				    !copyout(nfsd->nfsd_verfstr,
566 				    nsd->nsd_verfstr, nfsd->nfsd_verflen) &&
567 				    !copyout(nsd, argp, sizeof (*nsd))) {
568 					return (ENEEDAUTH);
569 				}
570 				cacherep = RC_DROPIT;
571 			} else
572 				cacherep = nfsrv_getcache(nd, slp, &mreq);
573 
574 			if (nfsd->nfsd_flag & NFSD_AUTHFAIL) {
575 				nfsd->nfsd_flag &= ~NFSD_AUTHFAIL;
576 				nd->nd_procnum = NFSPROC_NOOP;
577 				nd->nd_repstat =
578 				    (NFSERR_AUTHERR | AUTH_TOOWEAK);
579 				cacherep = RC_DOIT;
580 			}
581 		}
582 
583 		/*
584 		 * Loop to get all the write rpc relies that have been
585 		 * gathered together.
586 		 */
587 		do {
588 			switch (cacherep) {
589 			case RC_DOIT:
590 				mreq = NULL;
591 				netexport_rdlock();
592 				if (writes_todo || nd == NULL ||
593 				     (!(nd->nd_flag & ND_NFSV3) &&
594 				     nd->nd_procnum == NFSPROC_WRITE &&
595 				     nfsrvw_procrastinate > 0))
596 					error = nfsrv_writegather(&nd, slp,
597 					    l, &mreq);
598 				else
599 					error =
600 					    (*(nfsrv3_procs[nd->nd_procnum]))
601 					    (nd, slp, l, &mreq);
602 				netexport_rdunlock();
603 				if (mreq == NULL) {
604 					if (nd != NULL) {
605 						if (nd->nd_nam2)
606 							m_free(nd->nd_nam2);
607 					}
608 					break;
609 				}
610 				if (error) {
611 					nfsstats.srv_errs++;
612 					nfsrv_updatecache(nd, false, mreq);
613 					if (nd->nd_nam2)
614 						m_freem(nd->nd_nam2);
615 					break;
616 				}
617 				nfsstats.srvrpccnt[nd->nd_procnum]++;
618 				nfsrv_updatecache(nd, true, mreq);
619 				nd->nd_mrep = (struct mbuf *)0;
620 			case RC_REPLY:
621 				m = mreq;
622 				siz = 0;
623 				while (m) {
624 					siz += m->m_len;
625 					m = m->m_next;
626 				}
627 				if (siz <= 0 || siz > NFS_MAXPACKET) {
628 					printf("mbuf siz=%d\n",siz);
629 					panic("Bad nfs svc reply");
630 				}
631 				m = mreq;
632 				m->m_pkthdr.len = siz;
633 				m->m_pkthdr.rcvif = (struct ifnet *)0;
634 				/*
635 				 * For stream protocols, prepend a Sun RPC
636 				 * Record Mark.
637 				 */
638 				if (sotype == SOCK_STREAM) {
639 					M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
640 					*mtod(m, u_int32_t *) =
641 					    htonl(0x80000000 | siz);
642 				}
643 				nd->nd_mreq = m;
644 				if (nfsrtton) {
645 					nfsd_rt(slp->ns_so->so_type, nd,
646 					    cacherep);
647 				}
648 				error = nfsdsock_sendreply(slp, nd);
649 				nd = NULL;
650 				if (error == EPIPE)
651 					nfsrv_zapsock(slp);
652 				if (error == EINTR || error == ERESTART) {
653 					nfsd->nfsd_slp = NULL;
654 					nfsrv_slpderef(slp);
655 					goto done;
656 				}
657 				break;
658 			case RC_DROPIT:
659 				if (nfsrtton)
660 					nfsd_rt(sotype, nd, cacherep);
661 				m_freem(nd->nd_mrep);
662 				m_freem(nd->nd_nam2);
663 				break;
664 			}
665 			if (nd) {
666 				nfsdreq_free(nd);
667 				nd = NULL;
668 			}
669 
670 			/*
671 			 * Check to see if there are outstanding writes that
672 			 * need to be serviced.
673 			 */
674 			getmicrotime(&tv);
675 			cur_usec = (u_quad_t)tv.tv_sec * 1000000 +
676 			    (u_quad_t)tv.tv_usec;
677 			mutex_enter(&nfsd_lock);
678 			if (LIST_FIRST(&slp->ns_tq) &&
679 			    LIST_FIRST(&slp->ns_tq)->nd_time <= cur_usec) {
680 				cacherep = RC_DOIT;
681 				writes_todo = 1;
682 			} else
683 				writes_todo = 0;
684 			mutex_exit(&nfsd_lock);
685 		} while (writes_todo);
686 		if (nfsrv_dorec(slp, nfsd, &nd, &dummy)) {
687 			nfsd->nfsd_slp = NULL;
688 			nfsrv_slpderef(slp);
689 		}
690 	}
691 done:
692 	mutex_enter(&nfsd_lock);
693 	TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain);
694 	doreinit = --nfs_numnfsd == 0;
695 	if (doreinit)
696 		nfssvc_sockhead_flag |= SLP_INIT;
697 	mutex_exit(&nfsd_lock);
698 	cv_destroy(&nfsd->nfsd_cv);
699 	kmem_free(nfsd, sizeof(*nfsd));
700 	nsd->nsd_nfsd = NULL;
701 	if (doreinit)
702 		nfsrv_init(true);	/* Reinitialize everything */
703 	return (error);
704 }
705 
706 /*
707  * Shut down a socket associated with an nfssvc_sock structure.
708  * Should be called with the send lock set, if required.
709  * The trick here is to increment the sref at the start, so that the nfsds
710  * will stop using it and clear ns_flag at the end so that it will not be
711  * reassigned during cleanup.
712  *
713  * called at splsoftnet.
714  */
715 void
716 nfsrv_zapsock(struct nfssvc_sock *slp)
717 {
718 	struct nfsuid *nuidp, *nnuidp;
719 	struct nfsrv_descript *nwp;
720 	struct socket *so;
721 	struct mbuf *m;
722 
723 	if (nfsdsock_drain(slp)) {
724 		return;
725 	}
726 	mutex_enter(&nfsd_lock);
727 	if (slp->ns_gflags & SLP_G_DOREC) {
728 		TAILQ_REMOVE(&nfssvc_sockpending, slp, ns_pending);
729 		slp->ns_gflags &= ~SLP_G_DOREC;
730 	}
731 	mutex_exit(&nfsd_lock);
732 
733 	so = slp->ns_so;
734 	KASSERT(so != NULL);
735 	solock(so);
736 	so->so_upcall = NULL;
737 	so->so_upcallarg = NULL;
738 	so->so_rcv.sb_flags &= ~SB_UPCALL;
739 	soshutdown(so, SHUT_RDWR);
740 	sounlock(so);
741 
742 	m_freem(slp->ns_raw);
743 	m = slp->ns_rec;
744 	while (m != NULL) {
745 		struct mbuf *n;
746 
747 		n = m->m_nextpkt;
748 		m_freem(m);
749 		m = n;
750 	}
751 	/* XXX what about freeing ns_frag ? */
752 	for (nuidp = TAILQ_FIRST(&slp->ns_uidlruhead); nuidp != 0;
753 	    nuidp = nnuidp) {
754 		nnuidp = TAILQ_NEXT(nuidp, nu_lru);
755 		LIST_REMOVE(nuidp, nu_hash);
756 		TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, nu_lru);
757 		if (nuidp->nu_flag & NU_NAM)
758 			m_freem(nuidp->nu_nam);
759 		kmem_free(nuidp, sizeof(*nuidp));
760 	}
761 	mutex_enter(&nfsd_lock);
762 	while ((nwp = LIST_FIRST(&slp->ns_tq)) != NULL) {
763 		LIST_REMOVE(nwp, nd_tq);
764 		mutex_exit(&nfsd_lock);
765 		nfsdreq_free(nwp);
766 		mutex_enter(&nfsd_lock);
767 	}
768 	mutex_exit(&nfsd_lock);
769 }
770 
771 /*
772  * Derefence a server socket structure. If it has no more references and
773  * is no longer valid, you can throw it away.
774  */
775 void
776 nfsrv_slpderef(struct nfssvc_sock *slp)
777 {
778 	uint32_t ref;
779 
780 	mutex_enter(&nfsd_lock);
781 	KASSERT(slp->ns_sref > 0);
782 	ref = --slp->ns_sref;
783 	if (ref == 0 && (slp->ns_flags & SLP_VALID) == 0) {
784 		file_t *fp;
785 
786 		KASSERT((slp->ns_gflags & SLP_G_DOREC) == 0);
787 		TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain);
788 		mutex_exit(&nfsd_lock);
789 
790 		fp = slp->ns_fp;
791 		if (fp != NULL) {
792 			slp->ns_fp = NULL;
793 			KASSERT(fp != NULL);
794 			KASSERT(fp->f_data == slp->ns_so);
795 			KASSERT(fp->f_count > 0);
796 			closef(fp);
797 			slp->ns_so = NULL;
798 		}
799 
800 		if (slp->ns_nam)
801 			m_free(slp->ns_nam);
802 		nfsrv_sockfree(slp);
803 	} else
804 		mutex_exit(&nfsd_lock);
805 }
806 
807 /*
808  * Initialize the data structures for the server.
809  * Handshake with any new nfsds starting up to avoid any chance of
810  * corruption.
811  */
812 void
813 nfsrv_init(int terminating)
814 {
815 	struct nfssvc_sock *slp;
816 
817 	if (!terminating) {
818 		mutex_init(&nfsd_lock, MUTEX_DRIVER, IPL_SOFTNET);
819 		cv_init(&nfsd_initcv, "nfsdinit");
820 	}
821 
822 	mutex_enter(&nfsd_lock);
823 	if (!terminating && (nfssvc_sockhead_flag & SLP_INIT) != 0)
824 		panic("nfsd init");
825 	nfssvc_sockhead_flag |= SLP_INIT;
826 
827 	if (terminating) {
828 		KASSERT(SLIST_EMPTY(&nfsd_idle_head));
829 		KASSERT(TAILQ_EMPTY(&nfsd_head));
830 		while ((slp = TAILQ_FIRST(&nfssvc_sockhead)) != NULL) {
831 			mutex_exit(&nfsd_lock);
832 			KASSERT(slp->ns_sref == 0);
833 			slp->ns_sref++;
834 			nfsrv_zapsock(slp);
835 			nfsrv_slpderef(slp);
836 			mutex_enter(&nfsd_lock);
837 		}
838 		KASSERT(TAILQ_EMPTY(&nfssvc_sockpending));
839 		mutex_exit(&nfsd_lock);
840 		nfsrv_cleancache();	/* And clear out server cache */
841 	} else {
842 		mutex_exit(&nfsd_lock);
843 		nfs_pub.np_valid = 0;
844 	}
845 
846 	TAILQ_INIT(&nfssvc_sockhead);
847 	TAILQ_INIT(&nfssvc_sockpending);
848 
849 	TAILQ_INIT(&nfsd_head);
850 	SLIST_INIT(&nfsd_idle_head);
851 	nfsd_head_flag &= ~NFSD_CHECKSLP;
852 
853 	nfs_udpsock = nfsrv_sockalloc();
854 	nfs_udp6sock = nfsrv_sockalloc();
855 
856 	mutex_enter(&nfsd_lock);
857 	nfssvc_sockhead_flag &= ~SLP_INIT;
858 	cv_broadcast(&nfsd_initcv);
859 	mutex_exit(&nfsd_lock);
860 }
861 
862 void
863 nfsrv_fini(void)
864 {
865 
866 	nfsrv_init(true);
867 	cv_destroy(&nfsd_initcv);
868 	mutex_destroy(&nfsd_lock);
869 }
870 
871 /*
872  * Add entries to the server monitor log.
873  */
874 static void
875 nfsd_rt(int sotype, struct nfsrv_descript *nd, int cacherep)
876 {
877 	struct timeval tv;
878 	struct drt *rt;
879 
880 	rt = &nfsdrt.drt[nfsdrt.pos];
881 	if (cacherep == RC_DOIT)
882 		rt->flag = 0;
883 	else if (cacherep == RC_REPLY)
884 		rt->flag = DRT_CACHEREPLY;
885 	else
886 		rt->flag = DRT_CACHEDROP;
887 	if (sotype == SOCK_STREAM)
888 		rt->flag |= DRT_TCP;
889 	if (nd->nd_flag & ND_NFSV3)
890 		rt->flag |= DRT_NFSV3;
891 	rt->proc = nd->nd_procnum;
892 	if (mtod(nd->nd_nam, struct sockaddr *)->sa_family == AF_INET)
893 	    rt->ipadr = mtod(nd->nd_nam, struct sockaddr_in *)->sin_addr.s_addr;
894 	else
895 	    rt->ipadr = INADDR_ANY;
896 	getmicrotime(&tv);
897 	rt->resptime = ((tv.tv_sec - nd->nd_starttime.tv_sec) * 1000000) +
898 		(tv.tv_usec - nd->nd_starttime.tv_usec);
899 	rt->tstamp = tv;
900 	nfsdrt.pos = (nfsdrt.pos + 1) % NFSRTTLOGSIZ;
901 }
902