xref: /netbsd-src/sys/nfs/nfs_syscalls.c (revision 6d322f2f4598f0d8a138f10ea648ec4fabe41f8b)
1 /*	$NetBSD: nfs_syscalls.c,v 1.154 2013/11/27 22:10:47 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)nfs_syscalls.c	8.5 (Berkeley) 3/30/95
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: nfs_syscalls.c,v 1.154 2013/11/27 22:10:47 christos Exp $");
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/kernel.h>
43 #include <sys/file.h>
44 #include <sys/stat.h>
45 #include <sys/vnode.h>
46 #include <sys/mount.h>
47 #include <sys/proc.h>
48 #include <sys/uio.h>
49 #include <sys/malloc.h>
50 #include <sys/kmem.h>
51 #include <sys/buf.h>
52 #include <sys/mbuf.h>
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <sys/signalvar.h>
56 #include <sys/domain.h>
57 #include <sys/protosw.h>
58 #include <sys/namei.h>
59 #include <sys/syslog.h>
60 #include <sys/filedesc.h>
61 #include <sys/kthread.h>
62 #include <sys/kauth.h>
63 #include <sys/syscallargs.h>
64 
65 #include <netinet/in.h>
66 #include <netinet/tcp.h>
67 #include <nfs/xdr_subs.h>
68 #include <nfs/rpcv2.h>
69 #include <nfs/nfsproto.h>
70 #include <nfs/nfs.h>
71 #include <nfs/nfsm_subs.h>
72 #include <nfs/nfsrvcache.h>
73 #include <nfs/nfsmount.h>
74 #include <nfs/nfsnode.h>
75 #include <nfs/nfsrtt.h>
76 #include <nfs/nfs_var.h>
77 
78 extern int32_t (*nfsrv3_procs[NFS_NPROCS])(struct nfsrv_descript *,
79 						struct nfssvc_sock *,
80 						struct lwp *, struct mbuf **);
81 extern int nfsrvw_procrastinate;
82 extern int nuidhash_max;
83 
84 static int nfs_numnfsd = 0;
85 static struct nfsdrt nfsdrt;
86 kmutex_t nfsd_lock;
87 struct nfssvc_sockhead nfssvc_sockhead;
88 kcondvar_t nfsd_initcv;
89 struct nfssvc_sockhead nfssvc_sockpending;
90 struct nfsdhead nfsd_head;
91 struct nfsdidlehead nfsd_idle_head;
92 
93 int nfssvc_sockhead_flag;
94 int nfsd_head_flag;
95 
96 struct nfssvc_sock *nfs_udpsock;
97 struct nfssvc_sock *nfs_udp6sock;
98 
99 static struct nfssvc_sock *nfsrv_sockalloc(void);
100 static void nfsrv_sockfree(struct nfssvc_sock *);
101 static void nfsd_rt(int, struct nfsrv_descript *, int);
102 
103 /*
104  * NFS server system calls
105  */
106 
107 
108 /*
109  * Nfs server pseudo system call for the nfsd's
110  * Based on the flag value it either:
111  * - adds a socket to the selection list
112  * - remains in the kernel as an nfsd
113  * - remains in the kernel as an nfsiod
114  */
115 int
116 sys_nfssvc(struct lwp *l, const struct sys_nfssvc_args *uap, register_t *retval)
117 {
118 	/* {
119 		syscallarg(int) flag;
120 		syscallarg(void *) argp;
121 	} */
122 	int error;
123 	file_t *fp;
124 	struct mbuf *nam;
125 	struct nfsd_args nfsdarg;
126 	struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs;
127 	struct nfsd *nfsd;
128 	struct nfssvc_sock *slp;
129 	struct nfsuid *nuidp;
130 
131 	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_NFS,
132 	    KAUTH_REQ_NETWORK_NFS_SVC, NULL, NULL, NULL);
133 	if (error)
134 		return (error);
135 
136 	mutex_enter(&nfsd_lock);
137 	while (nfssvc_sockhead_flag & SLP_INIT) {
138 		cv_wait(&nfsd_initcv, &nfsd_lock);
139 	}
140 	mutex_exit(&nfsd_lock);
141 
142 	if (SCARG(uap, flag) & NFSSVC_BIOD) {
143 		/* Dummy implementation of nfsios for 1.4 and earlier. */
144 		error = kpause("nfsbiod", true, 0, NULL);
145 	} else if (SCARG(uap, flag) & NFSSVC_MNTD) {
146 		error = ENOSYS;
147 	} else if (SCARG(uap, flag) & NFSSVC_ADDSOCK) {
148 		error = copyin(SCARG(uap, argp), (void *)&nfsdarg,
149 		    sizeof(nfsdarg));
150 		if (error)
151 			return (error);
152 		/* getsock() will use the descriptor for us */
153 		if ((fp = fd_getfile(nfsdarg.sock)) == NULL)
154 			return (EBADF);
155 		if (fp->f_type != DTYPE_SOCKET) {
156 			fd_putfile(nfsdarg.sock);
157 			return (ENOTSOCK);
158 		}
159 		/*
160 		 * Get the client address for connected sockets.
161 		 */
162 		if (nfsdarg.name == NULL || nfsdarg.namelen == 0)
163 			nam = (struct mbuf *)0;
164 		else {
165 			error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen,
166 				MT_SONAME);
167 			if (error) {
168 				fd_putfile(nfsdarg.sock);
169 				return (error);
170 			}
171 		}
172 		error = nfssvc_addsock(fp, nam);
173 		fd_putfile(nfsdarg.sock);
174 	} else if (SCARG(uap, flag) & NFSSVC_SETEXPORTSLIST) {
175 		struct export_args *args;
176 		struct mountd_exports_list mel;
177 
178 		error = copyin(SCARG(uap, argp), &mel, sizeof(mel));
179 		if (error != 0)
180 			return error;
181 
182 		args = (struct export_args *)malloc(mel.mel_nexports *
183 		    sizeof(struct export_args), M_TEMP, M_WAITOK);
184 		error = copyin(mel.mel_exports, args, mel.mel_nexports *
185 		    sizeof(struct export_args));
186 		if (error != 0) {
187 			free(args, M_TEMP);
188 			return error;
189 		}
190 		mel.mel_exports = args;
191 
192 		error = mountd_set_exports_list(&mel, l, NULL);
193 
194 		free(args, M_TEMP);
195 	} else {
196 		error = copyin(SCARG(uap, argp), (void *)nsd, sizeof (*nsd));
197 		if (error)
198 			return (error);
199 		if ((SCARG(uap, flag) & NFSSVC_AUTHIN) &&
200 		    ((nfsd = nsd->nsd_nfsd)) != NULL &&
201 		    (nfsd->nfsd_slp->ns_flags & SLP_VALID)) {
202 			slp = nfsd->nfsd_slp;
203 
204 			/*
205 			 * First check to see if another nfsd has already
206 			 * added this credential.
207 			 */
208 			LIST_FOREACH(nuidp, NUIDHASH(slp, nsd->nsd_cr.cr_uid),
209 			    nu_hash) {
210 				if (kauth_cred_geteuid(nuidp->nu_cr) ==
211 				    nsd->nsd_cr.cr_uid &&
212 				    (!nfsd->nfsd_nd->nd_nam2 ||
213 				     netaddr_match(NU_NETFAM(nuidp),
214 				     &nuidp->nu_haddr, nfsd->nfsd_nd->nd_nam2)))
215 					break;
216 			}
217 			if (nuidp) {
218 			    kauth_cred_hold(nuidp->nu_cr);
219 			    nfsd->nfsd_nd->nd_cr = nuidp->nu_cr;
220 			    nfsd->nfsd_nd->nd_flag |= ND_KERBFULL;
221 			} else {
222 			    /*
223 			     * Nope, so we will.
224 			     */
225 			    if (slp->ns_numuids < nuidhash_max) {
226 				slp->ns_numuids++;
227 				nuidp = kmem_alloc(sizeof(*nuidp), KM_SLEEP);
228 			    } else
229 				nuidp = (struct nfsuid *)0;
230 			    if ((slp->ns_flags & SLP_VALID) == 0) {
231 				if (nuidp)
232 				    kmem_free(nuidp, sizeof(*nuidp));
233 			    } else {
234 				if (nuidp == (struct nfsuid *)0) {
235 				    nuidp = TAILQ_FIRST(&slp->ns_uidlruhead);
236 				    LIST_REMOVE(nuidp, nu_hash);
237 				    TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp,
238 					nu_lru);
239 				    if (nuidp->nu_flag & NU_NAM)
240 					m_freem(nuidp->nu_nam);
241 			        }
242 				nuidp->nu_flag = 0;
243 				kauth_uucred_to_cred(nuidp->nu_cr,
244 				    &nsd->nsd_cr);
245 				nuidp->nu_timestamp = nsd->nsd_timestamp;
246 				nuidp->nu_expire = time_second + nsd->nsd_ttl;
247 				/*
248 				 * and save the session key in nu_key.
249 				 */
250 				memcpy(nuidp->nu_key, nsd->nsd_key,
251 				    sizeof(nsd->nsd_key));
252 				if (nfsd->nfsd_nd->nd_nam2) {
253 				    struct sockaddr_in *saddr;
254 
255 				    saddr = mtod(nfsd->nfsd_nd->nd_nam2,
256 					 struct sockaddr_in *);
257 				    switch (saddr->sin_family) {
258 				    case AF_INET:
259 					nuidp->nu_flag |= NU_INETADDR;
260 					nuidp->nu_inetaddr =
261 					     saddr->sin_addr.s_addr;
262 					break;
263 				    case AF_INET6:
264 					nuidp->nu_flag |= NU_NAM;
265 					nuidp->nu_nam = m_copym(
266 					    nfsd->nfsd_nd->nd_nam2, 0,
267 					     M_COPYALL, M_WAIT);
268 					break;
269 				    default:
270 					return EAFNOSUPPORT;
271 				    };
272 				}
273 				TAILQ_INSERT_TAIL(&slp->ns_uidlruhead, nuidp,
274 					nu_lru);
275 				LIST_INSERT_HEAD(NUIDHASH(slp, nsd->nsd_uid),
276 					nuidp, nu_hash);
277 				kauth_cred_hold(nuidp->nu_cr);
278 				nfsd->nfsd_nd->nd_cr = nuidp->nu_cr;
279 				nfsd->nfsd_nd->nd_flag |= ND_KERBFULL;
280 			    }
281 			}
282 		}
283 		if ((SCARG(uap, flag) & NFSSVC_AUTHINFAIL) &&
284 		    (nfsd = nsd->nsd_nfsd))
285 			nfsd->nfsd_flag |= NFSD_AUTHFAIL;
286 		error = nfssvc_nfsd(nsd, SCARG(uap, argp), l);
287 	}
288 	if (error == EINTR || error == ERESTART)
289 		error = 0;
290 	return (error);
291 }
292 
293 static struct nfssvc_sock *
294 nfsrv_sockalloc(void)
295 {
296 	struct nfssvc_sock *slp;
297 
298 	slp = kmem_alloc(sizeof(*slp), KM_SLEEP);
299 	memset(slp, 0, sizeof (struct nfssvc_sock));
300 	mutex_init(&slp->ns_lock, MUTEX_DRIVER, IPL_SOFTNET);
301 	mutex_init(&slp->ns_alock, MUTEX_DRIVER, IPL_SOFTNET);
302 	cv_init(&slp->ns_cv, "nfsdsock");
303 	TAILQ_INIT(&slp->ns_uidlruhead);
304 	LIST_INIT(&slp->ns_tq);
305 	SIMPLEQ_INIT(&slp->ns_sendq);
306 	mutex_enter(&nfsd_lock);
307 	TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain);
308 	mutex_exit(&nfsd_lock);
309 
310 	return slp;
311 }
312 
313 static void
314 nfsrv_sockfree(struct nfssvc_sock *slp)
315 {
316 
317 	KASSERT(slp->ns_so == NULL);
318 	KASSERT(slp->ns_fp == NULL);
319 	KASSERT((slp->ns_flags & SLP_VALID) == 0);
320 	mutex_destroy(&slp->ns_lock);
321 	mutex_destroy(&slp->ns_alock);
322 	cv_destroy(&slp->ns_cv);
323 	kmem_free(slp, sizeof(*slp));
324 }
325 
326 /*
327  * Adds a socket to the list for servicing by nfsds.
328  */
329 int
330 nfssvc_addsock(file_t *fp, struct mbuf *mynam)
331 {
332 	int siz;
333 	struct nfssvc_sock *slp;
334 	struct socket *so;
335 	struct nfssvc_sock *tslp;
336 	int error;
337 	int val;
338 
339 	so = (struct socket *)fp->f_data;
340 	tslp = (struct nfssvc_sock *)0;
341 	/*
342 	 * Add it to the list, as required.
343 	 */
344 	if (so->so_proto->pr_protocol == IPPROTO_UDP) {
345 		if (so->so_proto->pr_domain->dom_family == AF_INET6)
346 			tslp = nfs_udp6sock;
347 		else {
348 			tslp = nfs_udpsock;
349 			if (tslp->ns_flags & SLP_VALID) {
350 				m_freem(mynam);
351 				return (EPERM);
352 			}
353 		}
354 	}
355 	if (so->so_type == SOCK_STREAM)
356 		siz = NFS_MAXPACKET + sizeof (u_long);
357 	else
358 		siz = NFS_MAXPACKET;
359 	solock(so);
360 	error = soreserve(so, siz, siz);
361 	sounlock(so);
362 	if (error) {
363 		m_freem(mynam);
364 		return (error);
365 	}
366 
367 	/*
368 	 * Set protocol specific options { for now TCP only } and
369 	 * reserve some space. For datagram sockets, this can get called
370 	 * repeatedly for the same socket, but that isn't harmful.
371 	 */
372 	if (so->so_type == SOCK_STREAM) {
373 		val = 1;
374 		so_setsockopt(NULL, so, SOL_SOCKET, SO_KEEPALIVE, &val,
375 		    sizeof(val));
376 	}
377 	if ((so->so_proto->pr_domain->dom_family == AF_INET ||
378 	    so->so_proto->pr_domain->dom_family == AF_INET6) &&
379 	    so->so_proto->pr_protocol == IPPROTO_TCP) {
380 		val = 1;
381 		so_setsockopt(NULL, so, IPPROTO_TCP, TCP_NODELAY, &val,
382 		    sizeof(val));
383 	}
384 	solock(so);
385 	so->so_rcv.sb_flags &= ~SB_NOINTR;
386 	so->so_rcv.sb_timeo = 0;
387 	so->so_snd.sb_flags &= ~SB_NOINTR;
388 	so->so_snd.sb_timeo = 0;
389 	sounlock(so);
390 	if (tslp) {
391 		slp = tslp;
392 	} else {
393 		slp = nfsrv_sockalloc();
394 	}
395 	slp->ns_so = so;
396 	slp->ns_nam = mynam;
397 	mutex_enter(&fp->f_lock);
398 	fp->f_count++;
399 	mutex_exit(&fp->f_lock);
400 	slp->ns_fp = fp;
401 	slp->ns_flags = SLP_VALID;
402 	slp->ns_aflags = SLP_A_NEEDQ;
403 	slp->ns_gflags = 0;
404 	slp->ns_sflags = 0;
405 	solock(so);
406 	so->so_upcallarg = (void *)slp;
407 	so->so_upcall = nfsrv_soupcall;
408 	so->so_rcv.sb_flags |= SB_UPCALL;
409 	sounlock(so);
410 	nfsrv_wakenfsd(slp);
411 	return (0);
412 }
413 
414 /*
415  * Called by nfssvc() for nfsds. Just loops around servicing rpc requests
416  * until it is killed by a signal.
417  */
418 int
419 nfssvc_nfsd(struct nfsd_srvargs *nsd, void *argp, struct lwp *l)
420 {
421 	struct timeval tv;
422 	struct mbuf *m;
423 	struct nfssvc_sock *slp;
424 	struct nfsd *nfsd = nsd->nsd_nfsd;
425 	struct nfsrv_descript *nd = NULL;
426 	struct mbuf *mreq;
427 	u_quad_t cur_usec;
428 	int error = 0, cacherep, siz, sotype, writes_todo;
429 	struct proc *p = l->l_proc;
430 	bool doreinit;
431 
432 #ifndef nolint
433 	cacherep = RC_DOIT;
434 	writes_todo = 0;
435 #endif
436 	if (nfsd == NULL) {
437 		nsd->nsd_nfsd = nfsd = kmem_alloc(sizeof(*nfsd), KM_SLEEP);
438 		memset(nfsd, 0, sizeof (struct nfsd));
439 		cv_init(&nfsd->nfsd_cv, "nfsd");
440 		nfsd->nfsd_procp = p;
441 		mutex_enter(&nfsd_lock);
442 		while ((nfssvc_sockhead_flag & SLP_INIT) != 0) {
443 			KASSERT(nfs_numnfsd == 0);
444 			cv_wait(&nfsd_initcv, &nfsd_lock);
445 		}
446 		TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain);
447 		nfs_numnfsd++;
448 		mutex_exit(&nfsd_lock);
449 	}
450 	/*
451 	 * Loop getting rpc requests until SIGKILL.
452 	 */
453 	for (;;) {
454 		bool dummy;
455 
456 		if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
457 		    != 0) {
458 			preempt();
459 		}
460 		if (nfsd->nfsd_slp == NULL) {
461 			mutex_enter(&nfsd_lock);
462 			while (nfsd->nfsd_slp == NULL &&
463 			    (nfsd_head_flag & NFSD_CHECKSLP) == 0) {
464 				SLIST_INSERT_HEAD(&nfsd_idle_head, nfsd,
465 				    nfsd_idle);
466 				error = cv_wait_sig(&nfsd->nfsd_cv, &nfsd_lock);
467 				if (error) {
468 					slp = nfsd->nfsd_slp;
469 					nfsd->nfsd_slp = NULL;
470 					if (!slp)
471 						SLIST_REMOVE(&nfsd_idle_head,
472 						    nfsd, nfsd, nfsd_idle);
473 					mutex_exit(&nfsd_lock);
474 					if (slp) {
475 						nfsrv_wakenfsd(slp);
476 						nfsrv_slpderef(slp);
477 					}
478 					goto done;
479 				}
480 			}
481 			if (nfsd->nfsd_slp == NULL &&
482 			    (nfsd_head_flag & NFSD_CHECKSLP) != 0) {
483 				slp = TAILQ_FIRST(&nfssvc_sockpending);
484 				if (slp) {
485 					KASSERT((slp->ns_gflags & SLP_G_DOREC)
486 					    != 0);
487 					TAILQ_REMOVE(&nfssvc_sockpending, slp,
488 					    ns_pending);
489 					slp->ns_gflags &= ~SLP_G_DOREC;
490 					slp->ns_sref++;
491 					nfsd->nfsd_slp = slp;
492 				} else
493 					nfsd_head_flag &= ~NFSD_CHECKSLP;
494 			}
495 			KASSERT(nfsd->nfsd_slp == NULL ||
496 			    nfsd->nfsd_slp->ns_sref > 0);
497 			mutex_exit(&nfsd_lock);
498 			if ((slp = nfsd->nfsd_slp) == NULL)
499 				continue;
500 			if (slp->ns_flags & SLP_VALID) {
501 				bool more;
502 
503 				if (nfsdsock_testbits(slp, SLP_A_NEEDQ)) {
504 					nfsrv_rcv(slp);
505 				}
506 				if (nfsdsock_testbits(slp, SLP_A_DISCONN)) {
507 					nfsrv_zapsock(slp);
508 				}
509 				error = nfsrv_dorec(slp, nfsd, &nd, &more);
510 				getmicrotime(&tv);
511 				cur_usec = (u_quad_t)tv.tv_sec * 1000000 +
512 					(u_quad_t)tv.tv_usec;
513 				writes_todo = 0;
514 				if (error) {
515 					struct nfsrv_descript *nd2;
516 
517 					mutex_enter(&nfsd_lock);
518 					nd2 = LIST_FIRST(&slp->ns_tq);
519 					if (nd2 != NULL &&
520 					    nd2->nd_time <= cur_usec) {
521 						error = 0;
522 						cacherep = RC_DOIT;
523 						writes_todo = 1;
524 					}
525 					mutex_exit(&nfsd_lock);
526 				}
527 				if (error == 0 && more) {
528 					nfsrv_wakenfsd(slp);
529 				}
530 			}
531 		} else {
532 			error = 0;
533 			slp = nfsd->nfsd_slp;
534 		}
535 		KASSERT(slp != NULL);
536 		KASSERT(nfsd->nfsd_slp == slp);
537 		if (error || (slp->ns_flags & SLP_VALID) == 0) {
538 			if (nd) {
539 				nfsdreq_free(nd);
540 				nd = NULL;
541 			}
542 			nfsd->nfsd_slp = NULL;
543 			nfsrv_slpderef(slp);
544 			continue;
545 		}
546 		sotype = slp->ns_so->so_type;
547 		if (nd) {
548 			getmicrotime(&nd->nd_starttime);
549 			if (nd->nd_nam2)
550 				nd->nd_nam = nd->nd_nam2;
551 			else
552 				nd->nd_nam = slp->ns_nam;
553 
554 			/*
555 			 * Check to see if authorization is needed.
556 			 */
557 			if (nfsd->nfsd_flag & NFSD_NEEDAUTH) {
558 				nfsd->nfsd_flag &= ~NFSD_NEEDAUTH;
559 				nsd->nsd_haddr = mtod(nd->nd_nam,
560 				    struct sockaddr_in *)->sin_addr.s_addr;
561 				nsd->nsd_authlen = nfsd->nfsd_authlen;
562 				nsd->nsd_verflen = nfsd->nfsd_verflen;
563 				if (!copyout(nfsd->nfsd_authstr,
564 				    nsd->nsd_authstr, nfsd->nfsd_authlen) &&
565 				    !copyout(nfsd->nfsd_verfstr,
566 				    nsd->nsd_verfstr, nfsd->nfsd_verflen) &&
567 				    !copyout(nsd, argp, sizeof (*nsd))) {
568 					return (ENEEDAUTH);
569 				}
570 				cacherep = RC_DROPIT;
571 			} else
572 				cacherep = nfsrv_getcache(nd, slp, &mreq);
573 
574 			if (nfsd->nfsd_flag & NFSD_AUTHFAIL) {
575 				nfsd->nfsd_flag &= ~NFSD_AUTHFAIL;
576 				nd->nd_procnum = NFSPROC_NOOP;
577 				nd->nd_repstat =
578 				    (NFSERR_AUTHERR | AUTH_TOOWEAK);
579 				cacherep = RC_DOIT;
580 			}
581 		}
582 
583 		/*
584 		 * Loop to get all the write rpc relies that have been
585 		 * gathered together.
586 		 */
587 		do {
588 			switch (cacherep) {
589 			case RC_DOIT:
590 				mreq = NULL;
591 				netexport_rdlock();
592 				if (writes_todo || nd == NULL ||
593 				     (!(nd->nd_flag & ND_NFSV3) &&
594 				     nd->nd_procnum == NFSPROC_WRITE &&
595 				     nfsrvw_procrastinate > 0))
596 					error = nfsrv_writegather(&nd, slp,
597 					    l, &mreq);
598 				else
599 					error =
600 					    (*(nfsrv3_procs[nd->nd_procnum]))
601 					    (nd, slp, l, &mreq);
602 				netexport_rdunlock();
603 				if (mreq == NULL) {
604 					if (nd != NULL) {
605 						if (nd->nd_nam2)
606 							m_free(nd->nd_nam2);
607 					}
608 					break;
609 				}
610 				if (error) {
611 					nfsstats.srv_errs++;
612 					if (nd) {
613 						nfsrv_updatecache(nd, false,
614 						    mreq);
615 						if (nd->nd_nam2)
616 							m_freem(nd->nd_nam2);
617 					}
618 					break;
619 				}
620 				if (nd) {
621 					nfsstats.srvrpccnt[nd->nd_procnum]++;
622 					nfsrv_updatecache(nd, true, mreq);
623 					nd->nd_mrep = NULL;
624 				}
625 			case RC_REPLY:
626 				m = mreq;
627 				siz = 0;
628 				while (m) {
629 					siz += m->m_len;
630 					m = m->m_next;
631 				}
632 				if (siz <= 0 || siz > NFS_MAXPACKET) {
633 					printf("mbuf siz=%d\n",siz);
634 					panic("Bad nfs svc reply");
635 				}
636 				m = mreq;
637 				m->m_pkthdr.len = siz;
638 				m->m_pkthdr.rcvif = (struct ifnet *)0;
639 				/*
640 				 * For stream protocols, prepend a Sun RPC
641 				 * Record Mark.
642 				 */
643 				if (sotype == SOCK_STREAM) {
644 					M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
645 					*mtod(m, u_int32_t *) =
646 					    htonl(0x80000000 | siz);
647 				}
648 				if (nd) {
649 					nd->nd_mreq = m;
650 					if (nfsrtton) {
651 						nfsd_rt(slp->ns_so->so_type, nd,
652 						    cacherep);
653 					}
654 					error = nfsdsock_sendreply(slp, nd);
655 					nd = NULL;
656 				}
657 				if (error == EPIPE)
658 					nfsrv_zapsock(slp);
659 				if (error == EINTR || error == ERESTART) {
660 					nfsd->nfsd_slp = NULL;
661 					nfsrv_slpderef(slp);
662 					goto done;
663 				}
664 				break;
665 			case RC_DROPIT:
666 				if (nd) {
667 					if (nfsrtton)
668 						nfsd_rt(sotype, nd, cacherep);
669 					m_freem(nd->nd_mrep);
670 					m_freem(nd->nd_nam2);
671 				}
672 				break;
673 			}
674 			if (nd) {
675 				nfsdreq_free(nd);
676 				nd = NULL;
677 			}
678 
679 			/*
680 			 * Check to see if there are outstanding writes that
681 			 * need to be serviced.
682 			 */
683 			getmicrotime(&tv);
684 			cur_usec = (u_quad_t)tv.tv_sec * 1000000 +
685 			    (u_quad_t)tv.tv_usec;
686 			mutex_enter(&nfsd_lock);
687 			if (LIST_FIRST(&slp->ns_tq) &&
688 			    LIST_FIRST(&slp->ns_tq)->nd_time <= cur_usec) {
689 				cacherep = RC_DOIT;
690 				writes_todo = 1;
691 			} else
692 				writes_todo = 0;
693 			mutex_exit(&nfsd_lock);
694 		} while (writes_todo);
695 		if (nfsrv_dorec(slp, nfsd, &nd, &dummy)) {
696 			nfsd->nfsd_slp = NULL;
697 			nfsrv_slpderef(slp);
698 		}
699 	}
700 done:
701 	mutex_enter(&nfsd_lock);
702 	TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain);
703 	doreinit = --nfs_numnfsd == 0;
704 	if (doreinit)
705 		nfssvc_sockhead_flag |= SLP_INIT;
706 	mutex_exit(&nfsd_lock);
707 	cv_destroy(&nfsd->nfsd_cv);
708 	kmem_free(nfsd, sizeof(*nfsd));
709 	nsd->nsd_nfsd = NULL;
710 	if (doreinit)
711 		nfsrv_init(true);	/* Reinitialize everything */
712 	return (error);
713 }
714 
715 /*
716  * Shut down a socket associated with an nfssvc_sock structure.
717  * Should be called with the send lock set, if required.
718  * The trick here is to increment the sref at the start, so that the nfsds
719  * will stop using it and clear ns_flag at the end so that it will not be
720  * reassigned during cleanup.
721  *
722  * called at splsoftnet.
723  */
724 void
725 nfsrv_zapsock(struct nfssvc_sock *slp)
726 {
727 	struct nfsuid *nuidp, *nnuidp;
728 	struct nfsrv_descript *nwp;
729 	struct socket *so;
730 	struct mbuf *m;
731 
732 	if (nfsdsock_drain(slp)) {
733 		return;
734 	}
735 	mutex_enter(&nfsd_lock);
736 	if (slp->ns_gflags & SLP_G_DOREC) {
737 		TAILQ_REMOVE(&nfssvc_sockpending, slp, ns_pending);
738 		slp->ns_gflags &= ~SLP_G_DOREC;
739 	}
740 	mutex_exit(&nfsd_lock);
741 
742 	so = slp->ns_so;
743 	KASSERT(so != NULL);
744 	solock(so);
745 	so->so_upcall = NULL;
746 	so->so_upcallarg = NULL;
747 	so->so_rcv.sb_flags &= ~SB_UPCALL;
748 	soshutdown(so, SHUT_RDWR);
749 	sounlock(so);
750 
751 	m_freem(slp->ns_raw);
752 	m = slp->ns_rec;
753 	while (m != NULL) {
754 		struct mbuf *n;
755 
756 		n = m->m_nextpkt;
757 		m_freem(m);
758 		m = n;
759 	}
760 	/* XXX what about freeing ns_frag ? */
761 	for (nuidp = TAILQ_FIRST(&slp->ns_uidlruhead); nuidp != 0;
762 	    nuidp = nnuidp) {
763 		nnuidp = TAILQ_NEXT(nuidp, nu_lru);
764 		LIST_REMOVE(nuidp, nu_hash);
765 		TAILQ_REMOVE(&slp->ns_uidlruhead, nuidp, nu_lru);
766 		if (nuidp->nu_flag & NU_NAM)
767 			m_freem(nuidp->nu_nam);
768 		kmem_free(nuidp, sizeof(*nuidp));
769 	}
770 	mutex_enter(&nfsd_lock);
771 	while ((nwp = LIST_FIRST(&slp->ns_tq)) != NULL) {
772 		LIST_REMOVE(nwp, nd_tq);
773 		mutex_exit(&nfsd_lock);
774 		nfsdreq_free(nwp);
775 		mutex_enter(&nfsd_lock);
776 	}
777 	mutex_exit(&nfsd_lock);
778 }
779 
780 /*
781  * Derefence a server socket structure. If it has no more references and
782  * is no longer valid, you can throw it away.
783  */
784 void
785 nfsrv_slpderef(struct nfssvc_sock *slp)
786 {
787 	uint32_t ref;
788 
789 	mutex_enter(&nfsd_lock);
790 	KASSERT(slp->ns_sref > 0);
791 	ref = --slp->ns_sref;
792 	if (ref == 0 && (slp->ns_flags & SLP_VALID) == 0) {
793 		file_t *fp;
794 
795 		KASSERT((slp->ns_gflags & SLP_G_DOREC) == 0);
796 		TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain);
797 		mutex_exit(&nfsd_lock);
798 
799 		fp = slp->ns_fp;
800 		if (fp != NULL) {
801 			slp->ns_fp = NULL;
802 			KASSERT(fp != NULL);
803 			KASSERT(fp->f_data == slp->ns_so);
804 			KASSERT(fp->f_count > 0);
805 			closef(fp);
806 			slp->ns_so = NULL;
807 		}
808 
809 		if (slp->ns_nam)
810 			m_free(slp->ns_nam);
811 		nfsrv_sockfree(slp);
812 	} else
813 		mutex_exit(&nfsd_lock);
814 }
815 
816 /*
817  * Initialize the data structures for the server.
818  * Handshake with any new nfsds starting up to avoid any chance of
819  * corruption.
820  */
821 void
822 nfsrv_init(int terminating)
823 {
824 	struct nfssvc_sock *slp;
825 
826 	if (!terminating) {
827 		mutex_init(&nfsd_lock, MUTEX_DRIVER, IPL_SOFTNET);
828 		cv_init(&nfsd_initcv, "nfsdinit");
829 	}
830 
831 	mutex_enter(&nfsd_lock);
832 	if (!terminating && (nfssvc_sockhead_flag & SLP_INIT) != 0)
833 		panic("nfsd init");
834 	nfssvc_sockhead_flag |= SLP_INIT;
835 
836 	if (terminating) {
837 		KASSERT(SLIST_EMPTY(&nfsd_idle_head));
838 		KASSERT(TAILQ_EMPTY(&nfsd_head));
839 		while ((slp = TAILQ_FIRST(&nfssvc_sockhead)) != NULL) {
840 			mutex_exit(&nfsd_lock);
841 			KASSERT(slp->ns_sref == 0);
842 			slp->ns_sref++;
843 			nfsrv_zapsock(slp);
844 			nfsrv_slpderef(slp);
845 			mutex_enter(&nfsd_lock);
846 		}
847 		KASSERT(TAILQ_EMPTY(&nfssvc_sockpending));
848 		mutex_exit(&nfsd_lock);
849 		nfsrv_cleancache();	/* And clear out server cache */
850 	} else {
851 		mutex_exit(&nfsd_lock);
852 		nfs_pub.np_valid = 0;
853 	}
854 
855 	TAILQ_INIT(&nfssvc_sockhead);
856 	TAILQ_INIT(&nfssvc_sockpending);
857 
858 	TAILQ_INIT(&nfsd_head);
859 	SLIST_INIT(&nfsd_idle_head);
860 	nfsd_head_flag &= ~NFSD_CHECKSLP;
861 
862 	nfs_udpsock = nfsrv_sockalloc();
863 	nfs_udp6sock = nfsrv_sockalloc();
864 
865 	mutex_enter(&nfsd_lock);
866 	nfssvc_sockhead_flag &= ~SLP_INIT;
867 	cv_broadcast(&nfsd_initcv);
868 	mutex_exit(&nfsd_lock);
869 }
870 
871 void
872 nfsrv_fini(void)
873 {
874 
875 	nfsrv_init(true);
876 	cv_destroy(&nfsd_initcv);
877 	mutex_destroy(&nfsd_lock);
878 }
879 
880 /*
881  * Add entries to the server monitor log.
882  */
883 static void
884 nfsd_rt(int sotype, struct nfsrv_descript *nd, int cacherep)
885 {
886 	struct timeval tv;
887 	struct drt *rt;
888 
889 	rt = &nfsdrt.drt[nfsdrt.pos];
890 	if (cacherep == RC_DOIT)
891 		rt->flag = 0;
892 	else if (cacherep == RC_REPLY)
893 		rt->flag = DRT_CACHEREPLY;
894 	else
895 		rt->flag = DRT_CACHEDROP;
896 	if (sotype == SOCK_STREAM)
897 		rt->flag |= DRT_TCP;
898 	if (nd->nd_flag & ND_NFSV3)
899 		rt->flag |= DRT_NFSV3;
900 	rt->proc = nd->nd_procnum;
901 	if (mtod(nd->nd_nam, struct sockaddr *)->sa_family == AF_INET)
902 	    rt->ipadr = mtod(nd->nd_nam, struct sockaddr_in *)->sin_addr.s_addr;
903 	else
904 	    rt->ipadr = INADDR_ANY;
905 	getmicrotime(&tv);
906 	rt->resptime = ((tv.tv_sec - nd->nd_starttime.tv_sec) * 1000000) +
907 		(tv.tv_usec - nd->nd_starttime.tv_usec);
908 	rt->tstamp = tv;
909 	nfsdrt.pos = (nfsdrt.pos + 1) % NFSRTTLOGSIZ;
910 }
911