xref: /netbsd-src/sys/nfs/nfs_clntsocket.c (revision aad9773e38ed2370a628a6416e098f9008fc10a7)
1 /*	$NetBSD: nfs_clntsocket.c,v 1.2 2014/09/05 05:34:57 matt Exp $	*/
2 
3 /*
4  * Copyright (c) 1989, 1991, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
35  */
36 
37 /*
38  * Socket operations for use by nfs
39  */
40 
41 #include <sys/cdefs.h>
42 __KERNEL_RCSID(0, "$NetBSD: nfs_clntsocket.c,v 1.2 2014/09/05 05:34:57 matt Exp $");
43 
44 #ifdef _KERNEL_OPT
45 #include "opt_nfs.h"
46 #include "opt_mbuftrace.h"
47 #endif
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/evcnt.h>
52 #include <sys/callout.h>
53 #include <sys/proc.h>
54 #include <sys/mount.h>
55 #include <sys/kernel.h>
56 #include <sys/kmem.h>
57 #include <sys/mbuf.h>
58 #include <sys/vnode.h>
59 #include <sys/domain.h>
60 #include <sys/protosw.h>
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/syslog.h>
64 #include <sys/tprintf.h>
65 #include <sys/namei.h>
66 #include <sys/signal.h>
67 #include <sys/signalvar.h>
68 #include <sys/kauth.h>
69 
70 #include <netinet/in.h>
71 #include <netinet/tcp.h>
72 
73 #include <nfs/rpcv2.h>
74 #include <nfs/nfsproto.h>
75 #include <nfs/nfs.h>
76 #include <nfs/xdr_subs.h>
77 #include <nfs/nfsm_subs.h>
78 #include <nfs/nfsmount.h>
79 #include <nfs/nfsnode.h>
80 #include <nfs/nfsrtt.h>
81 #include <nfs/nfs_var.h>
82 
83 static int nfs_sndlock(struct nfsmount *, struct nfsreq *);
84 static void nfs_sndunlock(struct nfsmount *);
85 
86 /*
87  * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
88  * done by soreceive(), but for SOCK_STREAM we must deal with the Record
89  * Mark and consolidate the data into a new mbuf list.
90  * nb: Sometimes TCP passes the data up to soreceive() in long lists of
91  *     small mbufs.
92  * For SOCK_STREAM we must be very careful to read an entire record once
93  * we have read any of it, even if the system call has been interrupted.
94  */
95 static int
96 nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp,
97     struct lwp *l)
98 {
99 	struct socket *so;
100 	struct uio auio;
101 	struct iovec aio;
102 	struct mbuf *m;
103 	struct mbuf *control;
104 	u_int32_t len;
105 	struct mbuf **getnam;
106 	int error, sotype, rcvflg;
107 
108 	/*
109 	 * Set up arguments for soreceive()
110 	 */
111 	*mp = NULL;
112 	*aname = NULL;
113 	sotype = rep->r_nmp->nm_sotype;
114 
115 	/*
116 	 * For reliable protocols, lock against other senders/receivers
117 	 * in case a reconnect is necessary.
118 	 * For SOCK_STREAM, first get the Record Mark to find out how much
119 	 * more there is to get.
120 	 * We must lock the socket against other receivers
121 	 * until we have an entire rpc request/reply.
122 	 */
123 	if (sotype != SOCK_DGRAM) {
124 		error = nfs_sndlock(rep->r_nmp, rep);
125 		if (error)
126 			return (error);
127 tryagain:
128 		/*
129 		 * Check for fatal errors and resending request.
130 		 */
131 		/*
132 		 * Ugh: If a reconnect attempt just happened, nm_so
133 		 * would have changed. NULL indicates a failed
134 		 * attempt that has essentially shut down this
135 		 * mount point.
136 		 */
137 		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
138 			nfs_sndunlock(rep->r_nmp);
139 			return (EINTR);
140 		}
141 		so = rep->r_nmp->nm_so;
142 		if (!so) {
143 			error = nfs_reconnect(rep);
144 			if (error) {
145 				nfs_sndunlock(rep->r_nmp);
146 				return (error);
147 			}
148 			goto tryagain;
149 		}
150 		while (rep->r_flags & R_MUSTRESEND) {
151 			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
152 			nfsstats.rpcretries++;
153 			rep->r_rtt = 0;
154 			rep->r_flags &= ~R_TIMING;
155 			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep, l);
156 			if (error) {
157 				if (error == EINTR || error == ERESTART ||
158 				    (error = nfs_reconnect(rep)) != 0) {
159 					nfs_sndunlock(rep->r_nmp);
160 					return (error);
161 				}
162 				goto tryagain;
163 			}
164 		}
165 		nfs_sndunlock(rep->r_nmp);
166 		if (sotype == SOCK_STREAM) {
167 			aio.iov_base = (void *) &len;
168 			aio.iov_len = sizeof(u_int32_t);
169 			auio.uio_iov = &aio;
170 			auio.uio_iovcnt = 1;
171 			auio.uio_rw = UIO_READ;
172 			auio.uio_offset = 0;
173 			auio.uio_resid = sizeof(u_int32_t);
174 			UIO_SETUP_SYSSPACE(&auio);
175 			do {
176 			   rcvflg = MSG_WAITALL;
177 			   error = (*so->so_receive)(so, NULL, &auio,
178 				NULL, NULL, &rcvflg);
179 			   if (error == EWOULDBLOCK && rep) {
180 				if (rep->r_flags & R_SOFTTERM)
181 					return (EINTR);
182 				/*
183 				 * if it seems that the server died after it
184 				 * received our request, set EPIPE so that
185 				 * we'll reconnect and retransmit requests.
186 				 */
187 				if (rep->r_rexmit >= rep->r_nmp->nm_retry) {
188 					nfsstats.rpctimeouts++;
189 					error = EPIPE;
190 				}
191 			   }
192 			} while (error == EWOULDBLOCK);
193 			if (!error && auio.uio_resid > 0) {
194 			    /*
195 			     * Don't log a 0 byte receive; it means
196 			     * that the socket has been closed, and
197 			     * can happen during normal operation
198 			     * (forcible unmount or Solaris server).
199 			     */
200 			    if (auio.uio_resid != sizeof (u_int32_t))
201 			      log(LOG_INFO,
202 				 "short receive (%lu/%lu) from nfs server %s\n",
203 				 (u_long)sizeof(u_int32_t) - auio.uio_resid,
204 				 (u_long)sizeof(u_int32_t),
205 				 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
206 			    error = EPIPE;
207 			}
208 			if (error)
209 				goto errout;
210 			len = ntohl(len) & ~0x80000000;
211 			/*
212 			 * This is SERIOUS! We are out of sync with the sender
213 			 * and forcing a disconnect/reconnect is all I can do.
214 			 */
215 			if (len > NFS_MAXPACKET) {
216 			    log(LOG_ERR, "%s (%d) from nfs server %s\n",
217 				"impossible packet length",
218 				len,
219 				rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
220 			    error = EFBIG;
221 			    goto errout;
222 			}
223 			auio.uio_resid = len;
224 			do {
225 			    rcvflg = MSG_WAITALL;
226 			    error =  (*so->so_receive)(so, NULL,
227 				&auio, mp, NULL, &rcvflg);
228 			} while (error == EWOULDBLOCK || error == EINTR ||
229 				 error == ERESTART);
230 			if (!error && auio.uio_resid > 0) {
231 			    if (len != auio.uio_resid)
232 			      log(LOG_INFO,
233 				"short receive (%lu/%d) from nfs server %s\n",
234 				(u_long)len - auio.uio_resid, len,
235 				rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
236 			    error = EPIPE;
237 			}
238 		} else {
239 			/*
240 			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
241 			 * and soreceive() will return when it has either a
242 			 * control msg or a data msg.
243 			 * We have no use for control msg., but must grab them
244 			 * and then throw them away so we know what is going
245 			 * on.
246 			 */
247 			auio.uio_resid = len = 100000000; /* Anything Big */
248 			/* not need to setup uio_vmspace */
249 			do {
250 			    rcvflg = 0;
251 			    error =  (*so->so_receive)(so, NULL,
252 				&auio, mp, &control, &rcvflg);
253 			    if (control)
254 				m_freem(control);
255 			    if (error == EWOULDBLOCK && rep) {
256 				if (rep->r_flags & R_SOFTTERM)
257 					return (EINTR);
258 			    }
259 			} while (error == EWOULDBLOCK ||
260 				 (!error && *mp == NULL && control));
261 			if ((rcvflg & MSG_EOR) == 0)
262 				printf("Egad!!\n");
263 			if (!error && *mp == NULL)
264 				error = EPIPE;
265 			len -= auio.uio_resid;
266 		}
267 errout:
268 		if (error && error != EINTR && error != ERESTART) {
269 			m_freem(*mp);
270 			*mp = NULL;
271 			if (error != EPIPE)
272 				log(LOG_INFO,
273 				    "receive error %d from nfs server %s\n",
274 				    error,
275 				 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
276 			error = nfs_sndlock(rep->r_nmp, rep);
277 			if (!error)
278 				error = nfs_reconnect(rep);
279 			if (!error)
280 				goto tryagain;
281 			else
282 				nfs_sndunlock(rep->r_nmp);
283 		}
284 	} else {
285 		if ((so = rep->r_nmp->nm_so) == NULL)
286 			return (EACCES);
287 		if (so->so_state & SS_ISCONNECTED)
288 			getnam = NULL;
289 		else
290 			getnam = aname;
291 		auio.uio_resid = len = 1000000;
292 		/* not need to setup uio_vmspace */
293 		do {
294 			rcvflg = 0;
295 			error =  (*so->so_receive)(so, getnam, &auio, mp,
296 				NULL, &rcvflg);
297 			if (error == EWOULDBLOCK &&
298 			    (rep->r_flags & R_SOFTTERM))
299 				return (EINTR);
300 		} while (error == EWOULDBLOCK);
301 		len -= auio.uio_resid;
302 		if (!error && *mp == NULL)
303 			error = EPIPE;
304 	}
305 	if (error) {
306 		m_freem(*mp);
307 		*mp = NULL;
308 	}
309 	return (error);
310 }
311 
312 /*
313  * Implement receipt of reply on a socket.
314  * We must search through the list of received datagrams matching them
315  * with outstanding requests using the xid, until ours is found.
316  */
317 /* ARGSUSED */
318 static int
319 nfs_reply(struct nfsreq *myrep, struct lwp *lwp)
320 {
321 	struct nfsreq *rep;
322 	struct nfsmount *nmp = myrep->r_nmp;
323 	int32_t t1;
324 	struct mbuf *mrep, *nam, *md;
325 	u_int32_t rxid, *tl;
326 	char *dpos, *cp2;
327 	int error;
328 
329 	/*
330 	 * Loop around until we get our own reply
331 	 */
332 	for (;;) {
333 		/*
334 		 * Lock against other receivers so that I don't get stuck in
335 		 * sbwait() after someone else has received my reply for me.
336 		 * Also necessary for connection based protocols to avoid
337 		 * race conditions during a reconnect.
338 		 */
339 		error = nfs_rcvlock(nmp, myrep);
340 		if (error == EALREADY)
341 			return (0);
342 		if (error)
343 			return (error);
344 		/*
345 		 * Get the next Rpc reply off the socket
346 		 */
347 
348 		mutex_enter(&nmp->nm_lock);
349 		nmp->nm_waiters++;
350 		mutex_exit(&nmp->nm_lock);
351 
352 		error = nfs_receive(myrep, &nam, &mrep, lwp);
353 
354 		mutex_enter(&nmp->nm_lock);
355 		nmp->nm_waiters--;
356 		cv_signal(&nmp->nm_disconcv);
357 		mutex_exit(&nmp->nm_lock);
358 
359 		if (error) {
360 			nfs_rcvunlock(nmp);
361 
362 			if (nmp->nm_iflag & NFSMNT_DISMNT) {
363 				/*
364 				 * Oops, we're going away now..
365 				 */
366 				return error;
367 			}
368 			/*
369 			 * Ignore routing errors on connectionless protocols? ?
370 			 */
371 			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
372 				nmp->nm_so->so_error = 0;
373 #ifdef DEBUG
374 				if (ratecheck(&nfs_reply_last_err_time,
375 				    &nfs_err_interval))
376 					printf("%s: ignoring error %d\n",
377 					       __func__, error);
378 #endif
379 				continue;
380 			}
381 			return (error);
382 		}
383 		if (nam)
384 			m_freem(nam);
385 
386 		/*
387 		 * Get the xid and check that it is an rpc reply
388 		 */
389 		md = mrep;
390 		dpos = mtod(md, void *);
391 		nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
392 		rxid = *tl++;
393 		if (*tl != rpc_reply) {
394 			nfsstats.rpcinvalid++;
395 			m_freem(mrep);
396 nfsmout:
397 			nfs_rcvunlock(nmp);
398 			continue;
399 		}
400 
401 		/*
402 		 * Loop through the request list to match up the reply
403 		 * Iff no match, just drop the datagram
404 		 */
405 		TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
406 			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
407 				/* Found it.. */
408 				rep->r_mrep = mrep;
409 				rep->r_md = md;
410 				rep->r_dpos = dpos;
411 				if (nfsrtton) {
412 					struct rttl *rt;
413 
414 					rt = &nfsrtt.rttl[nfsrtt.pos];
415 					rt->proc = rep->r_procnum;
416 					rt->rto = NFS_RTO(nmp, nfs_proct[rep->r_procnum]);
417 					rt->sent = nmp->nm_sent;
418 					rt->cwnd = nmp->nm_cwnd;
419 					rt->srtt = nmp->nm_srtt[nfs_proct[rep->r_procnum] - 1];
420 					rt->sdrtt = nmp->nm_sdrtt[nfs_proct[rep->r_procnum] - 1];
421 					rt->fsid = nmp->nm_mountp->mnt_stat.f_fsidx;
422 					getmicrotime(&rt->tstamp);
423 					if (rep->r_flags & R_TIMING)
424 						rt->rtt = rep->r_rtt;
425 					else
426 						rt->rtt = 1000000;
427 					nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
428 				}
429 				/*
430 				 * Update congestion window.
431 				 * Do the additive increase of
432 				 * one rpc/rtt.
433 				 */
434 				if (nmp->nm_cwnd <= nmp->nm_sent) {
435 					nmp->nm_cwnd +=
436 					   (NFS_CWNDSCALE * NFS_CWNDSCALE +
437 					   (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
438 					if (nmp->nm_cwnd > NFS_MAXCWND)
439 						nmp->nm_cwnd = NFS_MAXCWND;
440 				}
441 				rep->r_flags &= ~R_SENT;
442 				nmp->nm_sent -= NFS_CWNDSCALE;
443 				/*
444 				 * Update rtt using a gain of 0.125 on the mean
445 				 * and a gain of 0.25 on the deviation.
446 				 */
447 				if (rep->r_flags & R_TIMING) {
448 					/*
449 					 * Since the timer resolution of
450 					 * NFS_HZ is so course, it can often
451 					 * result in r_rtt == 0. Since
452 					 * r_rtt == N means that the actual
453 					 * rtt is between N+dt and N+2-dt ticks,
454 					 * add 1.
455 					 */
456 					t1 = rep->r_rtt + 1;
457 					t1 -= (NFS_SRTT(rep) >> 3);
458 					NFS_SRTT(rep) += t1;
459 					if (t1 < 0)
460 						t1 = -t1;
461 					t1 -= (NFS_SDRTT(rep) >> 2);
462 					NFS_SDRTT(rep) += t1;
463 				}
464 				nmp->nm_timeouts = 0;
465 				break;
466 			}
467 		}
468 		nfs_rcvunlock(nmp);
469 		/*
470 		 * If not matched to a request, drop it.
471 		 * If it's mine, get out.
472 		 */
473 		if (rep == 0) {
474 			nfsstats.rpcunexpected++;
475 			m_freem(mrep);
476 		} else if (rep == myrep) {
477 			if (rep->r_mrep == NULL)
478 				panic("nfsreply nil");
479 			return (0);
480 		}
481 	}
482 }
483 
484 /*
485  * nfs_request - goes something like this
486  *	- fill in request struct
487  *	- links it into list
488  *	- calls nfs_send() for first transmit
489  *	- calls nfs_receive() to get reply
490  *	- break down rpc header and return with nfs reply pointed to
491  *	  by mrep or error
492  * nb: always frees up mreq mbuf list
493  */
494 int
495 nfs_request(struct nfsnode *np, struct mbuf *mrest, int procnum, struct lwp *lwp, kauth_cred_t cred, struct mbuf **mrp, struct mbuf **mdp, char **dposp, int *rexmitp)
496 {
497 	struct mbuf *m, *mrep;
498 	struct nfsreq *rep;
499 	u_int32_t *tl;
500 	int i;
501 	struct nfsmount *nmp = VFSTONFS(np->n_vnode->v_mount);
502 	struct mbuf *md, *mheadend;
503 	char nickv[RPCX_NICKVERF];
504 	time_t waituntil;
505 	char *dpos, *cp2;
506 	int t1, s, error = 0, mrest_len, auth_len, auth_type;
507 	int trylater_delay = NFS_TRYLATERDEL, failed_auth = 0;
508 	int verf_len, verf_type;
509 	u_int32_t xid;
510 	char *auth_str, *verf_str;
511 	NFSKERBKEY_T key;		/* save session key */
512 	kauth_cred_t acred;
513 	struct mbuf *mrest_backup = NULL;
514 	kauth_cred_t origcred = NULL; /* XXX: gcc */
515 	bool retry_cred = true;
516 	bool use_opencred = (np->n_flag & NUSEOPENCRED) != 0;
517 
518 	if (rexmitp != NULL)
519 		*rexmitp = 0;
520 
521 	acred = kauth_cred_alloc();
522 
523 tryagain_cred:
524 	KASSERT(cred != NULL);
525 	rep = kmem_alloc(sizeof(*rep), KM_SLEEP);
526 	rep->r_nmp = nmp;
527 	KASSERT(lwp == NULL || lwp == curlwp);
528 	rep->r_lwp = lwp;
529 	rep->r_procnum = procnum;
530 	i = 0;
531 	m = mrest;
532 	while (m) {
533 		i += m->m_len;
534 		m = m->m_next;
535 	}
536 	mrest_len = i;
537 
538 	/*
539 	 * Get the RPC header with authorization.
540 	 */
541 kerbauth:
542 	verf_str = auth_str = NULL;
543 	if (nmp->nm_flag & NFSMNT_KERB) {
544 		verf_str = nickv;
545 		verf_len = sizeof (nickv);
546 		auth_type = RPCAUTH_KERB4;
547 		memset((void *)key, 0, sizeof (key));
548 		if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
549 			&auth_len, verf_str, verf_len)) {
550 			error = nfs_getauth(nmp, rep, cred, &auth_str,
551 				&auth_len, verf_str, &verf_len, key);
552 			if (error) {
553 				kmem_free(rep, sizeof(*rep));
554 				m_freem(mrest);
555 				KASSERT(kauth_cred_getrefcnt(acred) == 1);
556 				kauth_cred_free(acred);
557 				return (error);
558 			}
559 		}
560 		retry_cred = false;
561 	} else {
562 		/* AUTH_UNIX */
563 		uid_t uid;
564 		gid_t gid;
565 
566 		/*
567 		 * on the most unix filesystems, permission checks are
568 		 * done when the file is open(2)'ed.
569 		 * ie. once a file is successfully open'ed,
570 		 * following i/o operations never fail with EACCES.
571 		 * we try to follow the semantics as far as possible.
572 		 *
573 		 * note that we expect that the nfs server always grant
574 		 * accesses by the file's owner.
575 		 */
576 		origcred = cred;
577 		switch (procnum) {
578 		case NFSPROC_READ:
579 		case NFSPROC_WRITE:
580 		case NFSPROC_COMMIT:
581 			uid = np->n_vattr->va_uid;
582 			gid = np->n_vattr->va_gid;
583 			if (kauth_cred_geteuid(cred) == uid &&
584 			    kauth_cred_getegid(cred) == gid) {
585 				retry_cred = false;
586 				break;
587 			}
588 			if (use_opencred)
589 				break;
590 			kauth_cred_setuid(acred, uid);
591 			kauth_cred_seteuid(acred, uid);
592 			kauth_cred_setsvuid(acred, uid);
593 			kauth_cred_setgid(acred, gid);
594 			kauth_cred_setegid(acred, gid);
595 			kauth_cred_setsvgid(acred, gid);
596 			cred = acred;
597 			break;
598 		default:
599 			retry_cred = false;
600 			break;
601 		}
602 		/*
603 		 * backup mbuf chain if we can need it later to retry.
604 		 *
605 		 * XXX maybe we can keep a direct reference to
606 		 * mrest without doing m_copym, but it's ...ugly.
607 		 */
608 		if (retry_cred)
609 			mrest_backup = m_copym(mrest, 0, M_COPYALL, M_WAIT);
610 		auth_type = RPCAUTH_UNIX;
611 		/* XXX elad - ngroups */
612 		auth_len = (((kauth_cred_ngroups(cred) > nmp->nm_numgrps) ?
613 			nmp->nm_numgrps : kauth_cred_ngroups(cred)) << 2) +
614 			5 * NFSX_UNSIGNED;
615 	}
616 	m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
617 	     auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid);
618 	if (auth_str)
619 		free(auth_str, M_TEMP);
620 
621 	/*
622 	 * For stream protocols, insert a Sun RPC Record Mark.
623 	 */
624 	if (nmp->nm_sotype == SOCK_STREAM) {
625 		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
626 		*mtod(m, u_int32_t *) = htonl(0x80000000 |
627 			 (m->m_pkthdr.len - NFSX_UNSIGNED));
628 	}
629 	rep->r_mreq = m;
630 	rep->r_xid = xid;
631 tryagain:
632 	if (nmp->nm_flag & NFSMNT_SOFT)
633 		rep->r_retry = nmp->nm_retry;
634 	else
635 		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
636 	rep->r_rtt = rep->r_rexmit = 0;
637 	if (nfs_proct[procnum] > 0)
638 		rep->r_flags = R_TIMING;
639 	else
640 		rep->r_flags = 0;
641 	rep->r_mrep = NULL;
642 
643 	/*
644 	 * Do the client side RPC.
645 	 */
646 	nfsstats.rpcrequests++;
647 	/*
648 	 * Chain request into list of outstanding requests. Be sure
649 	 * to put it LAST so timer finds oldest requests first.
650 	 */
651 	s = splsoftnet();
652 	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
653 	nfs_timer_start();
654 
655 	/*
656 	 * If backing off another request or avoiding congestion, don't
657 	 * send this one now but let timer do it. If not timing a request,
658 	 * do it now.
659 	 */
660 	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
661 	    (nmp->nm_flag & NFSMNT_DUMBTIMR) || nmp->nm_sent < nmp->nm_cwnd)) {
662 		splx(s);
663 		if (nmp->nm_soflags & PR_CONNREQUIRED)
664 			error = nfs_sndlock(nmp, rep);
665 		if (!error) {
666 			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
667 			error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep, lwp);
668 			if (nmp->nm_soflags & PR_CONNREQUIRED)
669 				nfs_sndunlock(nmp);
670 		}
671 		s = splsoftnet();
672 		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
673 			if ((rep->r_flags & R_SENT) == 0) {
674 				nmp->nm_sent += NFS_CWNDSCALE;
675 				rep->r_flags |= R_SENT;
676 			}
677 		}
678 		splx(s);
679 	} else {
680 		splx(s);
681 		rep->r_rtt = -1;
682 	}
683 
684 	/*
685 	 * Wait for the reply from our send or the timer's.
686 	 */
687 	if (!error || error == EPIPE || error == EWOULDBLOCK)
688 		error = nfs_reply(rep, lwp);
689 
690 	/*
691 	 * RPC done, unlink the request.
692 	 */
693 	s = splsoftnet();
694 	TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
695 
696 	/*
697 	 * Decrement the outstanding request count.
698 	 */
699 	if (rep->r_flags & R_SENT) {
700 		rep->r_flags &= ~R_SENT;	/* paranoia */
701 		nmp->nm_sent -= NFS_CWNDSCALE;
702 	}
703 	splx(s);
704 
705 	if (rexmitp != NULL) {
706 		int rexmit;
707 
708 		if (nmp->nm_sotype != SOCK_DGRAM)
709 			rexmit = (rep->r_flags & R_REXMITTED) != 0;
710 		else
711 			rexmit = rep->r_rexmit;
712 		*rexmitp = rexmit;
713 	}
714 
715 	/*
716 	 * If there was a successful reply and a tprintf msg.
717 	 * tprintf a response.
718 	 */
719 	if (!error && (rep->r_flags & R_TPRINTFMSG))
720 		nfs_msg(rep->r_lwp, nmp->nm_mountp->mnt_stat.f_mntfromname,
721 		    "is alive again");
722 	mrep = rep->r_mrep;
723 	md = rep->r_md;
724 	dpos = rep->r_dpos;
725 	if (error)
726 		goto nfsmout;
727 
728 	/*
729 	 * break down the rpc header and check if ok
730 	 */
731 	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
732 	if (*tl++ == rpc_msgdenied) {
733 		if (*tl == rpc_mismatch)
734 			error = EOPNOTSUPP;
735 		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
736 			if (!failed_auth) {
737 				failed_auth++;
738 				mheadend->m_next = NULL;
739 				m_freem(mrep);
740 				m_freem(rep->r_mreq);
741 				goto kerbauth;
742 			} else
743 				error = EAUTH;
744 		} else
745 			error = EACCES;
746 		m_freem(mrep);
747 		goto nfsmout;
748 	}
749 
750 	/*
751 	 * Grab any Kerberos verifier, otherwise just throw it away.
752 	 */
753 	verf_type = fxdr_unsigned(int, *tl++);
754 	i = fxdr_unsigned(int32_t, *tl);
755 	if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
756 		error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
757 		if (error)
758 			goto nfsmout;
759 	} else if (i > 0)
760 		nfsm_adv(nfsm_rndup(i));
761 	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
762 	/* 0 == ok */
763 	if (*tl == 0) {
764 		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
765 		if (*tl != 0) {
766 			error = fxdr_unsigned(int, *tl);
767 			switch (error) {
768 			case NFSERR_PERM:
769 				error = EPERM;
770 				break;
771 
772 			case NFSERR_NOENT:
773 				error = ENOENT;
774 				break;
775 
776 			case NFSERR_IO:
777 				error = EIO;
778 				break;
779 
780 			case NFSERR_NXIO:
781 				error = ENXIO;
782 				break;
783 
784 			case NFSERR_ACCES:
785 				error = EACCES;
786 				if (!retry_cred)
787 					break;
788 				m_freem(mrep);
789 				m_freem(rep->r_mreq);
790 				kmem_free(rep, sizeof(*rep));
791 				use_opencred = !use_opencred;
792 				if (mrest_backup == NULL) {
793 					/* m_copym failure */
794 					KASSERT(
795 					    kauth_cred_getrefcnt(acred) == 1);
796 					kauth_cred_free(acred);
797 					return ENOMEM;
798 				}
799 				mrest = mrest_backup;
800 				mrest_backup = NULL;
801 				cred = origcred;
802 				error = 0;
803 				retry_cred = false;
804 				goto tryagain_cred;
805 
806 			case NFSERR_EXIST:
807 				error = EEXIST;
808 				break;
809 
810 			case NFSERR_XDEV:
811 				error = EXDEV;
812 				break;
813 
814 			case NFSERR_NODEV:
815 				error = ENODEV;
816 				break;
817 
818 			case NFSERR_NOTDIR:
819 				error = ENOTDIR;
820 				break;
821 
822 			case NFSERR_ISDIR:
823 				error = EISDIR;
824 				break;
825 
826 			case NFSERR_INVAL:
827 				error = EINVAL;
828 				break;
829 
830 			case NFSERR_FBIG:
831 				error = EFBIG;
832 				break;
833 
834 			case NFSERR_NOSPC:
835 				error = ENOSPC;
836 				break;
837 
838 			case NFSERR_ROFS:
839 				error = EROFS;
840 				break;
841 
842 			case NFSERR_MLINK:
843 				error = EMLINK;
844 				break;
845 
846 			case NFSERR_TIMEDOUT:
847 				error = ETIMEDOUT;
848 				break;
849 
850 			case NFSERR_NAMETOL:
851 				error = ENAMETOOLONG;
852 				break;
853 
854 			case NFSERR_NOTEMPTY:
855 				error = ENOTEMPTY;
856 				break;
857 
858 			case NFSERR_DQUOT:
859 				error = EDQUOT;
860 				break;
861 
862 			case NFSERR_STALE:
863 				/*
864 				 * If the File Handle was stale, invalidate the
865 				 * lookup cache, just in case.
866 				 */
867 				error = ESTALE;
868 				cache_purge(NFSTOV(np));
869 				break;
870 
871 			case NFSERR_REMOTE:
872 				error = EREMOTE;
873 				break;
874 
875 			case NFSERR_WFLUSH:
876 			case NFSERR_BADHANDLE:
877 			case NFSERR_NOT_SYNC:
878 			case NFSERR_BAD_COOKIE:
879 				error = EINVAL;
880 				break;
881 
882 			case NFSERR_NOTSUPP:
883 				error = ENOTSUP;
884 				break;
885 
886 			case NFSERR_TOOSMALL:
887 			case NFSERR_SERVERFAULT:
888 			case NFSERR_BADTYPE:
889 				error = EINVAL;
890 				break;
891 
892 			case NFSERR_TRYLATER:
893 				if ((nmp->nm_flag & NFSMNT_NFSV3) == 0)
894 					break;
895 				m_freem(mrep);
896 				error = 0;
897 				waituntil = time_second + trylater_delay;
898 				while (time_second < waituntil) {
899 					kpause("nfstrylater", false, hz, NULL);
900 				}
901 				trylater_delay *= NFS_TRYLATERDELMUL;
902 				if (trylater_delay > NFS_TRYLATERDELMAX)
903 					trylater_delay = NFS_TRYLATERDELMAX;
904 				/*
905 				 * RFC1813:
906 				 * The client should wait and then try
907 				 * the request with a new RPC transaction ID.
908 				 */
909 				nfs_renewxid(rep);
910 				goto tryagain;
911 
912 			default:
913 #ifdef DIAGNOSTIC
914 				printf("Invalid rpc error code %d\n", error);
915 #endif
916 				error = EINVAL;
917 				break;
918 			}
919 
920 			if (nmp->nm_flag & NFSMNT_NFSV3) {
921 				*mrp = mrep;
922 				*mdp = md;
923 				*dposp = dpos;
924 				error |= NFSERR_RETERR;
925 			} else
926 				m_freem(mrep);
927 			goto nfsmout;
928 		}
929 
930 		/*
931 		 * note which credential worked to minimize number of retries.
932 		 */
933 		if (use_opencred)
934 			np->n_flag |= NUSEOPENCRED;
935 		else
936 			np->n_flag &= ~NUSEOPENCRED;
937 
938 		*mrp = mrep;
939 		*mdp = md;
940 		*dposp = dpos;
941 
942 		KASSERT(error == 0);
943 		goto nfsmout;
944 	}
945 	m_freem(mrep);
946 	error = EPROTONOSUPPORT;
947 nfsmout:
948 	KASSERT(kauth_cred_getrefcnt(acred) == 1);
949 	kauth_cred_free(acred);
950 	m_freem(rep->r_mreq);
951 	kmem_free(rep, sizeof(*rep));
952 	m_freem(mrest_backup);
953 	return (error);
954 }
955 
956 /*
957  * Lock a socket against others.
958  * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
959  * and also to avoid race conditions between the processes with nfs requests
960  * in progress when a reconnect is necessary.
961  */
962 static int
963 nfs_sndlock(struct nfsmount *nmp, struct nfsreq *rep)
964 {
965 	struct lwp *l;
966 	int timeo = 0;
967 	bool catch_p = false;
968 	int error = 0;
969 
970 	if (rep) {
971 		l = rep->r_lwp;
972 		if (rep->r_nmp->nm_flag & NFSMNT_INT)
973 			catch_p = true;
974 	} else
975 		l = NULL;
976 	mutex_enter(&nmp->nm_lock);
977 	while ((nmp->nm_iflag & NFSMNT_SNDLOCK) != 0) {
978 		if (rep && nfs_sigintr(rep->r_nmp, rep, l)) {
979 			error = EINTR;
980 			goto quit;
981 		}
982 		if (catch_p) {
983 			cv_timedwait_sig(&nmp->nm_sndcv, &nmp->nm_lock, timeo);
984 		} else {
985 			cv_timedwait(&nmp->nm_sndcv, &nmp->nm_lock, timeo);
986 		}
987 		if (catch_p) {
988 			catch_p = false;
989 			timeo = 2 * hz;
990 		}
991 	}
992 	nmp->nm_iflag |= NFSMNT_SNDLOCK;
993 quit:
994 	mutex_exit(&nmp->nm_lock);
995 	return error;
996 }
997 
998 /*
999  * Unlock the stream socket for others.
1000  */
1001 static void
1002 nfs_sndunlock(struct nfsmount *nmp)
1003 {
1004 
1005 	mutex_enter(&nmp->nm_lock);
1006 	if ((nmp->nm_iflag & NFSMNT_SNDLOCK) == 0)
1007 		panic("nfs sndunlock");
1008 	nmp->nm_iflag &= ~NFSMNT_SNDLOCK;
1009 	cv_signal(&nmp->nm_sndcv);
1010 	mutex_exit(&nmp->nm_lock);
1011 }
1012