xref: /csrg-svn/sys/nfs/nfs_socket.c (revision 40117)
138414Smckusick /*
238414Smckusick  * Copyright (c) 1989 The Regents of the University of California.
338414Smckusick  * All rights reserved.
438414Smckusick  *
538414Smckusick  * This code is derived from software contributed to Berkeley by
638414Smckusick  * Rick Macklem at The University of Guelph.
738414Smckusick  *
838414Smckusick  * Redistribution and use in source and binary forms are permitted
938414Smckusick  * provided that the above copyright notice and this paragraph are
1038414Smckusick  * duplicated in all such forms and that any documentation,
1138414Smckusick  * advertising materials, and other materials related to such
1238414Smckusick  * distribution and use acknowledge that the software was developed
1338414Smckusick  * by the University of California, Berkeley.  The name of the
1438414Smckusick  * University may not be used to endorse or promote products derived
1538414Smckusick  * from this software without specific prior written permission.
1638414Smckusick  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
1738414Smckusick  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
1838414Smckusick  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
1938414Smckusick  *
20*40117Smckusick  *	@(#)nfs_socket.c	7.6 (Berkeley) 02/16/90
2138414Smckusick  */
2238414Smckusick 
/*
 * Socket operations for use by nfs (similar to uipc_socket.c, but never
 * with copies to/from a uio vector).
 * NB: For now, they only work for datagram sockets.
 * (Use on stream sockets would require some record boundary mark in the
 *  stream, as defined by "RPC: Remote Procedure Call Protocol
 *  Specification", RFC1057 Section 10, and different versions of send,
 *  receive and reply that do not assume an atomic protocol.)
 */
3338414Smckusick 
3438414Smckusick #include "types.h"
3538414Smckusick #include "param.h"
3638414Smckusick #include "uio.h"
3738414Smckusick #include "user.h"
38*40117Smckusick #include "proc.h"
39*40117Smckusick #include "signal.h"
4038414Smckusick #include "mount.h"
4138414Smckusick #include "kernel.h"
4238414Smckusick #include "malloc.h"
4338414Smckusick #include "mbuf.h"
4438414Smckusick #include "vnode.h"
4538414Smckusick #include "domain.h"
4638414Smckusick #include "protosw.h"
4738414Smckusick #include "socket.h"
4838414Smckusick #include "socketvar.h"
4938414Smckusick #include "rpcv2.h"
5038414Smckusick #include "nfsv2.h"
5138414Smckusick #include "nfs.h"
5238414Smckusick #include "xdr_subs.h"
5338414Smckusick #include "nfsm_subs.h"
5438414Smckusick #include "nfsmount.h"
5538414Smckusick 
56*40117Smckusick #include "syslog.h"
/* Log an NFS message about a host at LOG_ERR level. */
#define nfs_log(message, host)	log(LOG_ERR, message, host)

#define	TRUE	1

/*
 * Acquire the SB_LOCK flag on sockbuf sb.  While another holder has it,
 * set SB_WANT and sleep on &sb->sb_flags at priority PZERO-1.
 *
 * The body is wrapped in do { } while (0) so that "nfs_sblock(sb);" is
 * exactly one statement and stays correct as the body of an unbraced
 * if/else.
 */
#define nfs_sblock(sb) do { \
	while ((sb)->sb_flags & SB_LOCK) { \
		(sb)->sb_flags |= SB_WANT; \
		sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \
	} \
	(sb)->sb_flags |= SB_LOCK; \
} while (0)
69*40117Smckusick /*
70*40117Smckusick  * nfs_sbwait() is simply sbwait() but at a negative priority so that it
71*40117Smckusick  * can not be interrupted by a signal.
72*40117Smckusick  */
73*40117Smckusick nfs_sbwait(sb)
74*40117Smckusick 	struct sockbuf *sb;
75*40117Smckusick {
76*40117Smckusick 	sb->sb_flags |= SB_WAIT;
77*40117Smckusick 	sleep((caddr_t)&sb->sb_cc, PZERO-2);
78*40117Smckusick }
7938414Smckusick 
8038414Smckusick /*
8138414Smckusick  * External data, mostly RPC constants in XDR form
8238414Smckusick  */
8338414Smckusick extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
8438414Smckusick 	rpc_msgaccepted, rpc_call;
8538414Smckusick extern u_long nfs_prog, nfs_vers;
8638414Smckusick int	nfsrv_null(),
8738414Smckusick 	nfsrv_getattr(),
8838414Smckusick 	nfsrv_setattr(),
8938414Smckusick 	nfsrv_lookup(),
9038414Smckusick 	nfsrv_readlink(),
9138414Smckusick 	nfsrv_read(),
9238414Smckusick 	nfsrv_write(),
9338414Smckusick 	nfsrv_create(),
9438414Smckusick 	nfsrv_remove(),
9538414Smckusick 	nfsrv_rename(),
9638414Smckusick 	nfsrv_link(),
9738414Smckusick 	nfsrv_symlink(),
9838414Smckusick 	nfsrv_mkdir(),
9938414Smckusick 	nfsrv_rmdir(),
10038414Smckusick 	nfsrv_readdir(),
10138414Smckusick 	nfsrv_statfs(),
10238414Smckusick 	nfsrv_noop();
10338414Smckusick 
10438414Smckusick int (*nfsrv_procs[NFS_NPROCS])() = {
10538414Smckusick 	nfsrv_null,
10638414Smckusick 	nfsrv_getattr,
10738414Smckusick 	nfsrv_setattr,
10838414Smckusick 	nfsrv_noop,
10938414Smckusick 	nfsrv_lookup,
11038414Smckusick 	nfsrv_readlink,
11138414Smckusick 	nfsrv_read,
11238414Smckusick 	nfsrv_noop,
11338414Smckusick 	nfsrv_write,
11438414Smckusick 	nfsrv_create,
11538414Smckusick 	nfsrv_remove,
11638414Smckusick 	nfsrv_rename,
11738414Smckusick 	nfsrv_link,
11838414Smckusick 	nfsrv_symlink,
11938414Smckusick 	nfsrv_mkdir,
12038414Smckusick 	nfsrv_rmdir,
12138414Smckusick 	nfsrv_readdir,
12238414Smckusick 	nfsrv_statfs,
12338414Smckusick };
12438414Smckusick 
125*40117Smckusick struct nfshost *nfshosth;
126*40117Smckusick struct nfsreq nfsreqh;
127*40117Smckusick int nfsrexmtthresh = NFS_FISHY;
12838414Smckusick 
12938414Smckusick /*
130*40117Smckusick  * Initialize sockets and per-host congestion for a new NFS connection.
131*40117Smckusick  * We do not free the sockaddr if error.
13238414Smckusick  */
133*40117Smckusick nfs_connect(nmp, saddr)
134*40117Smckusick 	register struct nfsmount *nmp;
135*40117Smckusick 	struct mbuf *saddr;
136*40117Smckusick {
137*40117Smckusick 	int s, error, srvaddrlen;
138*40117Smckusick 	struct mbuf *m;
139*40117Smckusick 	register struct nfshost *nfshp;
140*40117Smckusick 
141*40117Smckusick 	nmp->nm_so = 0;
142*40117Smckusick 	if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family,
143*40117Smckusick 				&nmp->nm_so, SOCK_DGRAM, 0))
144*40117Smckusick 		goto bad;
145*40117Smckusick 
146*40117Smckusick 	/* Unix sockets do not provide a local bind for server reply */
147*40117Smckusick 	if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) {
148*40117Smckusick 		struct sockaddr *sa;
149*40117Smckusick 		static char client[] = "/tmp/.nfs/nfsclient##";
150*40117Smckusick 		static int serial;
151*40117Smckusick 		int firstserial;
152*40117Smckusick 		m = m_getclr(M_WAIT, MT_SONAME);
153*40117Smckusick 		if (m == NULL) {
154*40117Smckusick 			error = ENOBUFS;
155*40117Smckusick 			goto bad;
156*40117Smckusick 		}
157*40117Smckusick 		m->m_len = sizeof (client) + 2;
158*40117Smckusick 		sa = mtod(m, struct sockaddr *);
159*40117Smckusick 		sa->sa_family = AF_UNIX;
160*40117Smckusick #ifdef	MSG_TRUNC	/* Have sa_len to set? */
161*40117Smckusick 		sa->sa_len = m->m_len;
162*40117Smckusick #endif
163*40117Smckusick 		bcopy(client, sa->sa_data, sizeof(client));
164*40117Smckusick 		firstserial = serial;
165*40117Smckusick 		do {
166*40117Smckusick 			if (++serial >= 100) serial = 0;
167*40117Smckusick 			sa->sa_data[19] = (serial / 10) + '0';
168*40117Smckusick 			sa->sa_data[20] = (serial % 10) + '0';
169*40117Smckusick 			error = sobind(nmp->nm_so, m);
170*40117Smckusick 			if (firstserial == serial) break;
171*40117Smckusick 		} while (error == EADDRINUSE);
172*40117Smckusick 		m_freem(m);
173*40117Smckusick 		if (error)
174*40117Smckusick 			goto bad;
175*40117Smckusick 	}
176*40117Smckusick 
177*40117Smckusick 	if (error = soconnect(nmp->nm_so, saddr))
178*40117Smckusick 		goto bad;
179*40117Smckusick 	error = soreserve(nmp->nm_so,	/* get space ! */
180*40117Smckusick 				nmp->nm_wsize + 1024,		/* one out */
181*40117Smckusick 				(nmp->nm_rsize + 1024) * 4);	/* four in */
182*40117Smckusick 	if (error)
183*40117Smckusick 		goto bad;
184*40117Smckusick 
185*40117Smckusick 	/*
186*40117Smckusick 	 * Search mount list for existing server entry.
187*40117Smckusick 	 *
188*40117Smckusick 	 * Note, even though we have a sockaddr, it is not quite reliable
189*40117Smckusick 	 * enough to bcmp against. For instance, a sockaddr_in has a
190*40117Smckusick 	 * sin_zero field which is not reliably zeroed by user code (e.g.
191*40117Smckusick 	 * mount). So what we do as an attempt at transport independence
192*40117Smckusick 	 * is to get the peeraddr of our connected socket into a zeroed
193*40117Smckusick 	 * sockaddr. Then we cache that and compare against it. This is
194*40117Smckusick 	 * not exactly perfect. However it is not critical that it be, if
195*40117Smckusick 	 * we cannot match the sockaddr we will simply allocate a new nfshp
196*40117Smckusick 	 * per mount, which will disable the per-host congestion but
197*40117Smckusick 	 * everything else will work as normal.
198*40117Smckusick 	 */
199*40117Smckusick 	m = m_getclr(M_WAIT, MT_SONAME);
200*40117Smckusick 	if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR,
201*40117Smckusick 				(struct mbuf *)0, m, (struct mbuf *)0) == 0) {
202*40117Smckusick 		m_freem(saddr);
203*40117Smckusick 		saddr = m;
204*40117Smckusick 	} else
205*40117Smckusick 		m_freem(m);
206*40117Smckusick 	srvaddrlen = saddr->m_len;
207*40117Smckusick 
208*40117Smckusick 	s = splnet();
209*40117Smckusick 
210*40117Smckusick 	for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) {
211*40117Smckusick 		if (srvaddrlen != nfshp->nh_salen)
212*40117Smckusick 			continue;
213*40117Smckusick 		if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t),
214*40117Smckusick 				srvaddrlen))
215*40117Smckusick 			break;
216*40117Smckusick 	}
217*40117Smckusick 	if (nfshp)		/* Have an existing mount host */
218*40117Smckusick 		m_freem(saddr);
219*40117Smckusick 	else {
220*40117Smckusick 		MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK);
221*40117Smckusick 		bzero((caddr_t)nfshp, sizeof *nfshp);
222*40117Smckusick 		nfshp->nh_sockaddr = saddr;
223*40117Smckusick 		nfshp->nh_salen = srvaddrlen;
224*40117Smckusick 		/* Initialize other non-zero congestion variables */
225*40117Smckusick 		nfshp->nh_currto = NFS_TIMEO;
226*40117Smckusick 		nfshp->nh_window = 1;		    /* Initial send window */
227*40117Smckusick 		nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */
228*40117Smckusick 		if (nfshosth) nfshosth->nh_prev = nfshp;	/* Chain in */
229*40117Smckusick 		nfshp->nh_next = nfshosth;
230*40117Smckusick 		nfshosth = nfshp;
231*40117Smckusick 	}
232*40117Smckusick 	nfshp->nh_refcnt++;
233*40117Smckusick 	splx(s);
234*40117Smckusick 	nmp->nm_hostinfo = nfshp;
235*40117Smckusick 	if (nmp->nm_rto == NFS_TIMEO) {
236*40117Smckusick 		nmp->nm_rto = nfshp->nh_currto;
237*40117Smckusick 		nmp->nm_rttvar = nmp->nm_rto << 1;
238*40117Smckusick 	}
239*40117Smckusick 	return (0);
240*40117Smckusick 
241*40117Smckusick bad:
242*40117Smckusick 	if (nmp->nm_so) (void) soclose(nmp->nm_so);
243*40117Smckusick 	nmp->nm_so = 0;
244*40117Smckusick 	return (error);
245*40117Smckusick }
246*40117Smckusick 
247*40117Smckusick /*
248*40117Smckusick  * NFS disconnect. Clean up and unlink.
249*40117Smckusick  */
250*40117Smckusick nfs_disconnect(nmp)
251*40117Smckusick 	register struct nfsmount *nmp;
252*40117Smckusick {
253*40117Smckusick 	register struct nfshost *nfshp;
254*40117Smckusick 
255*40117Smckusick 	if (nmp->nm_so)
256*40117Smckusick 		soclose(nmp->nm_so);
257*40117Smckusick 	nmp->nm_so = 0;
258*40117Smckusick 	if (nfshp = nmp->nm_hostinfo) {
259*40117Smckusick 		int s = splnet();
260*40117Smckusick 		if (--nfshp->nh_refcnt <= 0) {
261*40117Smckusick 			if (nfshp->nh_next)
262*40117Smckusick 				nfshp->nh_next->nh_prev = nfshp->nh_prev;
263*40117Smckusick 			if (nfshp->nh_prev)
264*40117Smckusick 				nfshp->nh_prev->nh_next = nfshp->nh_next;
265*40117Smckusick 			else
266*40117Smckusick 				nfshosth = nfshp->nh_next;
267*40117Smckusick 			/* If unix family, remove the nfsclient from /tmp */
268*40117Smckusick 			if (mtod(nfshp->nh_sockaddr,
269*40117Smckusick 				struct sockaddr *)->sa_family == AF_UNIX) {
270*40117Smckusick 					/* Lookup sa_data, do VOP_REMOVE... */
271*40117Smckusick 			}
272*40117Smckusick 			m_freem(nfshp->nh_sockaddr);
273*40117Smckusick 			FREE(nfshp, M_NFSMNT);
274*40117Smckusick 		}
275*40117Smckusick 		nmp->nm_hostinfo = 0;
276*40117Smckusick 		splx(s);
277*40117Smckusick 	}
278*40117Smckusick }
279*40117Smckusick 
280*40117Smckusick /*
281*40117Smckusick  * This is a stripped down non-interruptible version of sosend().
282*40117Smckusick  */
283*40117Smckusick nfs_send(so, nam, top, flags, siz)
28438414Smckusick 	register struct socket *so;
28538414Smckusick 	struct mbuf *nam;
28638414Smckusick 	struct mbuf *top;
28738414Smckusick 	int flags;
28838414Smckusick 	int siz;
28938414Smckusick {
290*40117Smckusick 	int error, s;
29138414Smckusick 
29238414Smckusick #ifdef MGETHDR
29338414Smckusick 	top->m_pkthdr.len = siz;
29438414Smckusick #endif
295*40117Smckusick 	for (;;) {
296*40117Smckusick 		nfs_sblock(&so->so_snd);
297*40117Smckusick 		s = splnet();
298*40117Smckusick 		if (error = nfs_sockerr(so, 1)) {
299*40117Smckusick 			splx(s);
300*40117Smckusick 			m_freem(top);
301*40117Smckusick 			break;
302*40117Smckusick 		}
303*40117Smckusick 		if (sbspace(&so->so_snd) < siz) {
304*40117Smckusick 			sbunlock(&so->so_snd);
305*40117Smckusick 			nfs_sbwait(&so->so_snd);
306*40117Smckusick 			splx(s);
307*40117Smckusick 			continue;
308*40117Smckusick 		}
309*40117Smckusick 		error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top,
310*40117Smckusick 			(struct mbuf *)nam, (struct mbuf *)0, (struct mbuf *)0);
31138414Smckusick 		splx(s);
312*40117Smckusick 		break;
31338414Smckusick 	}
31438414Smckusick 	sbunlock(&so->so_snd);
31538414Smckusick 	return (error);
31638414Smckusick }
31738414Smckusick 
31838414Smckusick /*
319*40117Smckusick  * This is a stripped down datagram specific version of soreceive()
32038414Smckusick  */
321*40117Smckusick nfs_dgreceive(so, msk, mtch, aname, mp)
32238414Smckusick 	register struct socket *so;
32339754Smckusick 	u_long msk;
32439754Smckusick 	u_long mtch;
32538414Smckusick 	struct mbuf **aname;
32638414Smckusick 	struct mbuf **mp;
32738414Smckusick {
32838414Smckusick 	register struct mbuf *m;
32938414Smckusick 	int s, error = 0;
33038414Smckusick 	struct mbuf *nextrecord;
33138414Smckusick 
33238414Smckusick 	if (aname)
33338414Smckusick 		*aname = 0;
33438414Smckusick 
335*40117Smckusick 	for (;;) {
336*40117Smckusick 		sblock(&so->so_rcv);
337*40117Smckusick 		s = splnet();
33838414Smckusick 
339*40117Smckusick 		if (so->so_rcv.sb_cc == 0) {
340*40117Smckusick 			if (error = nfs_sockerr(so, 0)) {
341*40117Smckusick 				so->so_error = 0;
342*40117Smckusick 				break;
343*40117Smckusick 			}
34439754Smckusick 			sbunlock(&so->so_rcv);
345*40117Smckusick 			sbwait(&so->so_rcv);
34639754Smckusick 			splx(s);
347*40117Smckusick 			continue;
34839754Smckusick 		}
34938414Smckusick 		m = so->so_rcv.sb_mb;
350*40117Smckusick 		if (m == 0)
351*40117Smckusick 			panic("nfs_dgreceive 1");
352*40117Smckusick 		nextrecord = m->m_nextpkt;
353*40117Smckusick 		/* Save sender's address */
354*40117Smckusick 		if (m->m_type != MT_SONAME)
355*40117Smckusick 			panic("nfs_dgreceive 1a");
35638414Smckusick 		sbfree(&so->so_rcv, m);
357*40117Smckusick 		if (aname) {
358*40117Smckusick 			*aname = m;
359*40117Smckusick 			so->so_rcv.sb_mb = m->m_next;
360*40117Smckusick 			m->m_next = 0;
361*40117Smckusick 			m = so->so_rcv.sb_mb;
362*40117Smckusick 		} else {
363*40117Smckusick 			MFREE(m, so->so_rcv.sb_mb);
364*40117Smckusick 			m = so->so_rcv.sb_mb;
365*40117Smckusick 		}
366*40117Smckusick 		/* Drop control mbuf's */
367*40117Smckusick 		if (m && m->m_type == MT_RIGHTS)
368*40117Smckusick 			panic("nfs_dgreceive 2");
369*40117Smckusick 		if (m && m->m_type == MT_CONTROL) {
370*40117Smckusick 			sbfree(&so->so_rcv, m);
371*40117Smckusick 			MFREE(m, so->so_rcv.sb_mb);
372*40117Smckusick 			m = so->so_rcv.sb_mb;
373*40117Smckusick 		}
374*40117Smckusick 		/* Dequeue packet from sockbuf */
375*40117Smckusick 		*mp = m;
376*40117Smckusick 		while (m) {
377*40117Smckusick 			if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
378*40117Smckusick 				panic("nfs_dgreceive 3");
379*40117Smckusick 			sbfree(&so->so_rcv, m);
380*40117Smckusick 			m = so->so_rcv.sb_mb = m->m_next;
381*40117Smckusick 		}
382*40117Smckusick 		so->so_rcv.sb_mb = nextrecord;
383*40117Smckusick 		/* Return */
384*40117Smckusick 		break;
38538414Smckusick 	}
38638414Smckusick 	sbunlock(&so->so_rcv);
38738414Smckusick 	splx(s);
38838414Smckusick 	return (error);
38938414Smckusick }
39038414Smckusick 
/* Leading two words copied out of a received datagram by nfs_dgreply(). */
struct rpc_replyhead {
	u_long	r_xid;		/* compared with each request's r_xid */
	u_long	r_rep;		/* compared with rpc_reply */
};
39538414Smckusick 
39638414Smckusick /*
397*40117Smckusick  * Implement NFS client side datagram receive.
39838414Smckusick  * We depend on the way that records are added to the sockbuf
39938414Smckusick  * by sbappend*.  In particular, each record (mbufs linked through m_next)
40038414Smckusick  * must begin with an address, followed by optional MT_CONTROL mbuf
40138414Smckusick  * and then zero or more mbufs of data.
40238414Smckusick  * We must search through the list of received datagrams matching them
40338414Smckusick  * with outstanding requests using the xid, until ours is found.
40438414Smckusick  */
405*40117Smckusick nfs_dgreply(so, mntp, myrep)
40638414Smckusick 	register struct socket *so;
40738414Smckusick 	struct nfsmount *mntp;
40839344Smckusick 	struct nfsreq *myrep;
40938414Smckusick {
41038414Smckusick 	register struct mbuf *m;
41138414Smckusick 	register struct nfsreq *rep;
41238414Smckusick 	register int error = 0, s;
413*40117Smckusick 	int logged = 0;
41438414Smckusick 	struct mbuf *nextrecord;
41538414Smckusick 	struct rpc_replyhead replyh;
41638414Smckusick 
41738414Smckusick restart:
41839344Smckusick 	nfs_sblock(&so->so_rcv);
419*40117Smckusick 	s = splnet();
420*40117Smckusick 	/* Already received and queued for us, bye bye */
42139344Smckusick 	if (myrep->r_mrep != NULL) {
422*40117Smckusick 		error = 0;
423*40117Smckusick 		goto release;
42439344Smckusick 	}
425*40117Smckusick 	/* If we have run out of retries (hard mounts have bogus count) */
426*40117Smckusick 	if (myrep->r_rexmit > myrep->r_retry) {
427*40117Smckusick 		error = ETIMEDOUT;
428*40117Smckusick 		nfsstats.rpctimeouts++;
429*40117Smckusick giveup:
430*40117Smckusick 		if (myrep->r_flags & R_TIMING) {
431*40117Smckusick 			myrep->r_flags &= ~R_TIMING;
432*40117Smckusick 			mntp->nm_rtt = -1;
433*40117Smckusick 		}
434*40117Smckusick 		if (myrep->r_flags & R_SENT) {
435*40117Smckusick 			myrep->r_flags &= ~R_SENT;
436*40117Smckusick 			--mntp->nm_hostinfo->nh_sent;
437*40117Smckusick 			/* If count now 0, want to initiate new req */
438*40117Smckusick 		}
439*40117Smckusick 		goto release;
44039344Smckusick 	}
44138414Smckusick 
44239344Smckusick 	m = so->so_rcv.sb_mb;
44339344Smckusick 	if (m == 0) {
44439344Smckusick 		if (so->so_rcv.sb_cc)
44539344Smckusick 			panic("nfs_soreply 1");
446*40117Smckusick 		if (error = nfs_sockerr(so, 0)) {
44738414Smckusick 			so->so_error = 0;
448*40117Smckusick 			goto giveup;
44938414Smckusick 		}
450*40117Smckusick 		/* Allow signals to interrupt request? (nfs_timer wakes up) */
451*40117Smckusick 		if ((mntp->nm_flag & NFSMNT_INT) &&
452*40117Smckusick 		    u.u_procp->p_sig & ~u.u_procp->p_sigmask) {
453*40117Smckusick 			error = EINTR;
454*40117Smckusick 			goto giveup;
455*40117Smckusick 		}
456*40117Smckusick 		if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0)
457*40117Smckusick 			uprintf("NFS server %s not responding, retrying\n",
458*40117Smckusick 				mntp->nm_host);
45938414Smckusick 		sbunlock(&so->so_rcv);
46038414Smckusick 		nfs_sbwait(&so->so_rcv);
46138414Smckusick 		splx(s);
46238414Smckusick 		goto restart;
46338414Smckusick 	}
46438414Smckusick 
46538414Smckusick 	/*
46638414Smckusick 	 * Take off the address, check for rights and ditch any control
46738414Smckusick 	 * mbufs.
46838414Smckusick 	 */
469*40117Smckusick 	nextrecord = m->m_nextpkt;
47038414Smckusick 	if (m->m_type != MT_SONAME)
47138414Smckusick 		panic("nfs reply SONAME");
47238414Smckusick 	sbfree(&so->so_rcv, m);
47338414Smckusick 	MFREE(m, so->so_rcv.sb_mb);
47438414Smckusick 	m = so->so_rcv.sb_mb;
47538414Smckusick 	if (m && m->m_type == MT_RIGHTS)
47638414Smckusick 		panic("nfs reply RIGHTS");
47738414Smckusick 	if (m && m->m_type == MT_CONTROL) {
47838414Smckusick 		sbfree(&so->so_rcv, m);
47938414Smckusick 		MFREE(m, so->so_rcv.sb_mb);
48038414Smckusick 		m = so->so_rcv.sb_mb;
48138414Smckusick 	}
48239344Smckusick 	if (m) {
48338414Smckusick 		m->m_nextpkt = nextrecord;
48439344Smckusick 	} else {
48539344Smckusick 		so->so_rcv.sb_mb = nextrecord;
48638414Smckusick 		sbunlock(&so->so_rcv);
48738414Smckusick 		splx(s);
48838414Smckusick 		goto restart;
48938414Smckusick 	}
49038414Smckusick 
49138414Smckusick 	/*
49238414Smckusick 	 * Get the xid and check that it is an rpc reply
49338414Smckusick 	 */
494*40117Smckusick 	if (m->m_len >= sizeof replyh)
495*40117Smckusick 		bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh);
49638414Smckusick 	else {
497*40117Smckusick 		struct mbuf *mp = m;
498*40117Smckusick 		caddr_t cp = (caddr_t)&replyh;
499*40117Smckusick 		int cnt = sizeof replyh;
500*40117Smckusick 		do {
50138414Smckusick 			if (mp->m_len > 0) {
502*40117Smckusick 				int xfer = (mp->m_len >= cnt) ? cnt : mp->m_len;
50338414Smckusick 				bcopy(mtod(mp, caddr_t), cp, xfer);
50438414Smckusick 				cnt -= xfer;
50538414Smckusick 				cp += xfer;
50638414Smckusick 			}
50738414Smckusick 			if (cnt > 0)
50838414Smckusick 				mp = mp->m_next;
509*40117Smckusick 		} while (mp && cnt > 0);
510*40117Smckusick 		if (mp == NULL) {		/* Insufficient length */
511*40117Smckusick 			nfsstats.rpcinvalid++;
512*40117Smckusick 			goto dropit;
51338414Smckusick 		}
51438414Smckusick 	}
515*40117Smckusick 	if (replyh.r_rep != rpc_reply) {	/* Not a reply */
516*40117Smckusick 		nfsstats.rpcinvalid++;
51738414Smckusick 		goto dropit;
518*40117Smckusick 	}
51938414Smckusick 	/*
52038414Smckusick 	 * Loop through the request list to match up the reply
521*40117Smckusick 	 * If no match, just drop the datagram
52238414Smckusick 	 */
523*40117Smckusick 	if (rep = nfsreqh.r_next) {
524*40117Smckusick 	    while (rep != &nfsreqh) {
525*40117Smckusick 		/* The socket, being connected, will only queue matches */
526*40117Smckusick 		if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) {
52738414Smckusick 			/* Found it.. */
528*40117Smckusick 			if (rep->r_mrep)	/* Already there - duplicate */
529*40117Smckusick 				break;
53038414Smckusick 			rep->r_mrep = m;
53138414Smckusick 			while (m) {
53238414Smckusick 				if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
53338414Smckusick 					panic("nfs_soreply 3");
53438414Smckusick 				sbfree(&so->so_rcv, m);
53538414Smckusick 				m = so->so_rcv.sb_mb = m->m_next;
53638414Smckusick 			}
53738414Smckusick 			so->so_rcv.sb_mb = nextrecord;
538*40117Smckusick 			if (rep->r_flags & R_TIMING) {
539*40117Smckusick 				nfs_updatetimer(mntp);
540*40117Smckusick 				rep->r_flags &= ~R_TIMING;
541*40117Smckusick 				mntp->nm_rtt = -1;	/* re-arm timer */
542*40117Smckusick 			}
543*40117Smckusick 			if (rep->r_flags & R_SENT) {
544*40117Smckusick 				rep->r_flags &= ~R_SENT;
545*40117Smckusick 				--mntp->nm_hostinfo->nh_sent;
546*40117Smckusick 				/* If count now 0, want to initiate new req */
547*40117Smckusick 			}
548*40117Smckusick 			if (rep == myrep) {		/* This is success */
549*40117Smckusick 				if (logged)
550*40117Smckusick 					uprintf("NFS server %s responded\n",
551*40117Smckusick 						mntp->nm_host);
55238414Smckusick 				goto release;
553*40117Smckusick 			}
554*40117Smckusick 			/* Else wake up other sleeper and wait for next */
555*40117Smckusick 			sbunlock(&so->so_rcv);
556*40117Smckusick 			sorwakeup(so);
557*40117Smckusick 			splx(s);
558*40117Smckusick 			goto restart;
55938414Smckusick 		}
56038414Smckusick 		rep = rep->r_next;
561*40117Smckusick 	    }
56238414Smckusick 	}
563*40117Smckusick 	/* If not matched to request, drop it */
564*40117Smckusick 	nfsstats.rpcunexpected++;
56538414Smckusick dropit:
566*40117Smckusick 	sbdroprecord(&so->so_rcv);
56738414Smckusick 	sbunlock(&so->so_rcv);
56838414Smckusick 	splx(s);
56938414Smckusick 	goto restart;
570*40117Smckusick 
57138414Smckusick release:
57238414Smckusick 	sbunlock(&so->so_rcv);
57338414Smckusick 	splx(s);
57438414Smckusick 	return (error);
57538414Smckusick }
57638414Smckusick 
57738414Smckusick /*
57838414Smckusick  * nfs_request - goes something like this
57938414Smckusick  *	- fill in request struct
58038414Smckusick  *	- links it into list
58138414Smckusick  *	- calls nfs_sosend() for first transmit
58238414Smckusick  *	- calls nfs_soreceive() to get reply
58338414Smckusick  *	- break down rpc header and return with nfs reply pointed to
58438414Smckusick  *	  by mrep or error
58538414Smckusick  * nb: always frees up mreq mbuf list
58638414Smckusick  */
587*40117Smckusick nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp)
58838414Smckusick 	struct vnode *vp;
58938414Smckusick 	struct mbuf *mreq;
59038414Smckusick 	u_long xid;
591*40117Smckusick 	int idem;
59238414Smckusick 	struct mount *mp;
59338414Smckusick 	struct mbuf **mrp;
59438414Smckusick 	struct mbuf **mdp;
59538414Smckusick 	caddr_t *dposp;
59638414Smckusick {
59738414Smckusick 	register struct mbuf *m, *mrep;
59838414Smckusick 	register struct nfsreq *rep;
59938414Smckusick 	register u_long *p;
60038414Smckusick 	register int len;
60138414Smckusick 	struct nfsmount *mntp;
60238414Smckusick 	struct mbuf *md;
60339344Smckusick 	struct nfsreq *reph;
60438414Smckusick 	caddr_t dpos;
60538414Smckusick 	char *cp2;
60638414Smckusick 	int t1;
60738414Smckusick 	int s;
60838414Smckusick 	int error;
60938414Smckusick 
61038414Smckusick 	mntp = vfs_to_nfs(mp);
61138414Smckusick 	m = mreq;
61238414Smckusick 	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
61338414Smckusick 	rep->r_xid = xid;
61438414Smckusick 	rep->r_mntp = mntp;
61538414Smckusick 	rep->r_vp = vp;
61638414Smckusick 	if (mntp->nm_flag & NFSMNT_SOFT)
617*40117Smckusick 		rep->r_retry = mntp->nm_retry;
61838414Smckusick 	else
619*40117Smckusick 		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
620*40117Smckusick 	rep->r_flags = rep->r_rexmit = 0;
621*40117Smckusick 	/* Idempotency: add N * MINTIMEO to requests if not, else use 0 */
622*40117Smckusick 	rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO);
62338414Smckusick 	rep->r_mrep = NULL;
62438414Smckusick 	rep->r_mreq = m;
62538414Smckusick 	len = 0;
62638414Smckusick 	while (m) {
62738414Smckusick 		len += m->m_len;
62838414Smckusick 		m = m->m_next;
62938414Smckusick 	}
63038414Smckusick 	rep->r_msiz = len;
63138414Smckusick 
632*40117Smckusick 	/*
633*40117Smckusick 	 * Do the client side RPC.
634*40117Smckusick 	 */
635*40117Smckusick 	nfsstats.rpcrequests++;
636*40117Smckusick 	s = splnet();
637*40117Smckusick 	/* Chain request into list of outstanding requests. Be sure
638*40117Smckusick 	 * to put it LAST so timer finds oldest requests first. */
63939344Smckusick 	reph = &nfsreqh;
64039344Smckusick 	if (reph->r_prev == NULL) {
64139344Smckusick 		reph->r_next = rep;
64239344Smckusick 		rep->r_prev = reph;
64339344Smckusick 	} else {
64439344Smckusick 		reph->r_prev->r_next = rep;
64539344Smckusick 		rep->r_prev = reph->r_prev;
64639344Smckusick 	}
64739344Smckusick 	reph->r_prev = rep;
64839344Smckusick 	rep->r_next = reph;
649*40117Smckusick 	/*
650*40117Smckusick 	 * If backing off another request or avoiding congestion, don't
651*40117Smckusick 	 * send this one now but let timer do it. If not timing a request,
652*40117Smckusick 	 * do it now.
653*40117Smckusick 	 */
654*40117Smckusick 	if (mntp->nm_hostinfo->nh_sent > 0 &&
655*40117Smckusick 	    (mntp->nm_hostinfo->nh_currexmit != 0 ||
656*40117Smckusick 	     mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) {
657*40117Smckusick 		splx(s);
658*40117Smckusick 		goto skipsend;
659*40117Smckusick 	}
660*40117Smckusick 	++mntp->nm_hostinfo->nh_sent;	/* Inconsistent if can't NFSMCOPY */
661*40117Smckusick 	rep->r_flags |= R_SENT;		/* But not a catastrophe */
662*40117Smckusick 	if (mntp->nm_rtt == -1) {
663*40117Smckusick 		mntp->nm_rtt = 0;
664*40117Smckusick 		rep->r_flags |= R_TIMING;
665*40117Smckusick 	}
66638414Smckusick 	splx(s);
66738414Smckusick 
66838414Smckusick 	/*
669*40117Smckusick 	 * If we can get a packet to send, send it off...
67038414Smckusick 	 * otherwise the timer will retransmit later
67138414Smckusick 	 */
672*40117Smckusick 	m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT);
67338414Smckusick 	if (m != NULL)
674*40117Smckusick 		(void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len);
675*40117Smckusick 	/*
676*40117Smckusick 	 * Wait for the reply from our send or the timer's.
677*40117Smckusick 	 */
678*40117Smckusick skipsend:
679*40117Smckusick 	error = nfs_dgreply(mntp->nm_so, mntp, rep);
68038414Smckusick 
681*40117Smckusick 	/*
682*40117Smckusick 	 * RPC done, unlink the request.
683*40117Smckusick 	 */
68438414Smckusick 	s = splnet();
68538414Smckusick 	rep->r_prev->r_next = rep->r_next;
68639344Smckusick 	rep->r_next->r_prev = rep->r_prev;
68738414Smckusick 	splx(s);
68838414Smckusick 	m_freem(rep->r_mreq);
68938414Smckusick 	mrep = md = rep->r_mrep;
69038414Smckusick 	FREE((caddr_t)rep, M_NFSREQ);
69138414Smckusick 	if (error)
69238414Smckusick 		return (error);
69338414Smckusick 
69438414Smckusick 	/*
69538414Smckusick 	 * break down the rpc header and check if ok
69638414Smckusick 	 */
69738414Smckusick 	dpos = mtod(md, caddr_t);
69838414Smckusick 	nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED);
69938414Smckusick 	p += 2;
70038414Smckusick 	if (*p++ == rpc_msgdenied) {
70138414Smckusick 		if (*p == rpc_mismatch)
70238414Smckusick 			error = EOPNOTSUPP;
70338414Smckusick 		else
70438414Smckusick 			error = EACCES;
70538414Smckusick 		m_freem(mrep);
70638414Smckusick 		return (error);
70738414Smckusick 	}
70838414Smckusick 	/*
70938414Smckusick 	 * skip over the auth_verf, someday we may want to cache auth_short's
71038414Smckusick 	 * for nfs_reqhead(), but for now just dump it
71138414Smckusick 	 */
71238414Smckusick 	if (*++p != 0) {
71338414Smckusick 		len = nfsm_rndup(fxdr_unsigned(long, *p));
71438414Smckusick 		nfsm_adv(len);
71538414Smckusick 	}
71638414Smckusick 	nfsm_disect(p, u_long *, NFSX_UNSIGNED);
71738414Smckusick 	/* 0 == ok */
71838414Smckusick 	if (*p == 0) {
71938414Smckusick 		nfsm_disect(p, u_long *, NFSX_UNSIGNED);
72038414Smckusick 		if (*p != 0) {
72138414Smckusick 			error = fxdr_unsigned(int, *p);
72238414Smckusick 			m_freem(mrep);
72338414Smckusick 			return (error);
72438414Smckusick 		}
72538414Smckusick 		*mrp = mrep;
72638414Smckusick 		*mdp = md;
72738414Smckusick 		*dposp = dpos;
72838414Smckusick 		return (0);
72938414Smckusick 	}
73038414Smckusick 	m_freem(mrep);
73138414Smckusick 	return (EPROTONOSUPPORT);
73238414Smckusick nfsmout:
73338414Smckusick 	return (error);
73438414Smckusick }
73538414Smckusick 
73638414Smckusick /*
73738414Smckusick  * Get a request for the server main loop
73838414Smckusick  * - receive a request via. nfs_soreceive()
73938414Smckusick  * - verify it
74038414Smckusick  * - fill in the cred struct.
74138414Smckusick  */
74239754Smckusick nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr,
74339754Smckusick 	   msk, mtch)
74438414Smckusick 	struct socket *so;
74538414Smckusick 	u_long prog;
74638414Smckusick 	u_long vers;
74738414Smckusick 	int maxproc;
74838414Smckusick 	struct mbuf **nam;
74938414Smckusick 	struct mbuf **mrp;
75038414Smckusick 	struct mbuf **mdp;
75138414Smckusick 	caddr_t *dposp;
75238414Smckusick 	u_long *retxid;
75338414Smckusick 	u_long *proc;
75438414Smckusick 	register struct ucred *cr;
75539754Smckusick 	u_long msk;
75639754Smckusick 	u_long mtch;
75738414Smckusick {
75838414Smckusick 	register int i;
75939494Smckusick 	register u_long *p;
76039494Smckusick 	register long t1;
76139494Smckusick 	caddr_t dpos, cp2;
76239494Smckusick 	int error = 0;
76339494Smckusick 	struct mbuf *mrep, *md;
76439494Smckusick 	int len;
76538414Smckusick 
766*40117Smckusick 	if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep))
76738414Smckusick 		return (error);
76838414Smckusick 	md = mrep;
76938414Smckusick 	dpos = mtod(mrep, caddr_t);
77038414Smckusick 	nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED);
77138414Smckusick 	*retxid = *p++;
77238414Smckusick 	if (*p++ != rpc_call) {
77338414Smckusick 		m_freem(mrep);
77438414Smckusick 		return (ERPCMISMATCH);
77538414Smckusick 	}
77638414Smckusick 	if (*p++ != rpc_vers) {
77738414Smckusick 		m_freem(mrep);
77838414Smckusick 		return (ERPCMISMATCH);
77938414Smckusick 	}
78038414Smckusick 	if (*p++ != prog) {
78138414Smckusick 		m_freem(mrep);
78238414Smckusick 		return (EPROGUNAVAIL);
78338414Smckusick 	}
78438414Smckusick 	if (*p++ != vers) {
78538414Smckusick 		m_freem(mrep);
78638414Smckusick 		return (EPROGMISMATCH);
78738414Smckusick 	}
78838414Smckusick 	*proc = fxdr_unsigned(u_long, *p++);
78938414Smckusick 	if (*proc == NFSPROC_NULL) {
79038414Smckusick 		*mrp = mrep;
79138414Smckusick 		return (0);
79238414Smckusick 	}
79338414Smckusick 	if (*proc > maxproc || *p++ != rpc_auth_unix) {
79438414Smckusick 		m_freem(mrep);
79538414Smckusick 		return (EPROCUNAVAIL);
79638414Smckusick 	}
79739494Smckusick 	(void) fxdr_unsigned(int, *p++);
79839494Smckusick 	len = fxdr_unsigned(int, *++p);
79939494Smckusick 	nfsm_adv(nfsm_rndup(len));
80038414Smckusick 	nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED);
80138414Smckusick 	cr->cr_uid = fxdr_unsigned(uid_t, *p++);
80238414Smckusick 	cr->cr_gid = fxdr_unsigned(gid_t, *p++);
80339494Smckusick 	len = fxdr_unsigned(int, *p);
80439494Smckusick 	if (len > 10) {
80538414Smckusick 		m_freem(mrep);
80638414Smckusick 		return (EBADRPC);
80738414Smckusick 	}
80839494Smckusick 	nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED);
80939494Smckusick 	for (i = 1; i <= len; i++)
81038414Smckusick 		cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++);
81139494Smckusick 	cr->cr_ngroups = len + 1;
81238414Smckusick 	/*
81338414Smckusick 	 * Do we have any use for the verifier.
81438414Smckusick 	 * According to the "Remote Procedure Call Protocol Spec." it
81538414Smckusick 	 * should be AUTH_NULL, but some clients make it AUTH_UNIX?
81638414Smckusick 	 * For now, just skip over it
81738414Smckusick 	 */
81839494Smckusick 	len = fxdr_unsigned(int, *++p);
81939494Smckusick 	if (len > 0)
82039494Smckusick 		nfsm_adv(nfsm_rndup(len));
82138414Smckusick 	*mrp = mrep;
82238414Smckusick 	*mdp = md;
82338414Smckusick 	*dposp = dpos;
82438414Smckusick 	return (0);
82538414Smckusick nfsmout:
82638414Smckusick 	return (error);
82738414Smckusick }
82838414Smckusick 
82938414Smckusick /*
83038414Smckusick  * Generate the rpc reply header
83138414Smckusick  * siz arg. is used to decide if adding a cluster is worthwhile
83238414Smckusick  */
83338414Smckusick nfs_rephead(siz, retxid, err, mrq, mbp, bposp)
83438414Smckusick 	int siz;
83538414Smckusick 	u_long retxid;
83638414Smckusick 	int err;
83738414Smckusick 	struct mbuf **mrq;
83838414Smckusick 	struct mbuf **mbp;
83938414Smckusick 	caddr_t *bposp;
84038414Smckusick {
84139494Smckusick 	register u_long *p;
84239494Smckusick 	register long t1;
84339494Smckusick 	caddr_t bpos;
84439494Smckusick 	struct mbuf *mreq, *mb, *mb2;
84538414Smckusick 
84638414Smckusick 	NFSMGETHDR(mreq);
84738414Smckusick 	mb = mreq;
84838414Smckusick 	if ((siz+RPC_REPLYSIZ) > MHLEN)
84938414Smckusick 		NFSMCLGET(mreq, M_WAIT);
85038414Smckusick 	p = mtod(mreq, u_long *);
85138414Smckusick 	mreq->m_len = 6*NFSX_UNSIGNED;
85238414Smckusick 	bpos = ((caddr_t)p)+mreq->m_len;
85338414Smckusick 	*p++ = retxid;
85438414Smckusick 	*p++ = rpc_reply;
85538414Smckusick 	if (err == ERPCMISMATCH) {
85638414Smckusick 		*p++ = rpc_msgdenied;
85738414Smckusick 		*p++ = rpc_mismatch;
85838414Smckusick 		*p++ = txdr_unsigned(2);
85938414Smckusick 		*p = txdr_unsigned(2);
86038414Smckusick 	} else {
86138414Smckusick 		*p++ = rpc_msgaccepted;
86238414Smckusick 		*p++ = 0;
86338414Smckusick 		*p++ = 0;
86438414Smckusick 		switch (err) {
86538414Smckusick 		case EPROGUNAVAIL:
86638414Smckusick 			*p = txdr_unsigned(RPC_PROGUNAVAIL);
86738414Smckusick 			break;
86838414Smckusick 		case EPROGMISMATCH:
86938414Smckusick 			*p = txdr_unsigned(RPC_PROGMISMATCH);
87038414Smckusick 			nfsm_build(p, u_long *, 2*NFSX_UNSIGNED);
87138414Smckusick 			*p++ = txdr_unsigned(2);
87238414Smckusick 			*p = txdr_unsigned(2);	/* someday 3 */
87338414Smckusick 			break;
87438414Smckusick 		case EPROCUNAVAIL:
87538414Smckusick 			*p = txdr_unsigned(RPC_PROCUNAVAIL);
87638414Smckusick 			break;
87738414Smckusick 		default:
87838414Smckusick 			*p = 0;
87938414Smckusick 			if (err != VNOVAL) {
88038414Smckusick 				nfsm_build(p, u_long *, NFSX_UNSIGNED);
88138414Smckusick 				*p = txdr_unsigned(err);
88238414Smckusick 			}
88338414Smckusick 			break;
88438414Smckusick 		};
88538414Smckusick 	}
88638414Smckusick 	*mrq = mreq;
88738414Smckusick 	*mbp = mb;
88838414Smckusick 	*bposp = bpos;
88938414Smckusick 	if (err != 0 && err != VNOVAL)
89038414Smckusick 		nfsstats.srvrpc_errs++;
89138414Smckusick 	return (0);
89238414Smckusick }
89338414Smckusick 
89438414Smckusick /*
89538414Smckusick  * Nfs timer routine
89638414Smckusick  * Scan the nfsreq list and retranmit any requests that have timed out
89738414Smckusick  * To avoid retransmission attempts on STREAM sockets (in the future) make
898*40117Smckusick  * sure to set the r_retry field to 0 (implies nm_retry == 0).
89938414Smckusick  */
90038414Smckusick nfs_timer()
90138414Smckusick {
90238414Smckusick 	register struct nfsreq *rep;
90338414Smckusick 	register struct mbuf *m;
90438414Smckusick 	register struct socket *so;
905*40117Smckusick 	register struct nfsmount *mntp;
906*40117Smckusick 	int s, error;
90738414Smckusick 
90838414Smckusick 	s = splnet();
90938414Smckusick 	rep = nfsreqh.r_next;
910*40117Smckusick 	if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) {
911*40117Smckusick 		mntp = rep->r_mntp;
912*40117Smckusick 		if (rep->r_flags & R_TIMING)	/* update rtt in mount */
913*40117Smckusick 			mntp->nm_rtt++;
914*40117Smckusick 		/* If not timed out or reply already received, skip */
915*40117Smckusick 		if (++rep->r_timer < mntp->nm_rto || rep->r_mrep)
916*40117Smckusick 			continue;
917*40117Smckusick 		/* Do backoff and save new timeout in mount */
918*40117Smckusick 		if (rep->r_flags & R_TIMING) {
919*40117Smckusick 			nfs_backofftimer(mntp);
920*40117Smckusick 			rep->r_flags &= ~R_TIMING;
921*40117Smckusick 			mntp->nm_rtt = -1;
922*40117Smckusick 		}
923*40117Smckusick 		if (rep->r_flags & R_SENT) {
924*40117Smckusick 			rep->r_flags &= ~R_SENT;
925*40117Smckusick 			--mntp->nm_hostinfo->nh_sent;
926*40117Smckusick 		}
927*40117Smckusick 		/* Check state of socket, cf nfs_send */
928*40117Smckusick 		so = mntp->nm_so;
929*40117Smckusick 		if (error = nfs_sockerr(so, 1))
930*40117Smckusick 			goto wakeup;
931*40117Smckusick 		if (sbspace(&so->so_snd) < rep->r_msiz)
932*40117Smckusick 			goto wakeup;
933*40117Smckusick 		/* Check for too many retries, cf nfs_dgreply */
934*40117Smckusick 		if (++rep->r_rexmit > NFS_MAXREXMIT)	/* clip */
935*40117Smckusick 			rep->r_rexmit = NFS_MAXREXMIT;
936*40117Smckusick 		if (rep->r_rexmit > rep->r_retry)	/* too many */
937*40117Smckusick 			goto wakeup;
938*40117Smckusick 		/* Check for congestion control, cf nfs_request */
939*40117Smckusick 		if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)
940*40117Smckusick 			goto wakeup;
941*40117Smckusick 		/* Send it! */
942*40117Smckusick 		m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT);
943*40117Smckusick 		if (m == NULL)
944*40117Smckusick 			goto wakeup;
945*40117Smckusick 		nfsstats.rpcretries++;
94638414Smckusick #ifdef MGETHDR
947*40117Smckusick 		m->m_pkthdr.len = rep->r_msiz;
94838414Smckusick #endif
949*40117Smckusick 		(void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
950*40117Smckusick 			(struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
951*40117Smckusick 
952*40117Smckusick 		/* We need to time the request even though we're
953*40117Smckusick 		 * retransmitting, in order to maintain backoff. */
954*40117Smckusick 		mntp->nm_rtt = 0;
955*40117Smckusick 		++mntp->nm_hostinfo->nh_sent;
956*40117Smckusick 		rep->r_flags |= (R_SENT|R_TIMING);
957*40117Smckusick 		rep->r_timer = rep->r_timerinit;
958*40117Smckusick wakeup:
959*40117Smckusick 		/* If error or interruptible mount, give user a look */
960*40117Smckusick 		if (error || (mntp->nm_flag & NFSMNT_INT))
961*40117Smckusick 			sorwakeup(so);
962*40117Smckusick 	}
963*40117Smckusick 	splx(s);
964*40117Smckusick 	timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ);
965*40117Smckusick }
966*40117Smckusick 
967*40117Smckusick /*
968*40117Smckusick  * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is
969*40117Smckusick  * used here. The timer state is held in the nfsmount structure and
970*40117Smckusick  * a single request is used to clock the response. When successful
971*40117Smckusick  * the rtt smoothing in nfs_updatetimer is used, when failed the backoff
972*40117Smckusick  * is done by nfs_backofftimer. We also log failure messages in these
973*40117Smckusick  * routines.
974*40117Smckusick  *
975*40117Smckusick  * Congestion variables are held in the nfshost structure which
976*40117Smckusick  * is referenced by nfsmounts and shared per-server. This separation
977*40117Smckusick  * makes it possible to do per-mount timing which allows varying disk
978*40117Smckusick  * access times to be dealt with, while preserving a network oriented
979*40117Smckusick  * congestion control scheme.
980*40117Smckusick  *
981*40117Smckusick  * The windowing implements the Jacobson/Karels slowstart algorithm
982*40117Smckusick  * with adjusted scaling factors. We start with one request, then send
983*40117Smckusick  * 4 more after each success until the ssthresh limit is reached, then
984*40117Smckusick  * we increment at a rate proportional to the window. On failure, we
985*40117Smckusick  * remember 3/4 the current window and clamp the send limit to 1. Note
986*40117Smckusick  * ICMP source quench is not reflected in so->so_error so we ignore that
987*40117Smckusick  * for now.
988*40117Smckusick  *
989*40117Smckusick  * NFS behaves much more like a transport protocol with these changes,
990*40117Smckusick  * shedding the teenage pedal-to-the-metal tendencies of "other"
991*40117Smckusick  * implementations.
992*40117Smckusick  *
993*40117Smckusick  * Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
994*40117Smckusick  */
995*40117Smckusick 
996*40117Smckusick /*
997*40117Smckusick  * The TCP algorithm was not forgiving enough. Because the NFS server
998*40117Smckusick  * responds only after performing lookups/diskio/etc, we have to be
999*40117Smckusick  * more prepared to accept a spiky variance. The TCP algorithm is:
1000*40117Smckusick  * TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1)
1001*40117Smckusick  */
1002*40117Smckusick #define NFS_RTO(mntp)	(((mntp)->nm_srtt >> 3) + (mntp)->nm_rttvar)
1003*40117Smckusick 
1004*40117Smckusick nfs_updatetimer(mntp)
1005*40117Smckusick 	register struct nfsmount *mntp;
1006*40117Smckusick {
1007*40117Smckusick 	register struct nfshost *nfshp = mntp->nm_hostinfo;
1008*40117Smckusick 
1009*40117Smckusick 	/* If retransmitted, clear and return */
1010*40117Smckusick 	if (mntp->nm_rexmit || nfshp->nh_currexmit) {
1011*40117Smckusick 		if (nfshp->nh_currexmit >= nfsrexmtthresh)
1012*40117Smckusick 			nfs_log("NFS server %s OK\n", mntp->nm_host);
1013*40117Smckusick 		mntp->nm_rexmit = nfshp->nh_currexmit = 0;
1014*40117Smckusick 		return;
1015*40117Smckusick 	}
1016*40117Smckusick 	/* If have a measurement, do smoothing */
1017*40117Smckusick 	if (mntp->nm_srtt) {
1018*40117Smckusick 		register short delta;
1019*40117Smckusick 		delta = mntp->nm_rtt - (mntp->nm_srtt >> 3);
1020*40117Smckusick 		if ((mntp->nm_srtt += delta) <= 0)
1021*40117Smckusick 			mntp->nm_srtt = 1;
1022*40117Smckusick 		if (delta < 0)
1023*40117Smckusick 			delta = -delta;
1024*40117Smckusick 		delta -= (mntp->nm_rttvar >> 2);
1025*40117Smckusick 		if ((mntp->nm_rttvar += delta) <= 0)
1026*40117Smckusick 			mntp->nm_rttvar = 1;
1027*40117Smckusick 	/* Else initialize */
1028*40117Smckusick 	} else {
1029*40117Smckusick 		mntp->nm_rttvar = mntp->nm_rtt << 1;
1030*40117Smckusick 		if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2;
1031*40117Smckusick 		mntp->nm_srtt = mntp->nm_rttvar << 2;
1032*40117Smckusick 	}
1033*40117Smckusick 	/* Compute new Retransmission TimeOut and clip */
1034*40117Smckusick 	mntp->nm_rto = NFS_RTO(mntp);
1035*40117Smckusick 	if (mntp->nm_rto < NFS_MINTIMEO)
1036*40117Smckusick 		mntp->nm_rto = NFS_MINTIMEO;
1037*40117Smckusick 	else if (mntp->nm_rto > NFS_MAXTIMEO)
1038*40117Smckusick 		mntp->nm_rto = NFS_MAXTIMEO;
1039*40117Smckusick 	nfshp->nh_currto = mntp->nm_rto;
1040*40117Smckusick 
1041*40117Smckusick 	/* Update window estimate */
1042*40117Smckusick 	if (nfshp->nh_window < nfshp->nh_ssthresh)	/* quickly */
1043*40117Smckusick 		nfshp->nh_window += 4;
1044*40117Smckusick 	else {						/* slowly */
1045*40117Smckusick 		register long incr = ++nfshp->nh_winext;
1046*40117Smckusick 		incr = (incr * incr) / nfshp->nh_window;
1047*40117Smckusick 		if (incr > 0) {
1048*40117Smckusick 			nfshp->nh_winext = 0;
1049*40117Smckusick 			++nfshp->nh_window;
1050*40117Smckusick 		}
1051*40117Smckusick 	}
1052*40117Smckusick 	if (nfshp->nh_window > NFS_MAXWINDOW)
1053*40117Smckusick 		nfshp->nh_window = NFS_MAXWINDOW;
1054*40117Smckusick }
1055*40117Smckusick 
1056*40117Smckusick nfs_backofftimer(mntp)
1057*40117Smckusick 	register struct nfsmount *mntp;
1058*40117Smckusick {
1059*40117Smckusick 	register struct nfshost *nfshp = mntp->nm_hostinfo;
1060*40117Smckusick 	register unsigned long newrto;
1061*40117Smckusick 
1062*40117Smckusick 	/* Clip shift count */
1063*40117Smckusick 	if (++mntp->nm_rexmit > 8 * sizeof mntp->nm_rto)
1064*40117Smckusick 		mntp->nm_rexmit = 8 * sizeof mntp->nm_rto;
1065*40117Smckusick 	/* Back off RTO exponentially */
1066*40117Smckusick 	newrto = NFS_RTO(mntp);
1067*40117Smckusick 	newrto <<= (mntp->nm_rexmit - 1);
1068*40117Smckusick 	if (newrto == 0 || newrto > NFS_MAXTIMEO)
1069*40117Smckusick 		newrto = NFS_MAXTIMEO;
1070*40117Smckusick 	mntp->nm_rto = nfshp->nh_currto = newrto;
1071*40117Smckusick 
1072*40117Smckusick 	/* If too many retries, message, assume a bogus RTT and re-measure */
1073*40117Smckusick 	if (nfshp->nh_currexmit < mntp->nm_rexmit) {
1074*40117Smckusick 		nfshp->nh_currexmit = mntp->nm_rexmit;
1075*40117Smckusick 		if (nfshp->nh_currexmit >= nfsrexmtthresh) {
1076*40117Smckusick 			if (nfshp->nh_currexmit == nfsrexmtthresh) {
1077*40117Smckusick 				nfs_log("NFS server %s not responding\n",
1078*40117Smckusick 								mntp->nm_host);
1079*40117Smckusick 				mntp->nm_rttvar += (mntp->nm_srtt >> 2);
1080*40117Smckusick 				mntp->nm_srtt = 0;
108138414Smckusick 			}
1082*40117Smckusick 			/* The routing invalidation should be a usrreq PRU */
1083*40117Smckusick 			if (mtod(nfshp->nh_sockaddr,
1084*40117Smckusick 				struct sockaddr *)->sa_family == AF_INET)
1085*40117Smckusick 				in_losing(mntp->nm_so->so_pcb);
108638414Smckusick 		}
108738414Smckusick 	}
1088*40117Smckusick 	/* Close down window but remember this point (3/4 current) for later */
1089*40117Smckusick 	nfshp->nh_ssthresh = ((nfshp->nh_window << 1) + nfshp->nh_window) >> 2;
1090*40117Smckusick 	nfshp->nh_window = 1;
1091*40117Smckusick 	nfshp->nh_winext = 0;
109238414Smckusick }
109338414Smckusick 
109438414Smckusick /*
1095*40117Smckusick  * Not all errors are fatal. The closed checks deal
1096*40117Smckusick  * with errors a little strangely.
109738414Smckusick  */
1098*40117Smckusick 
1099*40117Smckusick nfs_sockerr(so, sending)
1100*40117Smckusick 	struct socket *so;
1101*40117Smckusick 	int sending;
110238414Smckusick {
1103*40117Smckusick 	if (sending && (so->so_state & SS_CANTSENDMORE)) {
1104*40117Smckusick 		so->so_error = EPIPE;
1105*40117Smckusick 		return (EPIPE);
1106*40117Smckusick 	}
1107*40117Smckusick 
1108*40117Smckusick 	switch (so->so_error) {			/* inhibit certain errors */
1109*40117Smckusick 	case ENETDOWN:
1110*40117Smckusick 	case ENETUNREACH:
1111*40117Smckusick 	case EHOSTDOWN:
1112*40117Smckusick 	case EHOSTUNREACH:
1113*40117Smckusick 		so->so_error = 0;
1114*40117Smckusick 	case 0:
1115*40117Smckusick 		break;
1116*40117Smckusick 	default:				/* return all others */
1117*40117Smckusick 		printf("nfs_sockerr: error %d on %s\n", so->so_error,
1118*40117Smckusick 			sending?"send":"receive");
1119*40117Smckusick 		return (so->so_error);
1120*40117Smckusick 	}
1121*40117Smckusick 
1122*40117Smckusick 	if (!sending && (so->so_state & SS_CANTRCVMORE)) {
1123*40117Smckusick 		so->so_error = 0;		/* (no error) */
1124*40117Smckusick 		return (EPIPE);
1125*40117Smckusick 	}
1126*40117Smckusick 	return (so->so_error);
112738414Smckusick }
1128