xref: /csrg-svn/sys/nfs/nfs_socket.c (revision 40484)
138414Smckusick /*
238414Smckusick  * Copyright (c) 1989 The Regents of the University of California.
338414Smckusick  * All rights reserved.
438414Smckusick  *
538414Smckusick  * This code is derived from software contributed to Berkeley by
638414Smckusick  * Rick Macklem at The University of Guelph.
738414Smckusick  *
838414Smckusick  * Redistribution and use in source and binary forms are permitted
938414Smckusick  * provided that the above copyright notice and this paragraph are
1038414Smckusick  * duplicated in all such forms and that any documentation,
1138414Smckusick  * advertising materials, and other materials related to such
1238414Smckusick  * distribution and use acknowledge that the software was developed
1338414Smckusick  * by the University of California, Berkeley.  The name of the
1438414Smckusick  * University may not be used to endorse or promote products derived
1538414Smckusick  * from this software without specific prior written permission.
1638414Smckusick  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
1738414Smckusick  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
1838414Smckusick  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
1938414Smckusick  *
20*40484Smckusick  *	@(#)nfs_socket.c	7.9 (Berkeley) 03/13/90
2138414Smckusick  */
2238414Smckusick 
/*
 * Socket operations for use by nfs (similar to uipc_socket.c, but never
 * with copies to/from a uio vector)
 * NB: For now, they only work for datagram sockets.
 * (Use on stream sockets would require some record boundary mark in the
 *  stream, as defined by "RPC: Remote Procedure Call Protocol
 *  Specification" RFC1057 Section 10, and different versions of send,
 *  receive and reply that do not assume an atomic protocol.)
 */
3338414Smckusick 
3438414Smckusick #include "types.h"
3538414Smckusick #include "param.h"
3638414Smckusick #include "uio.h"
3738414Smckusick #include "user.h"
3840117Smckusick #include "proc.h"
3940117Smckusick #include "signal.h"
4038414Smckusick #include "mount.h"
4138414Smckusick #include "kernel.h"
4238414Smckusick #include "malloc.h"
4338414Smckusick #include "mbuf.h"
4438414Smckusick #include "vnode.h"
4538414Smckusick #include "domain.h"
4638414Smckusick #include "protosw.h"
4738414Smckusick #include "socket.h"
4838414Smckusick #include "socketvar.h"
4938414Smckusick #include "rpcv2.h"
5038414Smckusick #include "nfsv2.h"
5138414Smckusick #include "nfs.h"
5238414Smckusick #include "xdr_subs.h"
5338414Smckusick #include "nfsm_subs.h"
5438414Smckusick #include "nfsmount.h"
5538414Smckusick 
5640117Smckusick #include "syslog.h"
5740117Smckusick #define nfs_log(message, host)	log(LOG_ERR, message, host)
5840117Smckusick 
5938414Smckusick #define	TRUE	1
6038414Smckusick 
/*
 * Set the lock on sockbuf sb.  While SB_LOCK is already set, note our
 * interest with SB_WANT and sleep on sb_flags at priority PZERO-1.
 * The body is wrapped in do { } while (0) so that "nfs_sblock(sb);"
 * expands to exactly one statement and can be used as the body of an
 * unbraced if/else.
 */
#define nfs_sblock(sb) do { \
	while ((sb)->sb_flags & SB_LOCK) { \
		(sb)->sb_flags |= SB_WANT; \
		sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \
	} \
	(sb)->sb_flags |= SB_LOCK; \
} while (0)
6940117Smckusick /*
7040117Smckusick  * nfs_sbwait() is simply sbwait() but at a negative priority so that it
7140117Smckusick  * can not be interrupted by a signal.
7240117Smckusick  */
7340117Smckusick nfs_sbwait(sb)
7440117Smckusick 	struct sockbuf *sb;
7540117Smckusick {
7640117Smckusick 	sb->sb_flags |= SB_WAIT;
7740117Smckusick 	sleep((caddr_t)&sb->sb_cc, PZERO-2);
7840117Smckusick }
7938414Smckusick 
8038414Smckusick /*
8138414Smckusick  * External data, mostly RPC constants in XDR form
8238414Smckusick  */
8338414Smckusick extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
8438414Smckusick 	rpc_msgaccepted, rpc_call;
8538414Smckusick extern u_long nfs_prog, nfs_vers;
8638414Smckusick int	nfsrv_null(),
8738414Smckusick 	nfsrv_getattr(),
8838414Smckusick 	nfsrv_setattr(),
8938414Smckusick 	nfsrv_lookup(),
9038414Smckusick 	nfsrv_readlink(),
9138414Smckusick 	nfsrv_read(),
9238414Smckusick 	nfsrv_write(),
9338414Smckusick 	nfsrv_create(),
9438414Smckusick 	nfsrv_remove(),
9538414Smckusick 	nfsrv_rename(),
9638414Smckusick 	nfsrv_link(),
9738414Smckusick 	nfsrv_symlink(),
9838414Smckusick 	nfsrv_mkdir(),
9938414Smckusick 	nfsrv_rmdir(),
10038414Smckusick 	nfsrv_readdir(),
10138414Smckusick 	nfsrv_statfs(),
10238414Smckusick 	nfsrv_noop();
10338414Smckusick 
10438414Smckusick int (*nfsrv_procs[NFS_NPROCS])() = {
10538414Smckusick 	nfsrv_null,
10638414Smckusick 	nfsrv_getattr,
10738414Smckusick 	nfsrv_setattr,
10838414Smckusick 	nfsrv_noop,
10938414Smckusick 	nfsrv_lookup,
11038414Smckusick 	nfsrv_readlink,
11138414Smckusick 	nfsrv_read,
11238414Smckusick 	nfsrv_noop,
11338414Smckusick 	nfsrv_write,
11438414Smckusick 	nfsrv_create,
11538414Smckusick 	nfsrv_remove,
11638414Smckusick 	nfsrv_rename,
11738414Smckusick 	nfsrv_link,
11838414Smckusick 	nfsrv_symlink,
11938414Smckusick 	nfsrv_mkdir,
12038414Smckusick 	nfsrv_rmdir,
12138414Smckusick 	nfsrv_readdir,
12238414Smckusick 	nfsrv_statfs,
12338414Smckusick };
12438414Smckusick 
12540117Smckusick struct nfshost *nfshosth;
12640117Smckusick struct nfsreq nfsreqh;
12740117Smckusick int nfsrexmtthresh = NFS_FISHY;
12838414Smckusick 
12938414Smckusick /*
13040117Smckusick  * Initialize sockets and per-host congestion for a new NFS connection.
13140117Smckusick  * We do not free the sockaddr if error.
13238414Smckusick  */
13340117Smckusick nfs_connect(nmp, saddr)
13440117Smckusick 	register struct nfsmount *nmp;
13540117Smckusick 	struct mbuf *saddr;
13640117Smckusick {
13740117Smckusick 	int s, error, srvaddrlen;
13840117Smckusick 	struct mbuf *m;
13940117Smckusick 	register struct nfshost *nfshp;
14040117Smckusick 
14140117Smckusick 	nmp->nm_so = 0;
14240117Smckusick 	if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family,
14340117Smckusick 				&nmp->nm_so, SOCK_DGRAM, 0))
14440117Smckusick 		goto bad;
14540117Smckusick 
14640117Smckusick 	/* Unix sockets do not provide a local bind for server reply */
14740117Smckusick 	if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) {
14840117Smckusick 		struct sockaddr *sa;
14940117Smckusick 		static char client[] = "/tmp/.nfs/nfsclient##";
15040117Smckusick 		static int serial;
15140117Smckusick 		int firstserial;
15240117Smckusick 		m = m_getclr(M_WAIT, MT_SONAME);
15340117Smckusick 		if (m == NULL) {
15440117Smckusick 			error = ENOBUFS;
15540117Smckusick 			goto bad;
15640117Smckusick 		}
15740117Smckusick 		m->m_len = sizeof (client) + 2;
15840117Smckusick 		sa = mtod(m, struct sockaddr *);
15940117Smckusick 		sa->sa_family = AF_UNIX;
16040117Smckusick #ifdef	MSG_TRUNC	/* Have sa_len to set? */
16140117Smckusick 		sa->sa_len = m->m_len;
16240117Smckusick #endif
16340117Smckusick 		bcopy(client, sa->sa_data, sizeof(client));
16440117Smckusick 		firstserial = serial;
16540117Smckusick 		do {
16640117Smckusick 			if (++serial >= 100) serial = 0;
16740117Smckusick 			sa->sa_data[19] = (serial / 10) + '0';
16840117Smckusick 			sa->sa_data[20] = (serial % 10) + '0';
16940117Smckusick 			error = sobind(nmp->nm_so, m);
17040117Smckusick 			if (firstserial == serial) break;
17140117Smckusick 		} while (error == EADDRINUSE);
17240117Smckusick 		m_freem(m);
17340117Smckusick 		if (error)
17440117Smckusick 			goto bad;
17540117Smckusick 	}
17640117Smckusick 
17740117Smckusick 	if (error = soconnect(nmp->nm_so, saddr))
17840117Smckusick 		goto bad;
17940117Smckusick 	error = soreserve(nmp->nm_so,	/* get space ! */
18040117Smckusick 				nmp->nm_wsize + 1024,		/* one out */
18140117Smckusick 				(nmp->nm_rsize + 1024) * 4);	/* four in */
18240117Smckusick 	if (error)
18340117Smckusick 		goto bad;
18440117Smckusick 
18540117Smckusick 	/*
18640117Smckusick 	 * Search mount list for existing server entry.
18740117Smckusick 	 *
18840117Smckusick 	 * Note, even though we have a sockaddr, it is not quite reliable
18940117Smckusick 	 * enough to bcmp against. For instance, a sockaddr_in has a
19040117Smckusick 	 * sin_zero field which is not reliably zeroed by user code (e.g.
19140117Smckusick 	 * mount). So what we do as an attempt at transport independence
19240117Smckusick 	 * is to get the peeraddr of our connected socket into a zeroed
19340117Smckusick 	 * sockaddr. Then we cache that and compare against it. This is
19440117Smckusick 	 * not exactly perfect. However it is not critical that it be, if
19540117Smckusick 	 * we cannot match the sockaddr we will simply allocate a new nfshp
19640117Smckusick 	 * per mount, which will disable the per-host congestion but
19740117Smckusick 	 * everything else will work as normal.
19840117Smckusick 	 */
19940117Smckusick 	m = m_getclr(M_WAIT, MT_SONAME);
20040117Smckusick 	if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR,
20140117Smckusick 				(struct mbuf *)0, m, (struct mbuf *)0) == 0) {
20240117Smckusick 		m_freem(saddr);
20340117Smckusick 		saddr = m;
20440117Smckusick 	} else
20540117Smckusick 		m_freem(m);
20640117Smckusick 	srvaddrlen = saddr->m_len;
20740117Smckusick 
20840117Smckusick 	s = splnet();
20940117Smckusick 
21040117Smckusick 	for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) {
21140117Smckusick 		if (srvaddrlen != nfshp->nh_salen)
21240117Smckusick 			continue;
21340117Smckusick 		if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t),
21440117Smckusick 				srvaddrlen))
21540117Smckusick 			break;
21640117Smckusick 	}
21740117Smckusick 	if (nfshp)		/* Have an existing mount host */
21840117Smckusick 		m_freem(saddr);
21940117Smckusick 	else {
22040117Smckusick 		MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK);
22140117Smckusick 		bzero((caddr_t)nfshp, sizeof *nfshp);
22240117Smckusick 		nfshp->nh_sockaddr = saddr;
22340117Smckusick 		nfshp->nh_salen = srvaddrlen;
22440117Smckusick 		/* Initialize other non-zero congestion variables */
22540117Smckusick 		nfshp->nh_currto = NFS_TIMEO;
22640117Smckusick 		nfshp->nh_window = 1;		    /* Initial send window */
22740117Smckusick 		nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */
22840117Smckusick 		if (nfshosth) nfshosth->nh_prev = nfshp;	/* Chain in */
22940117Smckusick 		nfshp->nh_next = nfshosth;
23040117Smckusick 		nfshosth = nfshp;
23140117Smckusick 	}
23240117Smckusick 	nfshp->nh_refcnt++;
23340117Smckusick 	splx(s);
23440117Smckusick 	nmp->nm_hostinfo = nfshp;
23540117Smckusick 	if (nmp->nm_rto == NFS_TIMEO) {
23640117Smckusick 		nmp->nm_rto = nfshp->nh_currto;
23740117Smckusick 		nmp->nm_rttvar = nmp->nm_rto << 1;
23840117Smckusick 	}
23940117Smckusick 	return (0);
24040117Smckusick 
24140117Smckusick bad:
24240117Smckusick 	if (nmp->nm_so) (void) soclose(nmp->nm_so);
24340117Smckusick 	nmp->nm_so = 0;
24440117Smckusick 	return (error);
24540117Smckusick }
24640117Smckusick 
24740117Smckusick /*
24840117Smckusick  * NFS disconnect. Clean up and unlink.
24940117Smckusick  */
25040117Smckusick nfs_disconnect(nmp)
25140117Smckusick 	register struct nfsmount *nmp;
25240117Smckusick {
25340117Smckusick 	register struct nfshost *nfshp;
25440117Smckusick 
25540117Smckusick 	if (nmp->nm_so)
25640117Smckusick 		soclose(nmp->nm_so);
25740117Smckusick 	nmp->nm_so = 0;
25840117Smckusick 	if (nfshp = nmp->nm_hostinfo) {
25940117Smckusick 		int s = splnet();
26040117Smckusick 		if (--nfshp->nh_refcnt <= 0) {
26140117Smckusick 			if (nfshp->nh_next)
26240117Smckusick 				nfshp->nh_next->nh_prev = nfshp->nh_prev;
26340117Smckusick 			if (nfshp->nh_prev)
26440117Smckusick 				nfshp->nh_prev->nh_next = nfshp->nh_next;
26540117Smckusick 			else
26640117Smckusick 				nfshosth = nfshp->nh_next;
26740117Smckusick 			/* If unix family, remove the nfsclient from /tmp */
26840117Smckusick 			if (mtod(nfshp->nh_sockaddr,
26940117Smckusick 				struct sockaddr *)->sa_family == AF_UNIX) {
27040117Smckusick 					/* Lookup sa_data, do VOP_REMOVE... */
27140117Smckusick 			}
27240117Smckusick 			m_freem(nfshp->nh_sockaddr);
27340117Smckusick 			FREE(nfshp, M_NFSMNT);
27440117Smckusick 		}
27540117Smckusick 		nmp->nm_hostinfo = 0;
27640117Smckusick 		splx(s);
27740117Smckusick 	}
27840117Smckusick }
27940117Smckusick 
28040117Smckusick /*
28140117Smckusick  * This is a stripped down non-interruptible version of sosend().
28240117Smckusick  */
28340117Smckusick nfs_send(so, nam, top, flags, siz)
28438414Smckusick 	register struct socket *so;
28538414Smckusick 	struct mbuf *nam;
28638414Smckusick 	struct mbuf *top;
28738414Smckusick 	int flags;
28838414Smckusick 	int siz;
28938414Smckusick {
29040117Smckusick 	int error, s;
29138414Smckusick 
29238414Smckusick #ifdef MGETHDR
29338414Smckusick 	top->m_pkthdr.len = siz;
29438414Smckusick #endif
29540117Smckusick 	for (;;) {
29640117Smckusick 		nfs_sblock(&so->so_snd);
29740117Smckusick 		s = splnet();
29840117Smckusick 		if (error = nfs_sockerr(so, 1)) {
29940117Smckusick 			splx(s);
30040117Smckusick 			m_freem(top);
30140117Smckusick 			break;
30240117Smckusick 		}
30340117Smckusick 		if (sbspace(&so->so_snd) < siz) {
30440117Smckusick 			sbunlock(&so->so_snd);
30540117Smckusick 			nfs_sbwait(&so->so_snd);
30640117Smckusick 			splx(s);
30740117Smckusick 			continue;
30840117Smckusick 		}
30940117Smckusick 		error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top,
31040327Ssklower 			(struct mbuf *)nam, (struct mbuf *)0);
31138414Smckusick 		splx(s);
31240117Smckusick 		break;
31338414Smckusick 	}
31438414Smckusick 	sbunlock(&so->so_snd);
31538414Smckusick 	return (error);
31638414Smckusick }
31738414Smckusick 
31838414Smckusick /*
31940117Smckusick  * This is a stripped down datagram specific version of soreceive()
32038414Smckusick  */
32140117Smckusick nfs_dgreceive(so, msk, mtch, aname, mp)
32238414Smckusick 	register struct socket *so;
32339754Smckusick 	u_long msk;
32439754Smckusick 	u_long mtch;
32538414Smckusick 	struct mbuf **aname;
32638414Smckusick 	struct mbuf **mp;
32738414Smckusick {
32838414Smckusick 	register struct mbuf *m;
32938414Smckusick 	int s, error = 0;
33038414Smckusick 	struct mbuf *nextrecord;
33138414Smckusick 
33238414Smckusick 	if (aname)
33338414Smckusick 		*aname = 0;
33438414Smckusick 
33540117Smckusick 	for (;;) {
33640117Smckusick 		sblock(&so->so_rcv);
33740117Smckusick 		s = splnet();
33838414Smckusick 
33940117Smckusick 		if (so->so_rcv.sb_cc == 0) {
34040117Smckusick 			if (error = nfs_sockerr(so, 0)) {
34140117Smckusick 				so->so_error = 0;
34240117Smckusick 				break;
34340117Smckusick 			}
34439754Smckusick 			sbunlock(&so->so_rcv);
34540117Smckusick 			sbwait(&so->so_rcv);
34639754Smckusick 			splx(s);
34740117Smckusick 			continue;
34839754Smckusick 		}
34938414Smckusick 		m = so->so_rcv.sb_mb;
35040117Smckusick 		if (m == 0)
35140117Smckusick 			panic("nfs_dgreceive 1");
35240117Smckusick 		nextrecord = m->m_nextpkt;
35340117Smckusick 		/* Save sender's address */
35440117Smckusick 		if (m->m_type != MT_SONAME)
35540117Smckusick 			panic("nfs_dgreceive 1a");
35638414Smckusick 		sbfree(&so->so_rcv, m);
35740117Smckusick 		if (aname) {
35840117Smckusick 			*aname = m;
35940117Smckusick 			so->so_rcv.sb_mb = m->m_next;
36040117Smckusick 			m->m_next = 0;
36140117Smckusick 			m = so->so_rcv.sb_mb;
36240117Smckusick 		} else {
36340117Smckusick 			MFREE(m, so->so_rcv.sb_mb);
36440117Smckusick 			m = so->so_rcv.sb_mb;
36540117Smckusick 		}
36640117Smckusick 		/* Drop control mbuf's */
36740117Smckusick 		if (m && m->m_type == MT_RIGHTS)
36840117Smckusick 			panic("nfs_dgreceive 2");
36940117Smckusick 		if (m && m->m_type == MT_CONTROL) {
37040117Smckusick 			sbfree(&so->so_rcv, m);
37140117Smckusick 			MFREE(m, so->so_rcv.sb_mb);
37240117Smckusick 			m = so->so_rcv.sb_mb;
37340117Smckusick 		}
37440117Smckusick 		/* Dequeue packet from sockbuf */
37540117Smckusick 		*mp = m;
37640117Smckusick 		while (m) {
37740117Smckusick 			if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
37840117Smckusick 				panic("nfs_dgreceive 3");
37940117Smckusick 			sbfree(&so->so_rcv, m);
38040117Smckusick 			m = so->so_rcv.sb_mb = m->m_next;
38140117Smckusick 		}
38240117Smckusick 		so->so_rcv.sb_mb = nextrecord;
38340117Smckusick 		/* Return */
38440117Smckusick 		break;
38538414Smckusick 	}
38638414Smckusick 	sbunlock(&so->so_rcv);
38738414Smckusick 	splx(s);
38838414Smckusick 	return (error);
38938414Smckusick }
39038414Smckusick 
39138414Smckusick struct rpc_replyhead {
39238414Smckusick 	u_long	r_xid;
39338414Smckusick 	u_long	r_rep;
39438414Smckusick };
39538414Smckusick 
39638414Smckusick /*
39740117Smckusick  * Implement NFS client side datagram receive.
39838414Smckusick  * We depend on the way that records are added to the sockbuf
39938414Smckusick  * by sbappend*.  In particular, each record (mbufs linked through m_next)
40038414Smckusick  * must begin with an address, followed by optional MT_CONTROL mbuf
40138414Smckusick  * and then zero or more mbufs of data.
40238414Smckusick  * We must search through the list of received datagrams matching them
40338414Smckusick  * with outstanding requests using the xid, until ours is found.
40438414Smckusick  */
40540117Smckusick nfs_dgreply(so, mntp, myrep)
40638414Smckusick 	register struct socket *so;
40738414Smckusick 	struct nfsmount *mntp;
40839344Smckusick 	struct nfsreq *myrep;
40938414Smckusick {
41038414Smckusick 	register struct mbuf *m;
41138414Smckusick 	register struct nfsreq *rep;
41238414Smckusick 	register int error = 0, s;
41340117Smckusick 	int logged = 0;
41438414Smckusick 	struct mbuf *nextrecord;
41538414Smckusick 	struct rpc_replyhead replyh;
41638414Smckusick 
41738414Smckusick restart:
41839344Smckusick 	nfs_sblock(&so->so_rcv);
41940117Smckusick 	s = splnet();
42040117Smckusick 	/* Already received and queued for us, bye bye */
42139344Smckusick 	if (myrep->r_mrep != NULL) {
42240117Smckusick 		error = 0;
42340117Smckusick 		goto release;
42439344Smckusick 	}
42540117Smckusick 	/* If we have run out of retries (hard mounts have bogus count) */
42640117Smckusick 	if (myrep->r_rexmit > myrep->r_retry) {
42740117Smckusick 		error = ETIMEDOUT;
42840117Smckusick 		nfsstats.rpctimeouts++;
42940117Smckusick giveup:
43040117Smckusick 		if (myrep->r_flags & R_TIMING) {
43140117Smckusick 			myrep->r_flags &= ~R_TIMING;
43240117Smckusick 			mntp->nm_rtt = -1;
43340117Smckusick 		}
43440117Smckusick 		if (myrep->r_flags & R_SENT) {
43540117Smckusick 			myrep->r_flags &= ~R_SENT;
43640117Smckusick 			--mntp->nm_hostinfo->nh_sent;
43740117Smckusick 			/* If count now 0, want to initiate new req */
43840117Smckusick 		}
43940117Smckusick 		goto release;
44039344Smckusick 	}
44138414Smckusick 
44239344Smckusick 	m = so->so_rcv.sb_mb;
44339344Smckusick 	if (m == 0) {
44439344Smckusick 		if (so->so_rcv.sb_cc)
44539344Smckusick 			panic("nfs_soreply 1");
44640117Smckusick 		if (error = nfs_sockerr(so, 0)) {
44738414Smckusick 			so->so_error = 0;
44840117Smckusick 			goto giveup;
44938414Smckusick 		}
45040117Smckusick 		/* Allow signals to interrupt request? (nfs_timer wakes up) */
45140117Smckusick 		if ((mntp->nm_flag & NFSMNT_INT) &&
452*40484Smckusick 		    (u.u_sigintr & sigmask(u.u_procp->p_cursig)) != 0) {
45340117Smckusick 			error = EINTR;
45440117Smckusick 			goto giveup;
45540117Smckusick 		}
45640117Smckusick 		if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0)
45740117Smckusick 			uprintf("NFS server %s not responding, retrying\n",
45840351Smckusick 				mntp->nm_mountp->m_stat.f_mntfromname);
45938414Smckusick 		sbunlock(&so->so_rcv);
46038414Smckusick 		nfs_sbwait(&so->so_rcv);
46138414Smckusick 		splx(s);
46238414Smckusick 		goto restart;
46338414Smckusick 	}
46438414Smckusick 
46538414Smckusick 	/*
46638414Smckusick 	 * Take off the address, check for rights and ditch any control
46738414Smckusick 	 * mbufs.
46838414Smckusick 	 */
46940117Smckusick 	nextrecord = m->m_nextpkt;
47038414Smckusick 	if (m->m_type != MT_SONAME)
47138414Smckusick 		panic("nfs reply SONAME");
47238414Smckusick 	sbfree(&so->so_rcv, m);
47338414Smckusick 	MFREE(m, so->so_rcv.sb_mb);
47438414Smckusick 	m = so->so_rcv.sb_mb;
47538414Smckusick 	if (m && m->m_type == MT_RIGHTS)
47638414Smckusick 		panic("nfs reply RIGHTS");
47738414Smckusick 	if (m && m->m_type == MT_CONTROL) {
47838414Smckusick 		sbfree(&so->so_rcv, m);
47938414Smckusick 		MFREE(m, so->so_rcv.sb_mb);
48038414Smckusick 		m = so->so_rcv.sb_mb;
48138414Smckusick 	}
48239344Smckusick 	if (m) {
48338414Smckusick 		m->m_nextpkt = nextrecord;
48439344Smckusick 	} else {
48539344Smckusick 		so->so_rcv.sb_mb = nextrecord;
48638414Smckusick 		sbunlock(&so->so_rcv);
48738414Smckusick 		splx(s);
48838414Smckusick 		goto restart;
48938414Smckusick 	}
49038414Smckusick 
49138414Smckusick 	/*
49238414Smckusick 	 * Get the xid and check that it is an rpc reply
49338414Smckusick 	 */
49440117Smckusick 	if (m->m_len >= sizeof replyh)
49540117Smckusick 		bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh);
49638414Smckusick 	else {
49740117Smckusick 		struct mbuf *mp = m;
49840117Smckusick 		caddr_t cp = (caddr_t)&replyh;
49940117Smckusick 		int cnt = sizeof replyh;
50040117Smckusick 		do {
50138414Smckusick 			if (mp->m_len > 0) {
50240117Smckusick 				int xfer = (mp->m_len >= cnt) ? cnt : mp->m_len;
50338414Smckusick 				bcopy(mtod(mp, caddr_t), cp, xfer);
50438414Smckusick 				cnt -= xfer;
50538414Smckusick 				cp += xfer;
50638414Smckusick 			}
50738414Smckusick 			if (cnt > 0)
50838414Smckusick 				mp = mp->m_next;
50940117Smckusick 		} while (mp && cnt > 0);
51040117Smckusick 		if (mp == NULL) {		/* Insufficient length */
51140117Smckusick 			nfsstats.rpcinvalid++;
51240117Smckusick 			goto dropit;
51338414Smckusick 		}
51438414Smckusick 	}
51540117Smckusick 	if (replyh.r_rep != rpc_reply) {	/* Not a reply */
51640117Smckusick 		nfsstats.rpcinvalid++;
51738414Smckusick 		goto dropit;
51840117Smckusick 	}
51938414Smckusick 	/*
52038414Smckusick 	 * Loop through the request list to match up the reply
52140117Smckusick 	 * If no match, just drop the datagram
52238414Smckusick 	 */
52340117Smckusick 	if (rep = nfsreqh.r_next) {
52440117Smckusick 	    while (rep != &nfsreqh) {
52540117Smckusick 		/* The socket, being connected, will only queue matches */
52640117Smckusick 		if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) {
52738414Smckusick 			/* Found it.. */
52840117Smckusick 			if (rep->r_mrep)	/* Already there - duplicate */
52940117Smckusick 				break;
53038414Smckusick 			rep->r_mrep = m;
53138414Smckusick 			while (m) {
53238414Smckusick 				if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
53338414Smckusick 					panic("nfs_soreply 3");
53438414Smckusick 				sbfree(&so->so_rcv, m);
53538414Smckusick 				m = so->so_rcv.sb_mb = m->m_next;
53638414Smckusick 			}
53738414Smckusick 			so->so_rcv.sb_mb = nextrecord;
53840117Smckusick 			if (rep->r_flags & R_TIMING) {
53940117Smckusick 				nfs_updatetimer(mntp);
54040117Smckusick 				rep->r_flags &= ~R_TIMING;
54140117Smckusick 				mntp->nm_rtt = -1;	/* re-arm timer */
54240117Smckusick 			}
54340117Smckusick 			if (rep->r_flags & R_SENT) {
54440117Smckusick 				rep->r_flags &= ~R_SENT;
54540117Smckusick 				--mntp->nm_hostinfo->nh_sent;
54640117Smckusick 				/* If count now 0, want to initiate new req */
54740117Smckusick 			}
54840117Smckusick 			if (rep == myrep) {		/* This is success */
54940117Smckusick 				if (logged)
55040117Smckusick 					uprintf("NFS server %s responded\n",
55140351Smckusick 					mntp->nm_mountp->m_stat.f_mntfromname);
55238414Smckusick 				goto release;
55340117Smckusick 			}
55440117Smckusick 			/* Else wake up other sleeper and wait for next */
55540117Smckusick 			sbunlock(&so->so_rcv);
55640117Smckusick 			sorwakeup(so);
55740117Smckusick 			splx(s);
55840117Smckusick 			goto restart;
55938414Smckusick 		}
56038414Smckusick 		rep = rep->r_next;
56140117Smckusick 	    }
56238414Smckusick 	}
56340117Smckusick 	/* If not matched to request, drop it */
56440117Smckusick 	nfsstats.rpcunexpected++;
56538414Smckusick dropit:
56640117Smckusick 	sbdroprecord(&so->so_rcv);
56738414Smckusick 	sbunlock(&so->so_rcv);
56838414Smckusick 	splx(s);
56938414Smckusick 	goto restart;
57040117Smckusick 
57138414Smckusick release:
57238414Smckusick 	sbunlock(&so->so_rcv);
57338414Smckusick 	splx(s);
57438414Smckusick 	return (error);
57538414Smckusick }
57638414Smckusick 
57738414Smckusick /*
57838414Smckusick  * nfs_request - goes something like this
57938414Smckusick  *	- fill in request struct
58038414Smckusick  *	- links it into list
58138414Smckusick  *	- calls nfs_sosend() for first transmit
58238414Smckusick  *	- calls nfs_soreceive() to get reply
58338414Smckusick  *	- break down rpc header and return with nfs reply pointed to
58438414Smckusick  *	  by mrep or error
58538414Smckusick  * nb: always frees up mreq mbuf list
58638414Smckusick  */
58740117Smckusick nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp)
58838414Smckusick 	struct vnode *vp;
58938414Smckusick 	struct mbuf *mreq;
59038414Smckusick 	u_long xid;
59140117Smckusick 	int idem;
59238414Smckusick 	struct mount *mp;
59338414Smckusick 	struct mbuf **mrp;
59438414Smckusick 	struct mbuf **mdp;
59538414Smckusick 	caddr_t *dposp;
59638414Smckusick {
59738414Smckusick 	register struct mbuf *m, *mrep;
59838414Smckusick 	register struct nfsreq *rep;
59938414Smckusick 	register u_long *p;
60038414Smckusick 	register int len;
60138414Smckusick 	struct nfsmount *mntp;
60238414Smckusick 	struct mbuf *md;
60339344Smckusick 	struct nfsreq *reph;
60438414Smckusick 	caddr_t dpos;
60538414Smckusick 	char *cp2;
60638414Smckusick 	int t1;
60738414Smckusick 	int s;
60838414Smckusick 	int error;
60938414Smckusick 
61038414Smckusick 	mntp = vfs_to_nfs(mp);
61138414Smckusick 	m = mreq;
61238414Smckusick 	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
61338414Smckusick 	rep->r_xid = xid;
61438414Smckusick 	rep->r_mntp = mntp;
61538414Smckusick 	rep->r_vp = vp;
61638414Smckusick 	if (mntp->nm_flag & NFSMNT_SOFT)
61740117Smckusick 		rep->r_retry = mntp->nm_retry;
61838414Smckusick 	else
61940117Smckusick 		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
62040117Smckusick 	rep->r_flags = rep->r_rexmit = 0;
62140117Smckusick 	/* Idempotency: add N * MINTIMEO to requests if not, else use 0 */
62240117Smckusick 	rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO);
62338414Smckusick 	rep->r_mrep = NULL;
62438414Smckusick 	rep->r_mreq = m;
62538414Smckusick 	len = 0;
62638414Smckusick 	while (m) {
62738414Smckusick 		len += m->m_len;
62838414Smckusick 		m = m->m_next;
62938414Smckusick 	}
63038414Smckusick 	rep->r_msiz = len;
63138414Smckusick 
63240117Smckusick 	/*
63340117Smckusick 	 * Do the client side RPC.
63440117Smckusick 	 */
63540117Smckusick 	nfsstats.rpcrequests++;
63640117Smckusick 	s = splnet();
63740117Smckusick 	/* Chain request into list of outstanding requests. Be sure
63840117Smckusick 	 * to put it LAST so timer finds oldest requests first. */
63939344Smckusick 	reph = &nfsreqh;
64039344Smckusick 	if (reph->r_prev == NULL) {
64139344Smckusick 		reph->r_next = rep;
64239344Smckusick 		rep->r_prev = reph;
64339344Smckusick 	} else {
64439344Smckusick 		reph->r_prev->r_next = rep;
64539344Smckusick 		rep->r_prev = reph->r_prev;
64639344Smckusick 	}
64739344Smckusick 	reph->r_prev = rep;
64839344Smckusick 	rep->r_next = reph;
64940117Smckusick 	/*
65040117Smckusick 	 * If backing off another request or avoiding congestion, don't
65140117Smckusick 	 * send this one now but let timer do it. If not timing a request,
65240117Smckusick 	 * do it now.
65340117Smckusick 	 */
65440117Smckusick 	if (mntp->nm_hostinfo->nh_sent > 0 &&
65540117Smckusick 	    (mntp->nm_hostinfo->nh_currexmit != 0 ||
65640117Smckusick 	     mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) {
65740117Smckusick 		splx(s);
65840117Smckusick 		goto skipsend;
65940117Smckusick 	}
66040117Smckusick 	++mntp->nm_hostinfo->nh_sent;	/* Inconsistent if can't NFSMCOPY */
66140117Smckusick 	rep->r_flags |= R_SENT;		/* But not a catastrophe */
66240117Smckusick 	if (mntp->nm_rtt == -1) {
66340117Smckusick 		mntp->nm_rtt = 0;
66440117Smckusick 		rep->r_flags |= R_TIMING;
66540117Smckusick 	}
66638414Smckusick 	splx(s);
66738414Smckusick 
66838414Smckusick 	/*
66940117Smckusick 	 * If we can get a packet to send, send it off...
67038414Smckusick 	 * otherwise the timer will retransmit later
67138414Smckusick 	 */
67240117Smckusick 	m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT);
67338414Smckusick 	if (m != NULL)
67440117Smckusick 		(void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len);
67540117Smckusick 	/*
67640117Smckusick 	 * Wait for the reply from our send or the timer's.
67740117Smckusick 	 */
67840117Smckusick skipsend:
67940117Smckusick 	error = nfs_dgreply(mntp->nm_so, mntp, rep);
68038414Smckusick 
68140117Smckusick 	/*
68240117Smckusick 	 * RPC done, unlink the request.
68340117Smckusick 	 */
68438414Smckusick 	s = splnet();
68538414Smckusick 	rep->r_prev->r_next = rep->r_next;
68639344Smckusick 	rep->r_next->r_prev = rep->r_prev;
68738414Smckusick 	splx(s);
68838414Smckusick 	m_freem(rep->r_mreq);
68938414Smckusick 	mrep = md = rep->r_mrep;
69038414Smckusick 	FREE((caddr_t)rep, M_NFSREQ);
69138414Smckusick 	if (error)
69238414Smckusick 		return (error);
69338414Smckusick 
69438414Smckusick 	/*
69538414Smckusick 	 * break down the rpc header and check if ok
69638414Smckusick 	 */
69738414Smckusick 	dpos = mtod(md, caddr_t);
69838414Smckusick 	nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED);
69938414Smckusick 	p += 2;
70038414Smckusick 	if (*p++ == rpc_msgdenied) {
70138414Smckusick 		if (*p == rpc_mismatch)
70238414Smckusick 			error = EOPNOTSUPP;
70338414Smckusick 		else
70438414Smckusick 			error = EACCES;
70538414Smckusick 		m_freem(mrep);
70638414Smckusick 		return (error);
70738414Smckusick 	}
70838414Smckusick 	/*
70938414Smckusick 	 * skip over the auth_verf, someday we may want to cache auth_short's
71038414Smckusick 	 * for nfs_reqhead(), but for now just dump it
71138414Smckusick 	 */
71238414Smckusick 	if (*++p != 0) {
71338414Smckusick 		len = nfsm_rndup(fxdr_unsigned(long, *p));
71438414Smckusick 		nfsm_adv(len);
71538414Smckusick 	}
71638414Smckusick 	nfsm_disect(p, u_long *, NFSX_UNSIGNED);
71738414Smckusick 	/* 0 == ok */
71838414Smckusick 	if (*p == 0) {
71938414Smckusick 		nfsm_disect(p, u_long *, NFSX_UNSIGNED);
72038414Smckusick 		if (*p != 0) {
72138414Smckusick 			error = fxdr_unsigned(int, *p);
72238414Smckusick 			m_freem(mrep);
72338414Smckusick 			return (error);
72438414Smckusick 		}
72538414Smckusick 		*mrp = mrep;
72638414Smckusick 		*mdp = md;
72738414Smckusick 		*dposp = dpos;
72838414Smckusick 		return (0);
72938414Smckusick 	}
73038414Smckusick 	m_freem(mrep);
73138414Smckusick 	return (EPROTONOSUPPORT);
73238414Smckusick nfsmout:
73338414Smckusick 	return (error);
73438414Smckusick }
73538414Smckusick 
/*
 * Get a request for the server main loop
 * - receive a request via nfs_dgreceive()
 * - verify it
 * - fill in the cred struct.
 */
74239754Smckusick nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr,
74339754Smckusick 	   msk, mtch)
74438414Smckusick 	struct socket *so;
74538414Smckusick 	u_long prog;
74638414Smckusick 	u_long vers;
74738414Smckusick 	int maxproc;
74838414Smckusick 	struct mbuf **nam;
74938414Smckusick 	struct mbuf **mrp;
75038414Smckusick 	struct mbuf **mdp;
75138414Smckusick 	caddr_t *dposp;
75238414Smckusick 	u_long *retxid;
75338414Smckusick 	u_long *proc;
75438414Smckusick 	register struct ucred *cr;
75539754Smckusick 	u_long msk;
75639754Smckusick 	u_long mtch;
75738414Smckusick {
75838414Smckusick 	register int i;
75939494Smckusick 	register u_long *p;
76039494Smckusick 	register long t1;
76139494Smckusick 	caddr_t dpos, cp2;
76239494Smckusick 	int error = 0;
76339494Smckusick 	struct mbuf *mrep, *md;
76439494Smckusick 	int len;
76538414Smckusick 
76640117Smckusick 	if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep))
76738414Smckusick 		return (error);
76838414Smckusick 	md = mrep;
76938414Smckusick 	dpos = mtod(mrep, caddr_t);
77038414Smckusick 	nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED);
77138414Smckusick 	*retxid = *p++;
77238414Smckusick 	if (*p++ != rpc_call) {
77338414Smckusick 		m_freem(mrep);
77438414Smckusick 		return (ERPCMISMATCH);
77538414Smckusick 	}
77638414Smckusick 	if (*p++ != rpc_vers) {
77738414Smckusick 		m_freem(mrep);
77838414Smckusick 		return (ERPCMISMATCH);
77938414Smckusick 	}
78038414Smckusick 	if (*p++ != prog) {
78138414Smckusick 		m_freem(mrep);
78238414Smckusick 		return (EPROGUNAVAIL);
78338414Smckusick 	}
78438414Smckusick 	if (*p++ != vers) {
78538414Smckusick 		m_freem(mrep);
78638414Smckusick 		return (EPROGMISMATCH);
78738414Smckusick 	}
78838414Smckusick 	*proc = fxdr_unsigned(u_long, *p++);
78938414Smckusick 	if (*proc == NFSPROC_NULL) {
79038414Smckusick 		*mrp = mrep;
79138414Smckusick 		return (0);
79238414Smckusick 	}
79338414Smckusick 	if (*proc > maxproc || *p++ != rpc_auth_unix) {
79438414Smckusick 		m_freem(mrep);
79538414Smckusick 		return (EPROCUNAVAIL);
79638414Smckusick 	}
79739494Smckusick 	(void) fxdr_unsigned(int, *p++);
79839494Smckusick 	len = fxdr_unsigned(int, *++p);
79939494Smckusick 	nfsm_adv(nfsm_rndup(len));
80038414Smckusick 	nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED);
80138414Smckusick 	cr->cr_uid = fxdr_unsigned(uid_t, *p++);
80238414Smckusick 	cr->cr_gid = fxdr_unsigned(gid_t, *p++);
80339494Smckusick 	len = fxdr_unsigned(int, *p);
80439494Smckusick 	if (len > 10) {
80538414Smckusick 		m_freem(mrep);
80638414Smckusick 		return (EBADRPC);
80738414Smckusick 	}
80839494Smckusick 	nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED);
80939494Smckusick 	for (i = 1; i <= len; i++)
81038414Smckusick 		cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++);
81139494Smckusick 	cr->cr_ngroups = len + 1;
81238414Smckusick 	/*
81338414Smckusick 	 * Do we have any use for the verifier.
81438414Smckusick 	 * According to the "Remote Procedure Call Protocol Spec." it
81538414Smckusick 	 * should be AUTH_NULL, but some clients make it AUTH_UNIX?
81638414Smckusick 	 * For now, just skip over it
81738414Smckusick 	 */
81839494Smckusick 	len = fxdr_unsigned(int, *++p);
81939494Smckusick 	if (len > 0)
82039494Smckusick 		nfsm_adv(nfsm_rndup(len));
82138414Smckusick 	*mrp = mrep;
82238414Smckusick 	*mdp = md;
82338414Smckusick 	*dposp = dpos;
82438414Smckusick 	return (0);
82538414Smckusick nfsmout:
82638414Smckusick 	return (error);
82738414Smckusick }
82838414Smckusick 
82938414Smckusick /*
83038414Smckusick  * Generate the rpc reply header
83138414Smckusick  * siz arg. is used to decide if adding a cluster is worthwhile
83238414Smckusick  */
83338414Smckusick nfs_rephead(siz, retxid, err, mrq, mbp, bposp)
83438414Smckusick 	int siz;
83538414Smckusick 	u_long retxid;
83638414Smckusick 	int err;
83738414Smckusick 	struct mbuf **mrq;
83838414Smckusick 	struct mbuf **mbp;
83938414Smckusick 	caddr_t *bposp;
84038414Smckusick {
84139494Smckusick 	register u_long *p;
84239494Smckusick 	register long t1;
84339494Smckusick 	caddr_t bpos;
84439494Smckusick 	struct mbuf *mreq, *mb, *mb2;
84538414Smckusick 
84638414Smckusick 	NFSMGETHDR(mreq);
84738414Smckusick 	mb = mreq;
84838414Smckusick 	if ((siz+RPC_REPLYSIZ) > MHLEN)
84938414Smckusick 		NFSMCLGET(mreq, M_WAIT);
85038414Smckusick 	p = mtod(mreq, u_long *);
85138414Smckusick 	mreq->m_len = 6*NFSX_UNSIGNED;
85238414Smckusick 	bpos = ((caddr_t)p)+mreq->m_len;
85338414Smckusick 	*p++ = retxid;
85438414Smckusick 	*p++ = rpc_reply;
85538414Smckusick 	if (err == ERPCMISMATCH) {
85638414Smckusick 		*p++ = rpc_msgdenied;
85738414Smckusick 		*p++ = rpc_mismatch;
85838414Smckusick 		*p++ = txdr_unsigned(2);
85938414Smckusick 		*p = txdr_unsigned(2);
86038414Smckusick 	} else {
86138414Smckusick 		*p++ = rpc_msgaccepted;
86238414Smckusick 		*p++ = 0;
86338414Smckusick 		*p++ = 0;
86438414Smckusick 		switch (err) {
86538414Smckusick 		case EPROGUNAVAIL:
86638414Smckusick 			*p = txdr_unsigned(RPC_PROGUNAVAIL);
86738414Smckusick 			break;
86838414Smckusick 		case EPROGMISMATCH:
86938414Smckusick 			*p = txdr_unsigned(RPC_PROGMISMATCH);
87038414Smckusick 			nfsm_build(p, u_long *, 2*NFSX_UNSIGNED);
87138414Smckusick 			*p++ = txdr_unsigned(2);
87238414Smckusick 			*p = txdr_unsigned(2);	/* someday 3 */
87338414Smckusick 			break;
87438414Smckusick 		case EPROCUNAVAIL:
87538414Smckusick 			*p = txdr_unsigned(RPC_PROCUNAVAIL);
87638414Smckusick 			break;
87738414Smckusick 		default:
87838414Smckusick 			*p = 0;
87938414Smckusick 			if (err != VNOVAL) {
88038414Smckusick 				nfsm_build(p, u_long *, NFSX_UNSIGNED);
88138414Smckusick 				*p = txdr_unsigned(err);
88238414Smckusick 			}
88338414Smckusick 			break;
88438414Smckusick 		};
88538414Smckusick 	}
88638414Smckusick 	*mrq = mreq;
88738414Smckusick 	*mbp = mb;
88838414Smckusick 	*bposp = bpos;
88938414Smckusick 	if (err != 0 && err != VNOVAL)
89038414Smckusick 		nfsstats.srvrpc_errs++;
89138414Smckusick 	return (0);
89238414Smckusick }
89338414Smckusick 
89438414Smckusick /*
89538414Smckusick  * Nfs timer routine
89638414Smckusick  * Scan the nfsreq list and retranmit any requests that have timed out
89738414Smckusick  * To avoid retransmission attempts on STREAM sockets (in the future) make
89840117Smckusick  * sure to set the r_retry field to 0 (implies nm_retry == 0).
89938414Smckusick  */
90038414Smckusick nfs_timer()
90138414Smckusick {
90238414Smckusick 	register struct nfsreq *rep;
90338414Smckusick 	register struct mbuf *m;
90438414Smckusick 	register struct socket *so;
90540117Smckusick 	register struct nfsmount *mntp;
90640117Smckusick 	int s, error;
90738414Smckusick 
90838414Smckusick 	s = splnet();
90938414Smckusick 	rep = nfsreqh.r_next;
91040117Smckusick 	if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) {
91140117Smckusick 		mntp = rep->r_mntp;
91240117Smckusick 		if (rep->r_flags & R_TIMING)	/* update rtt in mount */
91340117Smckusick 			mntp->nm_rtt++;
91440117Smckusick 		/* If not timed out or reply already received, skip */
91540117Smckusick 		if (++rep->r_timer < mntp->nm_rto || rep->r_mrep)
91640117Smckusick 			continue;
91740117Smckusick 		/* Do backoff and save new timeout in mount */
91840117Smckusick 		if (rep->r_flags & R_TIMING) {
91940117Smckusick 			nfs_backofftimer(mntp);
92040117Smckusick 			rep->r_flags &= ~R_TIMING;
92140117Smckusick 			mntp->nm_rtt = -1;
92240117Smckusick 		}
92340117Smckusick 		if (rep->r_flags & R_SENT) {
92440117Smckusick 			rep->r_flags &= ~R_SENT;
92540117Smckusick 			--mntp->nm_hostinfo->nh_sent;
92640117Smckusick 		}
92740117Smckusick 		/* Check state of socket, cf nfs_send */
92840117Smckusick 		so = mntp->nm_so;
92940117Smckusick 		if (error = nfs_sockerr(so, 1))
93040117Smckusick 			goto wakeup;
93140117Smckusick 		if (sbspace(&so->so_snd) < rep->r_msiz)
93240117Smckusick 			goto wakeup;
93340117Smckusick 		/* Check for too many retries, cf nfs_dgreply */
93440117Smckusick 		if (++rep->r_rexmit > NFS_MAXREXMIT)	/* clip */
93540117Smckusick 			rep->r_rexmit = NFS_MAXREXMIT;
93640117Smckusick 		if (rep->r_rexmit > rep->r_retry)	/* too many */
93740117Smckusick 			goto wakeup;
93840117Smckusick 		/* Check for congestion control, cf nfs_request */
93940117Smckusick 		if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)
94040117Smckusick 			goto wakeup;
94140117Smckusick 		/* Send it! */
94240117Smckusick 		m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT);
94340117Smckusick 		if (m == NULL)
94440117Smckusick 			goto wakeup;
94540117Smckusick 		nfsstats.rpcretries++;
94638414Smckusick #ifdef MGETHDR
94740117Smckusick 		m->m_pkthdr.len = rep->r_msiz;
94838414Smckusick #endif
94940117Smckusick 		(void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
95040327Ssklower 			(struct mbuf *)0, (struct mbuf *)0);
95140117Smckusick 
95240117Smckusick 		/* We need to time the request even though we're
95340117Smckusick 		 * retransmitting, in order to maintain backoff. */
95440117Smckusick 		mntp->nm_rtt = 0;
95540117Smckusick 		++mntp->nm_hostinfo->nh_sent;
95640117Smckusick 		rep->r_flags |= (R_SENT|R_TIMING);
95740117Smckusick 		rep->r_timer = rep->r_timerinit;
95840117Smckusick wakeup:
95940117Smckusick 		/* If error or interruptible mount, give user a look */
96040117Smckusick 		if (error || (mntp->nm_flag & NFSMNT_INT))
96140117Smckusick 			sorwakeup(so);
96240117Smckusick 	}
96340117Smckusick 	splx(s);
96440117Smckusick 	timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ);
96540117Smckusick }
96640117Smckusick 
96740117Smckusick /*
96840117Smckusick  * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is
96940117Smckusick  * used here. The timer state is held in the nfsmount structure and
97040117Smckusick  * a single request is used to clock the response. When successful
97140117Smckusick  * the rtt smoothing in nfs_updatetimer is used, when failed the backoff
97240117Smckusick  * is done by nfs_backofftimer. We also log failure messages in these
97340117Smckusick  * routines.
97440117Smckusick  *
97540117Smckusick  * Congestion variables are held in the nfshost structure which
97640117Smckusick  * is referenced by nfsmounts and shared per-server. This separation
97740117Smckusick  * makes it possible to do per-mount timing which allows varying disk
97840117Smckusick  * access times to be dealt with, while preserving a network oriented
97940117Smckusick  * congestion control scheme.
98040117Smckusick  *
98140117Smckusick  * The windowing implements the Jacobson/Karels slowstart algorithm
98240117Smckusick  * with adjusted scaling factors. We start with one request, then send
98340117Smckusick  * 4 more after each success until the ssthresh limit is reached, then
98440117Smckusick  * we increment at a rate proportional to the window. On failure, we
98540117Smckusick  * remember 3/4 the current window and clamp the send limit to 1. Note
98640117Smckusick  * ICMP source quench is not reflected in so->so_error so we ignore that
98740117Smckusick  * for now.
98840117Smckusick  *
98940117Smckusick  * NFS behaves much more like a transport protocol with these changes,
99040117Smckusick  * shedding the teenage pedal-to-the-metal tendencies of "other"
99140117Smckusick  * implementations.
99240117Smckusick  *
99340117Smckusick  * Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
99440117Smckusick  */
99540117Smckusick 
/*
 * Retransmission timeout estimate for a mount: the variance term
 * nm_rttvar as stored, plus one eighth of nm_srtt.
 *
 * The TCP algorithm was not forgiving enough. Because the NFS server
 * responds only after performing lookups/diskio/etc, we have to be
 * more prepared to accept a spiky variance. The TCP algorithm is:
 * TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1)
 */
#define NFS_RTO(mntp)	((mntp)->nm_rttvar + ((mntp)->nm_srtt >> 3))
100340117Smckusick 
100440117Smckusick nfs_updatetimer(mntp)
100540117Smckusick 	register struct nfsmount *mntp;
100640117Smckusick {
100740117Smckusick 	register struct nfshost *nfshp = mntp->nm_hostinfo;
100840117Smckusick 
100940117Smckusick 	/* If retransmitted, clear and return */
101040117Smckusick 	if (mntp->nm_rexmit || nfshp->nh_currexmit) {
101140117Smckusick 		if (nfshp->nh_currexmit >= nfsrexmtthresh)
101240351Smckusick 			nfs_log("NFS server %s OK\n",
101340351Smckusick 				mntp->nm_mountp->m_stat.f_mntfromname);
101440117Smckusick 		mntp->nm_rexmit = nfshp->nh_currexmit = 0;
101540117Smckusick 		return;
101640117Smckusick 	}
101740117Smckusick 	/* If have a measurement, do smoothing */
101840117Smckusick 	if (mntp->nm_srtt) {
101940117Smckusick 		register short delta;
102040117Smckusick 		delta = mntp->nm_rtt - (mntp->nm_srtt >> 3);
102140117Smckusick 		if ((mntp->nm_srtt += delta) <= 0)
102240117Smckusick 			mntp->nm_srtt = 1;
102340117Smckusick 		if (delta < 0)
102440117Smckusick 			delta = -delta;
102540117Smckusick 		delta -= (mntp->nm_rttvar >> 2);
102640117Smckusick 		if ((mntp->nm_rttvar += delta) <= 0)
102740117Smckusick 			mntp->nm_rttvar = 1;
102840117Smckusick 	/* Else initialize */
102940117Smckusick 	} else {
103040117Smckusick 		mntp->nm_rttvar = mntp->nm_rtt << 1;
103140117Smckusick 		if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2;
103240117Smckusick 		mntp->nm_srtt = mntp->nm_rttvar << 2;
103340117Smckusick 	}
103440117Smckusick 	/* Compute new Retransmission TimeOut and clip */
103540117Smckusick 	mntp->nm_rto = NFS_RTO(mntp);
103640117Smckusick 	if (mntp->nm_rto < NFS_MINTIMEO)
103740117Smckusick 		mntp->nm_rto = NFS_MINTIMEO;
103840117Smckusick 	else if (mntp->nm_rto > NFS_MAXTIMEO)
103940117Smckusick 		mntp->nm_rto = NFS_MAXTIMEO;
104040117Smckusick 	nfshp->nh_currto = mntp->nm_rto;
104140117Smckusick 
104240117Smckusick 	/* Update window estimate */
104340117Smckusick 	if (nfshp->nh_window < nfshp->nh_ssthresh)	/* quickly */
104440117Smckusick 		nfshp->nh_window += 4;
104540117Smckusick 	else {						/* slowly */
104640117Smckusick 		register long incr = ++nfshp->nh_winext;
104740117Smckusick 		incr = (incr * incr) / nfshp->nh_window;
104840117Smckusick 		if (incr > 0) {
104940117Smckusick 			nfshp->nh_winext = 0;
105040117Smckusick 			++nfshp->nh_window;
105140117Smckusick 		}
105240117Smckusick 	}
105340117Smckusick 	if (nfshp->nh_window > NFS_MAXWINDOW)
105440117Smckusick 		nfshp->nh_window = NFS_MAXWINDOW;
105540117Smckusick }
105640117Smckusick 
105740117Smckusick nfs_backofftimer(mntp)
105840117Smckusick 	register struct nfsmount *mntp;
105940117Smckusick {
106040117Smckusick 	register struct nfshost *nfshp = mntp->nm_hostinfo;
106140117Smckusick 	register unsigned long newrto;
106240117Smckusick 
106340117Smckusick 	/* Clip shift count */
106440117Smckusick 	if (++mntp->nm_rexmit > 8 * sizeof mntp->nm_rto)
106540117Smckusick 		mntp->nm_rexmit = 8 * sizeof mntp->nm_rto;
106640117Smckusick 	/* Back off RTO exponentially */
106740117Smckusick 	newrto = NFS_RTO(mntp);
106840117Smckusick 	newrto <<= (mntp->nm_rexmit - 1);
106940117Smckusick 	if (newrto == 0 || newrto > NFS_MAXTIMEO)
107040117Smckusick 		newrto = NFS_MAXTIMEO;
107140117Smckusick 	mntp->nm_rto = nfshp->nh_currto = newrto;
107240117Smckusick 
107340117Smckusick 	/* If too many retries, message, assume a bogus RTT and re-measure */
107440117Smckusick 	if (nfshp->nh_currexmit < mntp->nm_rexmit) {
107540117Smckusick 		nfshp->nh_currexmit = mntp->nm_rexmit;
107640117Smckusick 		if (nfshp->nh_currexmit >= nfsrexmtthresh) {
107740117Smckusick 			if (nfshp->nh_currexmit == nfsrexmtthresh) {
107840117Smckusick 				nfs_log("NFS server %s not responding\n",
107940351Smckusick 					mntp->nm_mountp->m_stat.f_mntfromname);
108040117Smckusick 				mntp->nm_rttvar += (mntp->nm_srtt >> 2);
108140117Smckusick 				mntp->nm_srtt = 0;
108238414Smckusick 			}
108340117Smckusick 			/* The routing invalidation should be a usrreq PRU */
108440117Smckusick 			if (mtod(nfshp->nh_sockaddr,
108540117Smckusick 				struct sockaddr *)->sa_family == AF_INET)
108640117Smckusick 				in_losing(mntp->nm_so->so_pcb);
108738414Smckusick 		}
108838414Smckusick 	}
108940117Smckusick 	/* Close down window but remember this point (3/4 current) for later */
109040117Smckusick 	nfshp->nh_ssthresh = ((nfshp->nh_window << 1) + nfshp->nh_window) >> 2;
109140117Smckusick 	nfshp->nh_window = 1;
109240117Smckusick 	nfshp->nh_winext = 0;
109338414Smckusick }
109438414Smckusick 
109538414Smckusick /*
109640117Smckusick  * Not all errors are fatal. The closed checks deal
109740117Smckusick  * with errors a little strangely.
109838414Smckusick  */
109940117Smckusick 
110040117Smckusick nfs_sockerr(so, sending)
110140117Smckusick 	struct socket *so;
110240117Smckusick 	int sending;
110338414Smckusick {
110440117Smckusick 	if (sending && (so->so_state & SS_CANTSENDMORE)) {
110540117Smckusick 		so->so_error = EPIPE;
110640117Smckusick 		return (EPIPE);
110740117Smckusick 	}
110840117Smckusick 
110940117Smckusick 	switch (so->so_error) {			/* inhibit certain errors */
111040117Smckusick 	case ENETDOWN:
111140117Smckusick 	case ENETUNREACH:
111240117Smckusick 	case EHOSTDOWN:
111340117Smckusick 	case EHOSTUNREACH:
111440117Smckusick 		so->so_error = 0;
111540117Smckusick 	case 0:
111640117Smckusick 		break;
111740117Smckusick 	default:				/* return all others */
111840117Smckusick 		printf("nfs_sockerr: error %d on %s\n", so->so_error,
111940117Smckusick 			sending?"send":"receive");
112040117Smckusick 		return (so->so_error);
112140117Smckusick 	}
112240117Smckusick 
112340117Smckusick 	if (!sending && (so->so_state & SS_CANTRCVMORE)) {
112440117Smckusick 		so->so_error = 0;		/* (no error) */
112540117Smckusick 		return (EPIPE);
112640117Smckusick 	}
112740117Smckusick 	return (so->so_error);
112838414Smckusick }
1129