xref: /csrg-svn/sys/nfs/nfs_socket.c (revision 40761)
138414Smckusick /*
238414Smckusick  * Copyright (c) 1989 The Regents of the University of California.
338414Smckusick  * All rights reserved.
438414Smckusick  *
538414Smckusick  * This code is derived from software contributed to Berkeley by
638414Smckusick  * Rick Macklem at The University of Guelph.
738414Smckusick  *
838414Smckusick  * Redistribution and use in source and binary forms are permitted
938414Smckusick  * provided that the above copyright notice and this paragraph are
1038414Smckusick  * duplicated in all such forms and that any documentation,
1138414Smckusick  * advertising materials, and other materials related to such
1238414Smckusick  * distribution and use acknowledge that the software was developed
1338414Smckusick  * by the University of California, Berkeley.  The name of the
1438414Smckusick  * University may not be used to endorse or promote products derived
1538414Smckusick  * from this software without specific prior written permission.
1638414Smckusick  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
1738414Smckusick  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
1838414Smckusick  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
1938414Smckusick  *
20*40761Skarels  *	@(#)nfs_socket.c	7.10 (Berkeley) 04/04/90
2138414Smckusick  */
2238414Smckusick 
2338414Smckusick /*
2438414Smckusick  * Socket operations for use by nfs (similar to uipc_socket.c, but never
2538414Smckusick  * with copies to/from a uio vector)
2640117Smckusick  * NB: For now, they only work for datagram sockets.
2738414Smckusick  * (Use on stream sockets would require some record boundary mark in the
2839754Smckusick  *  stream, as defined by "RPC: Remote Procedure Call Protocol
2939754Smckusick  *  Specification" RFC1057 Section 10,
3038414Smckusick  *  and different versions of send, receive and reply that do not assume
3138414Smckusick  *  an atomic protocol.)
3238414Smckusick  */
3338414Smckusick 
3438414Smckusick #include "types.h"
3538414Smckusick #include "param.h"
3638414Smckusick #include "uio.h"
3738414Smckusick #include "user.h"
3840117Smckusick #include "proc.h"
3940117Smckusick #include "signal.h"
4038414Smckusick #include "mount.h"
4138414Smckusick #include "kernel.h"
4238414Smckusick #include "malloc.h"
4338414Smckusick #include "mbuf.h"
4438414Smckusick #include "vnode.h"
4538414Smckusick #include "domain.h"
4638414Smckusick #include "protosw.h"
4738414Smckusick #include "socket.h"
4838414Smckusick #include "socketvar.h"
4938414Smckusick #include "rpcv2.h"
5038414Smckusick #include "nfsv2.h"
5138414Smckusick #include "nfs.h"
5238414Smckusick #include "xdr_subs.h"
5338414Smckusick #include "nfsm_subs.h"
5438414Smckusick #include "nfsmount.h"
5538414Smckusick 
5640117Smckusick #include "syslog.h"
/* nfs_log(): hand "message" (used as the format) and "host" (its one argument) to log() at LOG_ERR. */
5740117Smckusick #define nfs_log(message, host)	log(LOG_ERR, message, host)
5840117Smckusick 
/* Local definition of a true value. */
5938414Smckusick #define	TRUE	1
6038414Smckusick 
6138414Smckusick /*
 * nfs_sblock(sb): set lock on sockbuf sb, sleep at neg prio.
 *
 * While SB_LOCK is already set in sb->sb_flags, SB_WANT is set and the
 * caller sleeps on the address of sb_flags at priority PZERO-1, then
 * re-tests.  Once SB_LOCK is clear, it is set for the caller.
 *
 * NOTE(review): this expands to a bare { } block rather than
 * do { } while (0), so "nfs_sblock(x);" placed directly before an "else"
 * would not parse.  The uses visible in this file are all standalone
 * statements.
 */
6238414Smckusick #define nfs_sblock(sb) { \
6338414Smckusick 	while ((sb)->sb_flags & SB_LOCK) { \
6438414Smckusick 		(sb)->sb_flags |= SB_WANT; \
6538414Smckusick 		sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \
6638414Smckusick 	} \
6738414Smckusick 	(sb)->sb_flags |= SB_LOCK; \
6838414Smckusick }
6940117Smckusick /*
7040117Smckusick  * nfs_sbwait() is simply sbwait() but at a negative priority so that it
7140117Smckusick  * can not be interrupted by a signal.
 *
 * sb: the sockbuf to wait on.  SB_WAIT is set in sb->sb_flags and the
 *     caller sleeps on the address of sb->sb_cc at priority PZERO-2.
 * No value is returned explicitly.
7240117Smckusick  */
7340117Smckusick nfs_sbwait(sb)
7440117Smckusick 	struct sockbuf *sb;
7540117Smckusick {
7640117Smckusick 	sb->sb_flags |= SB_WAIT;	/* mark that someone is waiting on this sockbuf */
7740117Smckusick 	sleep((caddr_t)&sb->sb_cc, PZERO-2);
7840117Smckusick }
7938414Smckusick 
8038414Smckusick /*
8138414Smckusick  * External data, mostly RPC constants in XDR form
 * (defined outside this file).  In this file they are compared directly
 * against words taken from received RPC headers.
8238414Smckusick  */
8338414Smckusick extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
8438414Smckusick 	rpc_msgaccepted, rpc_call;
8538414Smckusick extern u_long nfs_prog, nfs_vers;
/*
 * Forward declarations of the server procedure handlers that populate
 * the nfsrv_procs[] table below; their definitions are not in this file.
 */
8638414Smckusick int	nfsrv_null(),
8738414Smckusick 	nfsrv_getattr(),
8838414Smckusick 	nfsrv_setattr(),
8938414Smckusick 	nfsrv_lookup(),
9038414Smckusick 	nfsrv_readlink(),
9138414Smckusick 	nfsrv_read(),
9238414Smckusick 	nfsrv_write(),
9338414Smckusick 	nfsrv_create(),
9438414Smckusick 	nfsrv_remove(),
9538414Smckusick 	nfsrv_rename(),
9638414Smckusick 	nfsrv_link(),
9738414Smckusick 	nfsrv_symlink(),
9838414Smckusick 	nfsrv_mkdir(),
9938414Smckusick 	nfsrv_rmdir(),
10038414Smckusick 	nfsrv_readdir(),
10138414Smckusick 	nfsrv_statfs(),
10238414Smckusick 	nfsrv_noop();
10338414Smckusick 
/*
 * Table of server procedure handlers, NFS_NPROCS entries long.
 * Presumably indexed by the NFS procedure number taken from the request
 * -- confirm against the server dispatch code, which is not in this file.
 * Slots 3 and 7 are filled with nfsrv_noop; NOTE(review): these look like
 * the obsolete ROOT and WRITECACHE procedures of NFS version 2 -- confirm
 * against nfsv2.h.
 */
10438414Smckusick int (*nfsrv_procs[NFS_NPROCS])() = {
10538414Smckusick 	nfsrv_null,
10638414Smckusick 	nfsrv_getattr,
10738414Smckusick 	nfsrv_setattr,
10838414Smckusick 	nfsrv_noop,	/* slot 3: no-op handler */
10938414Smckusick 	nfsrv_lookup,
11038414Smckusick 	nfsrv_readlink,
11138414Smckusick 	nfsrv_read,
11238414Smckusick 	nfsrv_noop,	/* slot 7: no-op handler */
11338414Smckusick 	nfsrv_write,
11438414Smckusick 	nfsrv_create,
11538414Smckusick 	nfsrv_remove,
11638414Smckusick 	nfsrv_rename,
11738414Smckusick 	nfsrv_link,
11838414Smckusick 	nfsrv_symlink,
11938414Smckusick 	nfsrv_mkdir,
12038414Smckusick 	nfsrv_rmdir,
12138414Smckusick 	nfsrv_readdir,
12238414Smckusick 	nfsrv_statfs,
12338414Smckusick };
12438414Smckusick 
/* Head of the doubly linked list of per-server nfshost entries (nh_next/nh_prev); maintained by nfs_connect() and nfs_disconnect(). */
12540117Smckusick struct nfshost *nfshosth;
/* List head for outstanding requests (r_next/r_prev); nfs_request() chains onto it and nfs_dgreply() walks it until it comes back to &nfsreqh. */
12640117Smckusick struct nfsreq nfsreqh;
/* Value of nm_rexmit at or above which nfs_dgreply() prints its "not responding" message. */
12740117Smckusick int nfsrexmtthresh = NFS_FISHY;
12838414Smckusick 
12938414Smckusick /*
13040117Smckusick  * Initialize sockets and per-host congestion for a new NFS connection.
13140117Smckusick  * We do not free the sockaddr if error.
 *
 * nmp:   mount being connected; nm_so and nm_hostinfo are filled in, and
 *        nm_rto/nm_rttvar are seeded from the host entry when nm_rto is
 *        still NFS_TIMEO.
 * saddr: mbuf holding the server's sockaddr.  On success it has either
 *        been freed or stored in the host entry (nh_sockaddr); every
 *        error return happens before it is touched.
 * Returns 0 on success; otherwise the error from socreate(), sobind(),
 * soconnect() or soreserve() (or ENOBUFS), after closing and zeroing
 * nm_so.
13238414Smckusick  */
13340117Smckusick nfs_connect(nmp, saddr)
13440117Smckusick 	register struct nfsmount *nmp;
13540117Smckusick 	struct mbuf *saddr;
13640117Smckusick {
13740117Smckusick 	int s, error, srvaddrlen;
13840117Smckusick 	struct mbuf *m;
13940117Smckusick 	register struct nfshost *nfshp;
14040117Smckusick 
	/* Cleared first so the "bad" path can tell whether a socket exists */
14140117Smckusick 	nmp->nm_so = 0;
14240117Smckusick 	if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family,
14340117Smckusick 				&nmp->nm_so, SOCK_DGRAM, 0))
14440117Smckusick 		goto bad;
14540117Smckusick 
14640117Smckusick 	/* Unix sockets do not provide a local bind for server reply */
14740117Smckusick 	if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) {
14840117Smckusick 		struct sockaddr *sa;
		/* Template path; the two '#' (offsets 19 and 20) become digits */
14940117Smckusick 		static char client[] = "/tmp/.nfs/nfsclient##";
		/* Last serial number tried; static, so it persists across calls */
15040117Smckusick 		static int serial;
15140117Smckusick 		int firstserial;
15240117Smckusick 		m = m_getclr(M_WAIT, MT_SONAME);
15340117Smckusick 		if (m == NULL) {
15440117Smckusick 			error = ENOBUFS;
15540117Smckusick 			goto bad;
15640117Smckusick 		}
		/*
		 * Path including its NUL, plus 2 bytes -- presumably for the
		 * sockaddr header preceding sa_data; confirm.
		 */
15740117Smckusick 		m->m_len = sizeof (client) + 2;
15840117Smckusick 		sa = mtod(m, struct sockaddr *);
15940117Smckusick 		sa->sa_family = AF_UNIX;
16040117Smckusick #ifdef	MSG_TRUNC	/* Have sa_len to set? */
16140117Smckusick 		sa->sa_len = m->m_len;
16240117Smckusick #endif
16340117Smckusick 		bcopy(client, sa->sa_data, sizeof(client));
		/*
		 * Try successive two-digit serials (00-99) until sobind()
		 * stops returning EADDRINUSE, or until we have wrapped back
		 * to the serial we started from.
		 */
16440117Smckusick 		firstserial = serial;
16540117Smckusick 		do {
16640117Smckusick 			if (++serial >= 100) serial = 0;
16740117Smckusick 			sa->sa_data[19] = (serial / 10) + '0';
16840117Smckusick 			sa->sa_data[20] = (serial % 10) + '0';
16940117Smckusick 			error = sobind(nmp->nm_so, m);
17040117Smckusick 			if (firstserial == serial) break;
17140117Smckusick 		} while (error == EADDRINUSE);
17240117Smckusick 		m_freem(m);
17340117Smckusick 		if (error)
17440117Smckusick 			goto bad;
17540117Smckusick 	}
17640117Smckusick 
17740117Smckusick 	if (error = soconnect(nmp->nm_so, saddr))
17840117Smckusick 		goto bad;
17940117Smckusick 	error = soreserve(nmp->nm_so,	/* get space ! */
18040117Smckusick 				nmp->nm_wsize + 1024,		/* one out */
18140117Smckusick 				(nmp->nm_rsize + 1024) * 4);	/* four in */
18240117Smckusick 	if (error)
18340117Smckusick 		goto bad;
18440117Smckusick 
18540117Smckusick 	/*
18640117Smckusick 	 * Search mount list for existing server entry.
18740117Smckusick 	 *
18840117Smckusick 	 * Note, even though we have a sockaddr, it is not quite reliable
18940117Smckusick 	 * enough to bcmp against. For instance, a sockaddr_in has a
19040117Smckusick 	 * sin_zero field which is not reliably zeroed by user code (e.g.
19140117Smckusick 	 * mount). So what we do as an attempt at transport independence
19240117Smckusick 	 * is to get the peeraddr of our connected socket into a zeroed
19340117Smckusick 	 * sockaddr. Then we cache that and compare against it. This is
19440117Smckusick 	 * not exactly perfect. However it is not critical that it be, if
19540117Smckusick 	 * we cannot match the sockaddr we will simply allocate a new nfshp
19640117Smckusick 	 * per mount, which will disable the per-host congestion but
19740117Smckusick 	 * everything else will work as normal.
19840117Smckusick 	 */
19940117Smckusick 	m = m_getclr(M_WAIT, MT_SONAME);
	/*
	 * If the peer address was obtained, it replaces the caller's saddr
	 * (which is freed); otherwise the caller's saddr is kept as is.
	 */
20040117Smckusick 	if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR,
20140117Smckusick 				(struct mbuf *)0, m, (struct mbuf *)0) == 0) {
20240117Smckusick 		m_freem(saddr);
20340117Smckusick 		saddr = m;
20440117Smckusick 	} else
20540117Smckusick 		m_freem(m);
20640117Smckusick 	srvaddrlen = saddr->m_len;
20740117Smckusick 
	/* The host list is searched and modified at splnet */
20840117Smckusick 	s = splnet();
20940117Smckusick 
	/* Match on address length first, then on the address bytes */
21040117Smckusick 	for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) {
21140117Smckusick 		if (srvaddrlen != nfshp->nh_salen)
21240117Smckusick 			continue;
21340117Smckusick 		if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t),
21440117Smckusick 				srvaddrlen))
21540117Smckusick 			break;
21640117Smckusick 	}
21740117Smckusick 	if (nfshp)		/* Have an existing mount host */
21840117Smckusick 		m_freem(saddr);
21940117Smckusick 	else {
		/* No match: build a zeroed entry that takes ownership of saddr */
22040117Smckusick 		MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK);
22140117Smckusick 		bzero((caddr_t)nfshp, sizeof *nfshp);
22240117Smckusick 		nfshp->nh_sockaddr = saddr;
22340117Smckusick 		nfshp->nh_salen = srvaddrlen;
22440117Smckusick 		/* Initialize other non-zero congestion variables */
22540117Smckusick 		nfshp->nh_currto = NFS_TIMEO;
22640117Smckusick 		nfshp->nh_window = 1;		    /* Initial send window */
22740117Smckusick 		nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */
		/* Insert at the head of the nfshosth list */
22840117Smckusick 		if (nfshosth) nfshosth->nh_prev = nfshp;	/* Chain in */
22940117Smckusick 		nfshp->nh_next = nfshosth;
23040117Smckusick 		nfshosth = nfshp;
23140117Smckusick 	}
	/* One reference per mount using this host entry */
23240117Smckusick 	nfshp->nh_refcnt++;
23340117Smckusick 	splx(s);
23440117Smckusick 	nmp->nm_hostinfo = nfshp;
	/* If nm_rto is still NFS_TIMEO, take the host's current timeout */
23540117Smckusick 	if (nmp->nm_rto == NFS_TIMEO) {
23640117Smckusick 		nmp->nm_rto = nfshp->nh_currto;
23740117Smckusick 		nmp->nm_rttvar = nmp->nm_rto << 1;
23840117Smckusick 	}
23940117Smckusick 	return (0);
24040117Smckusick 
	/* Error exit: close the socket if one was created; saddr is untouched */
24140117Smckusick bad:
24240117Smckusick 	if (nmp->nm_so) (void) soclose(nmp->nm_so);
24340117Smckusick 	nmp->nm_so = 0;
24440117Smckusick 	return (error);
24540117Smckusick }
24640117Smckusick 
24740117Smckusick /*
24840117Smckusick  * NFS disconnect. Clean up and unlink.
 *
 * nmp: mount being disconnected.  Its socket (if any) is closed and
 *      nm_so zeroed.  Its reference on the nfshost entry is dropped;
 *      when the count reaches zero the entry is unlinked from the
 *      nfshosth list and freed together with its sockaddr mbuf.
 *      nm_hostinfo is zeroed.
 * No value is returned explicitly.
24940117Smckusick  */
25040117Smckusick nfs_disconnect(nmp)
25140117Smckusick 	register struct nfsmount *nmp;
25240117Smckusick {
25340117Smckusick 	register struct nfshost *nfshp;
25440117Smckusick 
25540117Smckusick 	if (nmp->nm_so)
25640117Smckusick 		soclose(nmp->nm_so);
25740117Smckusick 	nmp->nm_so = 0;
25840117Smckusick 	if (nfshp = nmp->nm_hostinfo) {
		/* Host list manipulation is done at splnet */
25940117Smckusick 		int s = splnet();
26040117Smckusick 		if (--nfshp->nh_refcnt <= 0) {
			/* Last user: unlink from the doubly linked host list */
26140117Smckusick 			if (nfshp->nh_next)
26240117Smckusick 				nfshp->nh_next->nh_prev = nfshp->nh_prev;
26340117Smckusick 			if (nfshp->nh_prev)
26440117Smckusick 				nfshp->nh_prev->nh_next = nfshp->nh_next;
26540117Smckusick 			else
26640117Smckusick 				nfshosth = nfshp->nh_next;
26740117Smckusick 			/* If unix family, remove the nfsclient from /tmp */
			/* NOTE(review): placeholder only -- the branch body is empty, so nothing is removed */
26840117Smckusick 			if (mtod(nfshp->nh_sockaddr,
26940117Smckusick 				struct sockaddr *)->sa_family == AF_UNIX) {
27040117Smckusick 					/* Lookup sa_data, do VOP_REMOVE... */
27140117Smckusick 			}
27240117Smckusick 			m_freem(nfshp->nh_sockaddr);
27340117Smckusick 			FREE(nfshp, M_NFSMNT);
27440117Smckusick 		}
27540117Smckusick 		nmp->nm_hostinfo = 0;
27640117Smckusick 		splx(s);
27740117Smckusick 	}
27840117Smckusick }
27940117Smckusick 
28040117Smckusick /*
28140117Smckusick  * This is a stripped down non-interruptible version of sosend().
 *
 * so:    socket to send on.
 * nam:   address mbuf passed through to the protocol's PRU_SEND request
 *        (nfs_request() passes 0).
 * top:   mbuf chain to send.  It is freed here if nfs_sockerr() reports
 *        an error; otherwise it is passed to PRU_SEND.
 * flags: not referenced in the body.
 * siz:   byte length of the chain; used for the send-space check and,
 *        when MGETHDR is defined, stored in top->m_pkthdr.len.
 * Returns the error from nfs_sockerr() or from the PRU_SEND request.
28240117Smckusick  */
28340117Smckusick nfs_send(so, nam, top, flags, siz)
28438414Smckusick 	register struct socket *so;
28538414Smckusick 	struct mbuf *nam;
28638414Smckusick 	struct mbuf *top;
28738414Smckusick 	int flags;
28838414Smckusick 	int siz;
28938414Smckusick {
29040117Smckusick 	int error, s;
29138414Smckusick 
29238414Smckusick #ifdef MGETHDR
29338414Smckusick 	top->m_pkthdr.len = siz;
29438414Smckusick #endif
	/* Loop until the datagram is sent or the socket reports an error */
29540117Smckusick 	for (;;) {
29640117Smckusick 		nfs_sblock(&so->so_snd);
29740117Smckusick 		s = splnet();
29840117Smckusick 		if (error = nfs_sockerr(so, 1)) {
29940117Smckusick 			splx(s);
30040117Smckusick 			m_freem(top);
30140117Smckusick 			break;
30240117Smckusick 		}
		/* Not enough room: drop the lock, sleep on the send buffer, retry */
30340117Smckusick 		if (sbspace(&so->so_snd) < siz) {
30440117Smckusick 			sbunlock(&so->so_snd);
30540117Smckusick 			nfs_sbwait(&so->so_snd);
30640117Smckusick 			splx(s);
30740117Smckusick 			continue;
30840117Smckusick 		}
30940117Smckusick 		error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top,
31040327Ssklower 			(struct mbuf *)nam, (struct mbuf *)0);
31138414Smckusick 		splx(s);
31240117Smckusick 		break;
31338414Smckusick 	}
	/* Both exits from the loop arrive here still holding the send lock */
31438414Smckusick 	sbunlock(&so->so_snd);
31538414Smckusick 	return (error);
31638414Smckusick }
31738414Smckusick 
31838414Smckusick /*
31940117Smckusick  * This is a stripped down datagram specific version of soreceive()
 *
 * so:    socket to receive from.
 * msk, mtch: not referenced in the body.
 * aname: if non-null, *aname is zeroed on entry and later set to the
 *        MT_SONAME mbuf heading the record (the sender's address);
 *        if null, that mbuf is freed.
 * mp:    *mp is set to the data mbuf chain of the dequeued record.
 *        It is not set on an error return.
 * Returns 0, or the error from sblock(), sbwait() or nfs_sockerr().
32038414Smckusick  */
32140117Smckusick nfs_dgreceive(so, msk, mtch, aname, mp)
32238414Smckusick 	register struct socket *so;
32339754Smckusick 	u_long msk;
32439754Smckusick 	u_long mtch;
32538414Smckusick 	struct mbuf **aname;
32638414Smckusick 	struct mbuf **mp;
32738414Smckusick {
32838414Smckusick 	register struct mbuf *m;
32938414Smckusick 	int s, error = 0;
33038414Smckusick 	struct mbuf *nextrecord;
33138414Smckusick 
33238414Smckusick 	if (aname)
33338414Smckusick 		*aname = 0;
33438414Smckusick 
33540117Smckusick 	for (;;) {
		/* Returns here happen before splnet() is raised in this pass */
336*40761Skarels 		if (error = sblock(&so->so_rcv))
337*40761Skarels 			return (error);
33840117Smckusick 		s = splnet();
33938414Smckusick 
		/* Nothing queued: report a socket error, or unlock and wait */
34040117Smckusick 		if (so->so_rcv.sb_cc == 0) {
34140117Smckusick 			if (error = nfs_sockerr(so, 0)) {
34240117Smckusick 				so->so_error = 0;
34340117Smckusick 				break;
34440117Smckusick 			}
34539754Smckusick 			sbunlock(&so->so_rcv);
346*40761Skarels 			error = sbwait(&so->so_rcv);
34739754Smckusick 			splx(s);
348*40761Skarels 			if (error)
349*40761Skarels 				return (error);
35040117Smckusick 			continue;
35139754Smckusick 		}
35238414Smckusick 		m = so->so_rcv.sb_mb;
35340117Smckusick 		if (m == 0)
35440117Smckusick 			panic("nfs_dgreceive 1");
		/* Remember the following record before taking this one apart */
35540117Smckusick 		nextrecord = m->m_nextpkt;
35640117Smckusick 		/* Save sender's address */
35740117Smckusick 		if (m->m_type != MT_SONAME)
35840117Smckusick 			panic("nfs_dgreceive 1a");
35938414Smckusick 		sbfree(&so->so_rcv, m);
36040117Smckusick 		if (aname) {
			/* Detach the address mbuf and hand it to the caller */
36140117Smckusick 			*aname = m;
36240117Smckusick 			so->so_rcv.sb_mb = m->m_next;
36340117Smckusick 			m->m_next = 0;
36440117Smckusick 			m = so->so_rcv.sb_mb;
36540117Smckusick 		} else {
36640117Smckusick 			MFREE(m, so->so_rcv.sb_mb);
36740117Smckusick 			m = so->so_rcv.sb_mb;
36840117Smckusick 		}
36940117Smckusick 		/* Drop control mbuf's */
		/* (a single MT_CONTROL mbuf is dropped; MT_RIGHTS is a panic) */
37040117Smckusick 		if (m && m->m_type == MT_RIGHTS)
37140117Smckusick 			panic("nfs_dgreceive 2");
37240117Smckusick 		if (m && m->m_type == MT_CONTROL) {
37340117Smckusick 			sbfree(&so->so_rcv, m);
37440117Smckusick 			MFREE(m, so->so_rcv.sb_mb);
37540117Smckusick 			m = so->so_rcv.sb_mb;
37640117Smckusick 		}
37740117Smckusick 		/* Dequeue packet from sockbuf */
37840117Smckusick 		*mp = m;
		/* Account for each data mbuf as it leaves the sockbuf */
37940117Smckusick 		while (m) {
38040117Smckusick 			if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
38140117Smckusick 				panic("nfs_dgreceive 3");
38240117Smckusick 			sbfree(&so->so_rcv, m);
38340117Smckusick 			m = so->so_rcv.sb_mb = m->m_next;
38440117Smckusick 		}
38540117Smckusick 		so->so_rcv.sb_mb = nextrecord;
38640117Smckusick 		/* Return */
38740117Smckusick 		break;
38838414Smckusick 	}
	/* Reached only via "break": the lock is held and splnet is raised */
38938414Smckusick 	sbunlock(&so->so_rcv);
39038414Smckusick 	splx(s);
39138414Smckusick 	return (error);
39238414Smckusick }
39338414Smckusick 
/*
 * Overlay for the first two words of a received datagram, as copied out
 * by nfs_dgreply(): r_xid is matched against each request's r_xid and
 * r_rep is compared with rpc_reply.
 */
39438414Smckusick struct rpc_replyhead {
39538414Smckusick 	u_long	r_xid;
39638414Smckusick 	u_long	r_rep;
39738414Smckusick };
39838414Smckusick 
39938414Smckusick /*
40040117Smckusick  * Implement NFS client side datagram receive.
40138414Smckusick  * We depend on the way that records are added to the sockbuf
40238414Smckusick  * by sbappend*.  In particular, each record (mbufs linked through m_next)
40338414Smckusick  * must begin with an address, followed by optional MT_CONTROL mbuf
40438414Smckusick  * and then zero or more mbufs of data.
40538414Smckusick  * We must search through the list of received datagrams matching them
40638414Smckusick  * with outstanding requests using the xid, until ours is found.
 *
 * so:    the mount's socket.
 * mntp:  the mount the request belongs to.
 * myrep: the caller's outstanding request; on success its r_mrep holds
 *        the reply mbuf chain.
 * Returns 0 once myrep->r_mrep is set; ETIMEDOUT when r_rexmit exceeds
 * r_retry; EINTR when the mount has NFSMNT_INT and the signal test
 * below succeeds; or the error reported by nfs_sockerr().
 * Replies matching other requests on the nfsreqh list are attached to
 * those requests; anything else is dropped.
40738414Smckusick  */
40840117Smckusick nfs_dgreply(so, mntp, myrep)
40938414Smckusick 	register struct socket *so;
41038414Smckusick 	struct nfsmount *mntp;
41139344Smckusick 	struct nfsreq *myrep;
41238414Smckusick {
41338414Smckusick 	register struct mbuf *m;
41438414Smckusick 	register struct nfsreq *rep;
41538414Smckusick 	register int error = 0, s;
	/* Nonzero once the "not responding" message has been printed */
41640117Smckusick 	int logged = 0;
41738414Smckusick 	struct mbuf *nextrecord;
41838414Smckusick 	struct rpc_replyhead replyh;
41938414Smckusick 
	/* Every pass starts by taking the receive lock and raising splnet */
42038414Smckusick restart:
42139344Smckusick 	nfs_sblock(&so->so_rcv);
42240117Smckusick 	s = splnet();
42340117Smckusick 	/* Already received and queued for us, bye bye */
42439344Smckusick 	if (myrep->r_mrep != NULL) {
42540117Smckusick 		error = 0;
42640117Smckusick 		goto release;
42739344Smckusick 	}
42840117Smckusick 	/* If we have run out of retries (hard mounts have bogus count) */
42940117Smckusick 	if (myrep->r_rexmit > myrep->r_retry) {
43040117Smckusick 		error = ETIMEDOUT;
43140117Smckusick 		nfsstats.rpctimeouts++;
	/* Common failure exit: undo this request's timing and sent accounting */
43240117Smckusick giveup:
43340117Smckusick 		if (myrep->r_flags & R_TIMING) {
43440117Smckusick 			myrep->r_flags &= ~R_TIMING;
43540117Smckusick 			mntp->nm_rtt = -1;
43640117Smckusick 		}
43740117Smckusick 		if (myrep->r_flags & R_SENT) {
43840117Smckusick 			myrep->r_flags &= ~R_SENT;
43940117Smckusick 			--mntp->nm_hostinfo->nh_sent;
44040117Smckusick 			/* If count now 0, want to initiate new req */
44140117Smckusick 		}
44240117Smckusick 		goto release;
44339344Smckusick 	}
44438414Smckusick 
44539344Smckusick 	m = so->so_rcv.sb_mb;
	/* Receive buffer empty: check for errors/signals, then sleep and retry */
44639344Smckusick 	if (m == 0) {
44739344Smckusick 		if (so->so_rcv.sb_cc)
44839344Smckusick 			panic("nfs_soreply 1");
44940117Smckusick 		if (error = nfs_sockerr(so, 0)) {
45038414Smckusick 			so->so_error = 0;
45140117Smckusick 			goto giveup;
45238414Smckusick 		}
45340117Smckusick 		/* Allow signals to interrupt request? (nfs_timer wakes up) */
45440117Smckusick 		if ((mntp->nm_flag & NFSMNT_INT) &&
45540484Smckusick 		    (u.u_sigintr & sigmask(u.u_procp->p_cursig)) != 0) {
45640117Smckusick 			error = EINTR;
45740117Smckusick 			goto giveup;
45840117Smckusick 		}
		/* Print the notice at most once per call (guarded by logged) */
45940117Smckusick 		if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0)
46040117Smckusick 			uprintf("NFS server %s not responding, retrying\n",
46140351Smckusick 				mntp->nm_mountp->m_stat.f_mntfromname);
46238414Smckusick 		sbunlock(&so->so_rcv);
46338414Smckusick 		nfs_sbwait(&so->so_rcv);
46438414Smckusick 		splx(s);
46538414Smckusick 		goto restart;
46638414Smckusick 	}
46738414Smckusick 
46838414Smckusick 	/*
46938414Smckusick 	 * Take off the address, check for rights and ditch any control
47038414Smckusick 	 * mbufs.
47138414Smckusick 	 */
47240117Smckusick 	nextrecord = m->m_nextpkt;
47338414Smckusick 	if (m->m_type != MT_SONAME)
47438414Smckusick 		panic("nfs reply SONAME");
47538414Smckusick 	sbfree(&so->so_rcv, m);
47638414Smckusick 	MFREE(m, so->so_rcv.sb_mb);
47738414Smckusick 	m = so->so_rcv.sb_mb;
47838414Smckusick 	if (m && m->m_type == MT_RIGHTS)
47938414Smckusick 		panic("nfs reply RIGHTS");
48038414Smckusick 	if (m && m->m_type == MT_CONTROL) {
48138414Smckusick 		sbfree(&so->so_rcv, m);
48238414Smckusick 		MFREE(m, so->so_rcv.sb_mb);
48338414Smckusick 		m = so->so_rcv.sb_mb;
48438414Smckusick 	}
	/*
	 * If data remains, relink it to the following record; if the record
	 * held no data at all, move on to the next record and start over.
	 */
48539344Smckusick 	if (m) {
48638414Smckusick 		m->m_nextpkt = nextrecord;
48739344Smckusick 	} else {
48839344Smckusick 		so->so_rcv.sb_mb = nextrecord;
48938414Smckusick 		sbunlock(&so->so_rcv);
49038414Smckusick 		splx(s);
49138414Smckusick 		goto restart;
49238414Smckusick 	}
49338414Smckusick 
49438414Smckusick 	/*
49538414Smckusick 	 * Get the xid and check that it is an rpc reply
49638414Smckusick 	 */
	/* Fast path: the whole header lies in the first mbuf */
49740117Smckusick 	if (m->m_len >= sizeof replyh)
49840117Smckusick 		bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh);
49938414Smckusick 	else {
		/* Slow path: gather the header bytes across several mbufs */
50040117Smckusick 		struct mbuf *mp = m;
50140117Smckusick 		caddr_t cp = (caddr_t)&replyh;
50240117Smckusick 		int cnt = sizeof replyh;
50340117Smckusick 		do {
50438414Smckusick 			if (mp->m_len > 0) {
50540117Smckusick 				int xfer = (mp->m_len >= cnt) ? cnt : mp->m_len;
50638414Smckusick 				bcopy(mtod(mp, caddr_t), cp, xfer);
50738414Smckusick 				cnt -= xfer;
50838414Smckusick 				cp += xfer;
50938414Smckusick 			}
51038414Smckusick 			if (cnt > 0)
51138414Smckusick 				mp = mp->m_next;
51240117Smckusick 		} while (mp && cnt > 0);
51340117Smckusick 		if (mp == NULL) {		/* Insufficient length */
51440117Smckusick 			nfsstats.rpcinvalid++;
51540117Smckusick 			goto dropit;
51638414Smckusick 		}
51738414Smckusick 	}
51840117Smckusick 	if (replyh.r_rep != rpc_reply) {	/* Not a reply */
51940117Smckusick 		nfsstats.rpcinvalid++;
52038414Smckusick 		goto dropit;
52140117Smckusick 	}
52238414Smckusick 	/*
52338414Smckusick 	 * Loop through the request list to match up the reply
52440117Smckusick 	 * If no match, just drop the datagram
52538414Smckusick 	 */
52640117Smckusick 	if (rep = nfsreqh.r_next) {
52740117Smckusick 	    while (rep != &nfsreqh) {
52840117Smckusick 		/* The socket, being connected, will only queue matches */
52940117Smckusick 		if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) {
53038414Smckusick 			/* Found it.. */
53140117Smckusick 			if (rep->r_mrep)	/* Already there - duplicate */
53240117Smckusick 				break;
			/* Attach the data chain to the request and dequeue it */
53338414Smckusick 			rep->r_mrep = m;
53438414Smckusick 			while (m) {
53538414Smckusick 				if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
53638414Smckusick 					panic("nfs_soreply 3");
53738414Smckusick 				sbfree(&so->so_rcv, m);
53838414Smckusick 				m = so->so_rcv.sb_mb = m->m_next;
53938414Smckusick 			}
54038414Smckusick 			so->so_rcv.sb_mb = nextrecord;
54140117Smckusick 			if (rep->r_flags & R_TIMING) {
54240117Smckusick 				nfs_updatetimer(mntp);
54340117Smckusick 				rep->r_flags &= ~R_TIMING;
54440117Smckusick 				mntp->nm_rtt = -1;	/* re-arm timer */
54540117Smckusick 			}
54640117Smckusick 			if (rep->r_flags & R_SENT) {
54740117Smckusick 				rep->r_flags &= ~R_SENT;
54840117Smckusick 				--mntp->nm_hostinfo->nh_sent;
54940117Smckusick 				/* If count now 0, want to initiate new req */
55040117Smckusick 			}
55140117Smckusick 			if (rep == myrep) {		/* This is success */
55240117Smckusick 				if (logged)
55340117Smckusick 					uprintf("NFS server %s responded\n",
55440351Smckusick 					mntp->nm_mountp->m_stat.f_mntfromname);
55538414Smckusick 				goto release;
55640117Smckusick 			}
55740117Smckusick 			/* Else wake up other sleeper and wait for next */
55840117Smckusick 			sbunlock(&so->so_rcv);
55940117Smckusick 			sorwakeup(so);
56040117Smckusick 			splx(s);
56140117Smckusick 			goto restart;
56238414Smckusick 		}
56338414Smckusick 		rep = rep->r_next;
56440117Smckusick 	    }
56538414Smckusick 	}
56640117Smckusick 	/* If not matched to request, drop it */
	/* (a duplicate reply also arrives here via the "break" above) */
56740117Smckusick 	nfsstats.rpcunexpected++;
	/* Discard the record at the head of the receive buffer and retry */
56838414Smckusick dropit:
56940117Smckusick 	sbdroprecord(&so->so_rcv);
57038414Smckusick 	sbunlock(&so->so_rcv);
57138414Smckusick 	splx(s);
57238414Smckusick 	goto restart;
57340117Smckusick 
	/* Single exit: drop the receive lock, restore the priority level */
57438414Smckusick release:
57538414Smckusick 	sbunlock(&so->so_rcv);
57638414Smckusick 	splx(s);
57738414Smckusick 	return (error);
57838414Smckusick }
57938414Smckusick 
58038414Smckusick /*
58138414Smckusick  * nfs_request - goes something like this
58238414Smckusick  *	- fill in request struct
58338414Smckusick  *	- links it into list
58438414Smckusick  *	- calls nfs_send() for first transmit
58538414Smckusick  *	- calls nfs_dgreply() to get reply
58638414Smckusick  *	- break down rpc header and return with nfs reply pointed to
58738414Smckusick  *	  by mrep or error
58838414Smckusick  * nb: always frees up mreq mbuf list
 *
 * vp:    vnode recorded in the request (r_vp).
 * mreq:  request mbuf chain; a copy of it is sent and the original is
 *        freed before returning.
 * xid:   transaction id recorded in the request and matched against
 *        replies by nfs_dgreply().
 * idem:  multiplier for NFS_MINTIMEO; the negated product is the
 *        request's initial timer value.
 * mp:    mount point, converted with vfs_to_nfs().
 * mrp, mdp, dposp: on success receive the reply chain, the current mbuf
 *        and the current data position just past the status word.
 * Returns 0 on success; otherwise the error from nfs_dgreply(),
 * EOPNOTSUPP or EACCES for a denied call, the nonzero status word from
 * the reply, EPROTONOSUPPORT, or whatever error is set when control
 * reaches nfsmout.
58938414Smckusick  */
59040117Smckusick nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp)
59138414Smckusick 	struct vnode *vp;
59238414Smckusick 	struct mbuf *mreq;
59338414Smckusick 	u_long xid;
59440117Smckusick 	int idem;
59538414Smckusick 	struct mount *mp;
59638414Smckusick 	struct mbuf **mrp;
59738414Smckusick 	struct mbuf **mdp;
59838414Smckusick 	caddr_t *dposp;
59938414Smckusick {
60038414Smckusick 	register struct mbuf *m, *mrep;
60138414Smckusick 	register struct nfsreq *rep;
60238414Smckusick 	register u_long *p;
60338414Smckusick 	register int len;
60438414Smckusick 	struct nfsmount *mntp;
60538414Smckusick 	struct mbuf *md;
60639344Smckusick 	struct nfsreq *reph;
60738414Smckusick 	caddr_t dpos;
	/* cp2 and t1 are not named in the code below; presumably scratch variables for the nfsm_* macros -- confirm in nfsm_subs.h */
60838414Smckusick 	char *cp2;
60938414Smckusick 	int t1;
61038414Smckusick 	int s;
61138414Smckusick 	int error;
61238414Smckusick 
61338414Smckusick 	mntp = vfs_to_nfs(mp);
61438414Smckusick 	m = mreq;
61538414Smckusick 	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
61638414Smckusick 	rep->r_xid = xid;
61738414Smckusick 	rep->r_mntp = mntp;
61838414Smckusick 	rep->r_vp = vp;
	/* Soft mounts use the mount's retry count; others get a count beyond the clip limit */
61938414Smckusick 	if (mntp->nm_flag & NFSMNT_SOFT)
62040117Smckusick 		rep->r_retry = mntp->nm_retry;
62138414Smckusick 	else
62240117Smckusick 		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
62340117Smckusick 	rep->r_flags = rep->r_rexmit = 0;
62440117Smckusick 	/* Idempotency: add N * MINTIMEO to requests if not, else use 0 */
62540117Smckusick 	rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO);
62638414Smckusick 	rep->r_mrep = NULL;
62738414Smckusick 	rep->r_mreq = m;
	/* Total the byte length of the request chain */
62838414Smckusick 	len = 0;
62938414Smckusick 	while (m) {
63038414Smckusick 		len += m->m_len;
63138414Smckusick 		m = m->m_next;
63238414Smckusick 	}
63338414Smckusick 	rep->r_msiz = len;
63438414Smckusick 
63540117Smckusick 	/*
63640117Smckusick 	 * Do the client side RPC.
63740117Smckusick 	 */
63840117Smckusick 	nfsstats.rpcrequests++;
63940117Smckusick 	s = splnet();
64040117Smckusick 	/* Chain request into list of outstanding requests. Be sure
64140117Smckusick 	 * to put it LAST so timer finds oldest requests first. */
64239344Smckusick 	reph = &nfsreqh;
	/* r_prev is NULL only while the list head has never been linked */
64339344Smckusick 	if (reph->r_prev == NULL) {
64439344Smckusick 		reph->r_next = rep;
64539344Smckusick 		rep->r_prev = reph;
64639344Smckusick 	} else {
64739344Smckusick 		reph->r_prev->r_next = rep;
64839344Smckusick 		rep->r_prev = reph->r_prev;
64939344Smckusick 	}
65039344Smckusick 	reph->r_prev = rep;
65139344Smckusick 	rep->r_next = reph;
65240117Smckusick 	/*
65340117Smckusick 	 * If backing off another request or avoiding congestion, don't
65440117Smckusick 	 * send this one now but let timer do it. If not timing a request,
65540117Smckusick 	 * do it now.
65640117Smckusick 	 */
65740117Smckusick 	if (mntp->nm_hostinfo->nh_sent > 0 &&
65840117Smckusick 	    (mntp->nm_hostinfo->nh_currexmit != 0 ||
65940117Smckusick 	     mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) {
66040117Smckusick 		splx(s);
66140117Smckusick 		goto skipsend;
66240117Smckusick 	}
66340117Smckusick 	++mntp->nm_hostinfo->nh_sent;	/* Inconsistent if can't NFSMCOPY */
66440117Smckusick 	rep->r_flags |= R_SENT;		/* But not a catastrophe */
	/* nm_rtt == -1 means no request is being timed: time this one */
66540117Smckusick 	if (mntp->nm_rtt == -1) {
66640117Smckusick 		mntp->nm_rtt = 0;
66740117Smckusick 		rep->r_flags |= R_TIMING;
66840117Smckusick 	}
66938414Smckusick 	splx(s);
67038414Smckusick 
67138414Smckusick 	/*
67240117Smckusick 	 * If we can get a packet to send, send it off...
67338414Smckusick 	 * otherwise the timer will retransmit later
67438414Smckusick 	 */
	/* A copy is sent; the original stays on rep->r_mreq */
67540117Smckusick 	m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT);
67638414Smckusick 	if (m != NULL)
67740117Smckusick 		(void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len);
67840117Smckusick 	/*
67940117Smckusick 	 * Wait for the reply from our send or the timer's.
68040117Smckusick 	 */
68140117Smckusick skipsend:
68240117Smckusick 	error = nfs_dgreply(mntp->nm_so, mntp, rep);
68338414Smckusick 
68440117Smckusick 	/*
68540117Smckusick 	 * RPC done, unlink the request.
68640117Smckusick 	 */
68738414Smckusick 	s = splnet();
68838414Smckusick 	rep->r_prev->r_next = rep->r_next;
68939344Smckusick 	rep->r_next->r_prev = rep->r_prev;
69038414Smckusick 	splx(s);
69138414Smckusick 	m_freem(rep->r_mreq);
	/* Pick up the reply chain before the request structure is freed */
69238414Smckusick 	mrep = md = rep->r_mrep;
69338414Smckusick 	FREE((caddr_t)rep, M_NFSREQ);
69438414Smckusick 	if (error)
69538414Smckusick 		return (error);
69638414Smckusick 
69738414Smckusick 	/*
69838414Smckusick 	 * break down the rpc header and check if ok
69938414Smckusick 	 */
70038414Smckusick 	dpos = mtod(md, caddr_t);
70138414Smckusick 	nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED);
	/* Skip the two words nfs_dgreply() already examined (r_xid, r_rep) */
70238414Smckusick 	p += 2;
70338414Smckusick 	if (*p++ == rpc_msgdenied) {
70438414Smckusick 		if (*p == rpc_mismatch)
70538414Smckusick 			error = EOPNOTSUPP;
70638414Smckusick 		else
70738414Smckusick 			error = EACCES;
70838414Smckusick 		m_freem(mrep);
70938414Smckusick 		return (error);
71038414Smckusick 	}
71138414Smckusick 	/*
71238414Smckusick 	 * skip over the auth_verf, someday we may want to cache auth_short's
71338414Smckusick 	 * for nfs_reqhead(), but for now just dump it
71438414Smckusick 	 */
	/* The pre-increment steps over one word; the next, if nonzero, is the length to skip */
71538414Smckusick 	if (*++p != 0) {
71638414Smckusick 		len = nfsm_rndup(fxdr_unsigned(long, *p));
71738414Smckusick 		nfsm_adv(len);
71838414Smckusick 	}
71938414Smckusick 	nfsm_disect(p, u_long *, NFSX_UNSIGNED);
72038414Smckusick 	/* 0 == ok */
72138414Smckusick 	if (*p == 0) {
		/* Next word is a status: nonzero is returned as the error */
72238414Smckusick 		nfsm_disect(p, u_long *, NFSX_UNSIGNED);
72338414Smckusick 		if (*p != 0) {
72438414Smckusick 			error = fxdr_unsigned(int, *p);
72538414Smckusick 			m_freem(mrep);
72638414Smckusick 			return (error);
72738414Smckusick 		}
72838414Smckusick 		*mrp = mrep;
72938414Smckusick 		*mdp = md;
73038414Smckusick 		*dposp = dpos;
73138414Smckusick 		return (0);
73238414Smckusick 	}
73338414Smckusick 	m_freem(mrep);
73438414Smckusick 	return (EPROTONOSUPPORT);
	/* No goto targets this label in the code above; presumably the nfsm_* macros jump here on failure -- confirm in nfsm_subs.h */
73538414Smckusick nfsmout:
73638414Smckusick 	return (error);
73738414Smckusick }
73838414Smckusick 
73938414Smckusick /*
74038414Smckusick  * Get a request for the server main loop
74138414Smckusick  * - receive a request via nfs_dgreceive()
74238414Smckusick  * - verify it
74338414Smckusick  * - fill in the cred struct.
74438414Smckusick  */
74539754Smckusick nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr,
74639754Smckusick 	   msk, mtch)
74738414Smckusick 	struct socket *so;
74838414Smckusick 	u_long prog;
74938414Smckusick 	u_long vers;
75038414Smckusick 	int maxproc;
75138414Smckusick 	struct mbuf **nam;
75238414Smckusick 	struct mbuf **mrp;
75338414Smckusick 	struct mbuf **mdp;
75438414Smckusick 	caddr_t *dposp;
75538414Smckusick 	u_long *retxid;
75638414Smckusick 	u_long *proc;
75738414Smckusick 	register struct ucred *cr;
75839754Smckusick 	u_long msk;
75939754Smckusick 	u_long mtch;
76038414Smckusick {
76138414Smckusick 	register int i;
76239494Smckusick 	register u_long *p;
76339494Smckusick 	register long t1;
76439494Smckusick 	caddr_t dpos, cp2;
76539494Smckusick 	int error = 0;
76639494Smckusick 	struct mbuf *mrep, *md;
76739494Smckusick 	int len;
76838414Smckusick 
76940117Smckusick 	if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep))
77038414Smckusick 		return (error);
77138414Smckusick 	md = mrep;
77238414Smckusick 	dpos = mtod(mrep, caddr_t);
77338414Smckusick 	nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED);
77438414Smckusick 	*retxid = *p++;
77538414Smckusick 	if (*p++ != rpc_call) {
77638414Smckusick 		m_freem(mrep);
77738414Smckusick 		return (ERPCMISMATCH);
77838414Smckusick 	}
77938414Smckusick 	if (*p++ != rpc_vers) {
78038414Smckusick 		m_freem(mrep);
78138414Smckusick 		return (ERPCMISMATCH);
78238414Smckusick 	}
78338414Smckusick 	if (*p++ != prog) {
78438414Smckusick 		m_freem(mrep);
78538414Smckusick 		return (EPROGUNAVAIL);
78638414Smckusick 	}
78738414Smckusick 	if (*p++ != vers) {
78838414Smckusick 		m_freem(mrep);
78938414Smckusick 		return (EPROGMISMATCH);
79038414Smckusick 	}
79138414Smckusick 	*proc = fxdr_unsigned(u_long, *p++);
79238414Smckusick 	if (*proc == NFSPROC_NULL) {
79338414Smckusick 		*mrp = mrep;
79438414Smckusick 		return (0);
79538414Smckusick 	}
79638414Smckusick 	if (*proc > maxproc || *p++ != rpc_auth_unix) {
79738414Smckusick 		m_freem(mrep);
79838414Smckusick 		return (EPROCUNAVAIL);
79938414Smckusick 	}
80039494Smckusick 	(void) fxdr_unsigned(int, *p++);
80139494Smckusick 	len = fxdr_unsigned(int, *++p);
80239494Smckusick 	nfsm_adv(nfsm_rndup(len));
80338414Smckusick 	nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED);
80438414Smckusick 	cr->cr_uid = fxdr_unsigned(uid_t, *p++);
80538414Smckusick 	cr->cr_gid = fxdr_unsigned(gid_t, *p++);
80639494Smckusick 	len = fxdr_unsigned(int, *p);
80739494Smckusick 	if (len > 10) {
80838414Smckusick 		m_freem(mrep);
80938414Smckusick 		return (EBADRPC);
81038414Smckusick 	}
81139494Smckusick 	nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED);
81239494Smckusick 	for (i = 1; i <= len; i++)
81338414Smckusick 		cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++);
81439494Smckusick 	cr->cr_ngroups = len + 1;
81538414Smckusick 	/*
81638414Smckusick 	 * Do we have any use for the verifier.
81738414Smckusick 	 * According to the "Remote Procedure Call Protocol Spec." it
81838414Smckusick 	 * should be AUTH_NULL, but some clients make it AUTH_UNIX?
81938414Smckusick 	 * For now, just skip over it
82038414Smckusick 	 */
82139494Smckusick 	len = fxdr_unsigned(int, *++p);
82239494Smckusick 	if (len > 0)
82339494Smckusick 		nfsm_adv(nfsm_rndup(len));
82438414Smckusick 	*mrp = mrep;
82538414Smckusick 	*mdp = md;
82638414Smckusick 	*dposp = dpos;
82738414Smckusick 	return (0);
82838414Smckusick nfsmout:
82938414Smckusick 	return (error);
83038414Smckusick }
83138414Smckusick 
83238414Smckusick /*
83338414Smckusick  * Generate the rpc reply header
83438414Smckusick  * siz arg. is used to decide if adding a cluster is worthwhile
83538414Smckusick  */
83638414Smckusick nfs_rephead(siz, retxid, err, mrq, mbp, bposp)
83738414Smckusick 	int siz;
83838414Smckusick 	u_long retxid;
83938414Smckusick 	int err;
84038414Smckusick 	struct mbuf **mrq;
84138414Smckusick 	struct mbuf **mbp;
84238414Smckusick 	caddr_t *bposp;
84338414Smckusick {
84439494Smckusick 	register u_long *p;
84539494Smckusick 	register long t1;
84639494Smckusick 	caddr_t bpos;
84739494Smckusick 	struct mbuf *mreq, *mb, *mb2;
84838414Smckusick 
84938414Smckusick 	NFSMGETHDR(mreq);
85038414Smckusick 	mb = mreq;
85138414Smckusick 	if ((siz+RPC_REPLYSIZ) > MHLEN)
85238414Smckusick 		NFSMCLGET(mreq, M_WAIT);
85338414Smckusick 	p = mtod(mreq, u_long *);
85438414Smckusick 	mreq->m_len = 6*NFSX_UNSIGNED;
85538414Smckusick 	bpos = ((caddr_t)p)+mreq->m_len;
85638414Smckusick 	*p++ = retxid;
85738414Smckusick 	*p++ = rpc_reply;
85838414Smckusick 	if (err == ERPCMISMATCH) {
85938414Smckusick 		*p++ = rpc_msgdenied;
86038414Smckusick 		*p++ = rpc_mismatch;
86138414Smckusick 		*p++ = txdr_unsigned(2);
86238414Smckusick 		*p = txdr_unsigned(2);
86338414Smckusick 	} else {
86438414Smckusick 		*p++ = rpc_msgaccepted;
86538414Smckusick 		*p++ = 0;
86638414Smckusick 		*p++ = 0;
86738414Smckusick 		switch (err) {
86838414Smckusick 		case EPROGUNAVAIL:
86938414Smckusick 			*p = txdr_unsigned(RPC_PROGUNAVAIL);
87038414Smckusick 			break;
87138414Smckusick 		case EPROGMISMATCH:
87238414Smckusick 			*p = txdr_unsigned(RPC_PROGMISMATCH);
87338414Smckusick 			nfsm_build(p, u_long *, 2*NFSX_UNSIGNED);
87438414Smckusick 			*p++ = txdr_unsigned(2);
87538414Smckusick 			*p = txdr_unsigned(2);	/* someday 3 */
87638414Smckusick 			break;
87738414Smckusick 		case EPROCUNAVAIL:
87838414Smckusick 			*p = txdr_unsigned(RPC_PROCUNAVAIL);
87938414Smckusick 			break;
88038414Smckusick 		default:
88138414Smckusick 			*p = 0;
88238414Smckusick 			if (err != VNOVAL) {
88338414Smckusick 				nfsm_build(p, u_long *, NFSX_UNSIGNED);
88438414Smckusick 				*p = txdr_unsigned(err);
88538414Smckusick 			}
88638414Smckusick 			break;
88738414Smckusick 		};
88838414Smckusick 	}
88938414Smckusick 	*mrq = mreq;
89038414Smckusick 	*mbp = mb;
89138414Smckusick 	*bposp = bpos;
89238414Smckusick 	if (err != 0 && err != VNOVAL)
89338414Smckusick 		nfsstats.srvrpc_errs++;
89438414Smckusick 	return (0);
89538414Smckusick }
89638414Smckusick 
89738414Smckusick /*
89838414Smckusick  * Nfs timer routine
89938414Smckusick  * Scan the nfsreq list and retranmit any requests that have timed out
90038414Smckusick  * To avoid retransmission attempts on STREAM sockets (in the future) make
90140117Smckusick  * sure to set the r_retry field to 0 (implies nm_retry == 0).
90238414Smckusick  */
90338414Smckusick nfs_timer()
90438414Smckusick {
90538414Smckusick 	register struct nfsreq *rep;
90638414Smckusick 	register struct mbuf *m;
90738414Smckusick 	register struct socket *so;
90840117Smckusick 	register struct nfsmount *mntp;
90940117Smckusick 	int s, error;
91038414Smckusick 
91138414Smckusick 	s = splnet();
91238414Smckusick 	rep = nfsreqh.r_next;
91340117Smckusick 	if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) {
91440117Smckusick 		mntp = rep->r_mntp;
91540117Smckusick 		if (rep->r_flags & R_TIMING)	/* update rtt in mount */
91640117Smckusick 			mntp->nm_rtt++;
91740117Smckusick 		/* If not timed out or reply already received, skip */
91840117Smckusick 		if (++rep->r_timer < mntp->nm_rto || rep->r_mrep)
91940117Smckusick 			continue;
92040117Smckusick 		/* Do backoff and save new timeout in mount */
92140117Smckusick 		if (rep->r_flags & R_TIMING) {
92240117Smckusick 			nfs_backofftimer(mntp);
92340117Smckusick 			rep->r_flags &= ~R_TIMING;
92440117Smckusick 			mntp->nm_rtt = -1;
92540117Smckusick 		}
92640117Smckusick 		if (rep->r_flags & R_SENT) {
92740117Smckusick 			rep->r_flags &= ~R_SENT;
92840117Smckusick 			--mntp->nm_hostinfo->nh_sent;
92940117Smckusick 		}
93040117Smckusick 		/* Check state of socket, cf nfs_send */
93140117Smckusick 		so = mntp->nm_so;
93240117Smckusick 		if (error = nfs_sockerr(so, 1))
93340117Smckusick 			goto wakeup;
93440117Smckusick 		if (sbspace(&so->so_snd) < rep->r_msiz)
93540117Smckusick 			goto wakeup;
93640117Smckusick 		/* Check for too many retries, cf nfs_dgreply */
93740117Smckusick 		if (++rep->r_rexmit > NFS_MAXREXMIT)	/* clip */
93840117Smckusick 			rep->r_rexmit = NFS_MAXREXMIT;
93940117Smckusick 		if (rep->r_rexmit > rep->r_retry)	/* too many */
94040117Smckusick 			goto wakeup;
94140117Smckusick 		/* Check for congestion control, cf nfs_request */
94240117Smckusick 		if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)
94340117Smckusick 			goto wakeup;
94440117Smckusick 		/* Send it! */
94540117Smckusick 		m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT);
94640117Smckusick 		if (m == NULL)
94740117Smckusick 			goto wakeup;
94840117Smckusick 		nfsstats.rpcretries++;
94938414Smckusick #ifdef MGETHDR
95040117Smckusick 		m->m_pkthdr.len = rep->r_msiz;
95138414Smckusick #endif
95240117Smckusick 		(void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
95340327Ssklower 			(struct mbuf *)0, (struct mbuf *)0);
95440117Smckusick 
95540117Smckusick 		/* We need to time the request even though we're
95640117Smckusick 		 * retransmitting, in order to maintain backoff. */
95740117Smckusick 		mntp->nm_rtt = 0;
95840117Smckusick 		++mntp->nm_hostinfo->nh_sent;
95940117Smckusick 		rep->r_flags |= (R_SENT|R_TIMING);
96040117Smckusick 		rep->r_timer = rep->r_timerinit;
96140117Smckusick wakeup:
96240117Smckusick 		/* If error or interruptible mount, give user a look */
96340117Smckusick 		if (error || (mntp->nm_flag & NFSMNT_INT))
96440117Smckusick 			sorwakeup(so);
96540117Smckusick 	}
96640117Smckusick 	splx(s);
96740117Smckusick 	timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ);
96840117Smckusick }
96940117Smckusick 
/*
 * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is
 * used here. The timer state is held in the nfsmount structure and
 * a single request is used to clock the response. When successful
 * the rtt smoothing in nfs_updatetimer is used, when failed the backoff
 * is done by nfs_backofftimer. We also log failure messages in these
 * routines.
 *
 * Congestion variables are held in the nfshost structure which
 * is referenced by nfsmounts and shared per-server. This separation
 * makes it possible to do per-mount timing which allows varying disk
 * access times to be dealt with, while preserving a network oriented
 * congestion control scheme.
 *
 * The windowing implements the Jacobson/Karels slowstart algorithm
 * with adjusted scaling factors. We start with one request, then send
 * 4 more after each success until the ssthresh limit is reached, then
 * we increment at a rate proportional to the window. On failure, we
 * remember 3/4 the current window and clamp the send limit to 1. Note
 * ICMP source quench is not reflected in so->so_error so we ignore that
 * for now.
 *
 * NFS behaves much more like a transport protocol with these changes,
 * shedding the teenage pedal-to-the-metal tendencies of "other"
 * implementations.
 *
 * Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
 */
99840117Smckusick 
/*
 * Retransmission timeout estimate for a mount: one eighth of nm_srtt
 * plus nm_rttvar.
 *
 * The TCP algorithm was not forgiving enough. Because the NFS server
 * responds only after performing lookups/diskio/etc, we have to be
 * more prepared to accept a spiky variance. The TCP algorithm is:
 * TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1)
 */
#define NFS_RTO(nmp)	((((nmp)->nm_srtt) >> 3) + ((nmp)->nm_rttvar))
100640117Smckusick 
100740117Smckusick nfs_updatetimer(mntp)
100840117Smckusick 	register struct nfsmount *mntp;
100940117Smckusick {
101040117Smckusick 	register struct nfshost *nfshp = mntp->nm_hostinfo;
101140117Smckusick 
101240117Smckusick 	/* If retransmitted, clear and return */
101340117Smckusick 	if (mntp->nm_rexmit || nfshp->nh_currexmit) {
101440117Smckusick 		if (nfshp->nh_currexmit >= nfsrexmtthresh)
101540351Smckusick 			nfs_log("NFS server %s OK\n",
101640351Smckusick 				mntp->nm_mountp->m_stat.f_mntfromname);
101740117Smckusick 		mntp->nm_rexmit = nfshp->nh_currexmit = 0;
101840117Smckusick 		return;
101940117Smckusick 	}
102040117Smckusick 	/* If have a measurement, do smoothing */
102140117Smckusick 	if (mntp->nm_srtt) {
102240117Smckusick 		register short delta;
102340117Smckusick 		delta = mntp->nm_rtt - (mntp->nm_srtt >> 3);
102440117Smckusick 		if ((mntp->nm_srtt += delta) <= 0)
102540117Smckusick 			mntp->nm_srtt = 1;
102640117Smckusick 		if (delta < 0)
102740117Smckusick 			delta = -delta;
102840117Smckusick 		delta -= (mntp->nm_rttvar >> 2);
102940117Smckusick 		if ((mntp->nm_rttvar += delta) <= 0)
103040117Smckusick 			mntp->nm_rttvar = 1;
103140117Smckusick 	/* Else initialize */
103240117Smckusick 	} else {
103340117Smckusick 		mntp->nm_rttvar = mntp->nm_rtt << 1;
103440117Smckusick 		if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2;
103540117Smckusick 		mntp->nm_srtt = mntp->nm_rttvar << 2;
103640117Smckusick 	}
103740117Smckusick 	/* Compute new Retransmission TimeOut and clip */
103840117Smckusick 	mntp->nm_rto = NFS_RTO(mntp);
103940117Smckusick 	if (mntp->nm_rto < NFS_MINTIMEO)
104040117Smckusick 		mntp->nm_rto = NFS_MINTIMEO;
104140117Smckusick 	else if (mntp->nm_rto > NFS_MAXTIMEO)
104240117Smckusick 		mntp->nm_rto = NFS_MAXTIMEO;
104340117Smckusick 	nfshp->nh_currto = mntp->nm_rto;
104440117Smckusick 
104540117Smckusick 	/* Update window estimate */
104640117Smckusick 	if (nfshp->nh_window < nfshp->nh_ssthresh)	/* quickly */
104740117Smckusick 		nfshp->nh_window += 4;
104840117Smckusick 	else {						/* slowly */
104940117Smckusick 		register long incr = ++nfshp->nh_winext;
105040117Smckusick 		incr = (incr * incr) / nfshp->nh_window;
105140117Smckusick 		if (incr > 0) {
105240117Smckusick 			nfshp->nh_winext = 0;
105340117Smckusick 			++nfshp->nh_window;
105440117Smckusick 		}
105540117Smckusick 	}
105640117Smckusick 	if (nfshp->nh_window > NFS_MAXWINDOW)
105740117Smckusick 		nfshp->nh_window = NFS_MAXWINDOW;
105840117Smckusick }
105940117Smckusick 
106040117Smckusick nfs_backofftimer(mntp)
106140117Smckusick 	register struct nfsmount *mntp;
106240117Smckusick {
106340117Smckusick 	register struct nfshost *nfshp = mntp->nm_hostinfo;
106440117Smckusick 	register unsigned long newrto;
106540117Smckusick 
106640117Smckusick 	/* Clip shift count */
106740117Smckusick 	if (++mntp->nm_rexmit > 8 * sizeof mntp->nm_rto)
106840117Smckusick 		mntp->nm_rexmit = 8 * sizeof mntp->nm_rto;
106940117Smckusick 	/* Back off RTO exponentially */
107040117Smckusick 	newrto = NFS_RTO(mntp);
107140117Smckusick 	newrto <<= (mntp->nm_rexmit - 1);
107240117Smckusick 	if (newrto == 0 || newrto > NFS_MAXTIMEO)
107340117Smckusick 		newrto = NFS_MAXTIMEO;
107440117Smckusick 	mntp->nm_rto = nfshp->nh_currto = newrto;
107540117Smckusick 
107640117Smckusick 	/* If too many retries, message, assume a bogus RTT and re-measure */
107740117Smckusick 	if (nfshp->nh_currexmit < mntp->nm_rexmit) {
107840117Smckusick 		nfshp->nh_currexmit = mntp->nm_rexmit;
107940117Smckusick 		if (nfshp->nh_currexmit >= nfsrexmtthresh) {
108040117Smckusick 			if (nfshp->nh_currexmit == nfsrexmtthresh) {
108140117Smckusick 				nfs_log("NFS server %s not responding\n",
108240351Smckusick 					mntp->nm_mountp->m_stat.f_mntfromname);
108340117Smckusick 				mntp->nm_rttvar += (mntp->nm_srtt >> 2);
108440117Smckusick 				mntp->nm_srtt = 0;
108538414Smckusick 			}
108640117Smckusick 			/* The routing invalidation should be a usrreq PRU */
108740117Smckusick 			if (mtod(nfshp->nh_sockaddr,
108840117Smckusick 				struct sockaddr *)->sa_family == AF_INET)
108940117Smckusick 				in_losing(mntp->nm_so->so_pcb);
109038414Smckusick 		}
109138414Smckusick 	}
109240117Smckusick 	/* Close down window but remember this point (3/4 current) for later */
109340117Smckusick 	nfshp->nh_ssthresh = ((nfshp->nh_window << 1) + nfshp->nh_window) >> 2;
109440117Smckusick 	nfshp->nh_window = 1;
109540117Smckusick 	nfshp->nh_winext = 0;
109638414Smckusick }
109738414Smckusick 
109838414Smckusick /*
109940117Smckusick  * Not all errors are fatal. The closed checks deal
110040117Smckusick  * with errors a little strangely.
110138414Smckusick  */
110240117Smckusick 
110340117Smckusick nfs_sockerr(so, sending)
110440117Smckusick 	struct socket *so;
110540117Smckusick 	int sending;
110638414Smckusick {
110740117Smckusick 	if (sending && (so->so_state & SS_CANTSENDMORE)) {
110840117Smckusick 		so->so_error = EPIPE;
110940117Smckusick 		return (EPIPE);
111040117Smckusick 	}
111140117Smckusick 
111240117Smckusick 	switch (so->so_error) {			/* inhibit certain errors */
111340117Smckusick 	case ENETDOWN:
111440117Smckusick 	case ENETUNREACH:
111540117Smckusick 	case EHOSTDOWN:
111640117Smckusick 	case EHOSTUNREACH:
111740117Smckusick 		so->so_error = 0;
111840117Smckusick 	case 0:
111940117Smckusick 		break;
112040117Smckusick 	default:				/* return all others */
112140117Smckusick 		printf("nfs_sockerr: error %d on %s\n", so->so_error,
112240117Smckusick 			sending?"send":"receive");
112340117Smckusick 		return (so->so_error);
112440117Smckusick 	}
112540117Smckusick 
112640117Smckusick 	if (!sending && (so->so_state & SS_CANTRCVMORE)) {
112740117Smckusick 		so->so_error = 0;		/* (no error) */
112840117Smckusick 		return (EPIPE);
112940117Smckusick 	}
113040117Smckusick 	return (so->so_error);
113138414Smckusick }
1132