/*
 * Copyright (c) 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley.  The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 *	@(#)nfs_socket.c	7.7 (Berkeley) 03/06/90
 */

/*
 * Socket operations for use by nfs (similar to uipc_socket.c, but never
 * with copies to/from a uio vector)
 * NB: For now, they only work for datagram sockets.
 * (Use on stream sockets would require some record boundary mark in the
 *  stream, as defined by "RPC: Remote Procedure Call Protocol
 *  Specification" RFC1057 Section 10, and different versions of send,
 *  receive and reply that do not assume an atomic protocol.)
 */

#include "types.h"
#include "param.h"
#include "uio.h"
#include "user.h"
#include "proc.h"
#include "signal.h"
#include "mount.h"
#include "kernel.h"
#include "malloc.h"
#include "mbuf.h"
#include "vnode.h"
#include "domain.h"
#include "protosw.h"
#include "socket.h"
#include "socketvar.h"
#include "rpcv2.h"
#include "nfsv2.h"
#include "nfs.h"
#include "xdr_subs.h"
#include "nfsm_subs.h"
#include "nfsmount.h"

#include "syslog.h"
#define nfs_log(message, host)	log(LOG_ERR, message, host)

#define	TRUE	1

/*
 * nfs_sblock(sb): set the lock on sockbuf sb.
 * While SB_LOCK is already held, mark the sockbuf SB_WANT and sleep on
 * &sb->sb_flags at priority PZERO-1; once it is free, take the lock.
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. as the unbraced body of an if/else); callers
 * must supply the terminating semicolon.
 */
#define nfs_sblock(sb) do { \
	while ((sb)->sb_flags & SB_LOCK) { \
		(sb)->sb_flags |= SB_WANT; \
		sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \
	} \
	(sb)->sb_flags |= SB_LOCK; \
} while (0)
/*
 * nfs_sbwait() is simply sbwait() but at a negative priority so that it
 * can not be interrupted by a signal.
 */
nfs_sbwait(sb)
	struct sockbuf *sb;
{
	sb->sb_flags |= SB_WAIT;
	sleep((caddr_t)&sb->sb_cc, PZERO-2);
}

/*
 * External data, mostly RPC constants in XDR form
 * (they are compared directly against words of received messages and
 * stored directly into outgoing headers in this file).
 */
extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix,
	rpc_msgaccepted, rpc_call;
extern u_long nfs_prog, nfs_vers;
/*
 * Server side procedure handlers, defined elsewhere; referenced here
 * only to fill in the nfsrv_procs[] table.
 */
int	nfsrv_null(),
	nfsrv_getattr(),
	nfsrv_setattr(),
	nfsrv_lookup(),
	nfsrv_readlink(),
	nfsrv_read(),
	nfsrv_write(),
	nfsrv_create(),
	nfsrv_remove(),
	nfsrv_rename(),
	nfsrv_link(),
	nfsrv_symlink(),
	nfsrv_mkdir(),
	nfsrv_rmdir(),
	nfsrv_readdir(),
	nfsrv_statfs(),
	nfsrv_noop();

/*
 * Table of server procedure handlers, NFS_NPROCS entries.
 * NOTE(review): presumably indexed by the NFS procedure number taken from
 * the request -- confirm against the server dispatch code.  Slots 3 and 7
 * point at nfsrv_noop (presumably the obsolete ROOT and unused WRITECACHE
 * procedures of the NFS version 2 protocol -- confirm against nfsv2.h).
 */
int (*nfsrv_procs[NFS_NPROCS])() = {
	nfsrv_null,
	nfsrv_getattr,
	nfsrv_setattr,
	nfsrv_noop,
	nfsrv_lookup,
	nfsrv_readlink,
	nfsrv_read,
	nfsrv_noop,
	nfsrv_write,
	nfsrv_create,
	nfsrv_remove,
	nfsrv_rename,
	nfsrv_link,
	nfsrv_symlink,
	nfsrv_mkdir,
	nfsrv_rmdir,
	nfsrv_readdir,
	nfsrv_statfs,
};

/* Head of the doubly linked list of per-server nfshost entries */
struct nfshost *nfshosth;
/* Head of the circular list of outstanding client requests */
struct nfsreq nfsreqh;
/* Retransmit count at which "not responding" messages are issued */
int nfsrexmtthresh = NFS_FISHY;

/*
 * Initialize sockets and per-host congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 *
 * A datagram socket is created in the address family of saddr, bound to
 * a local name if that family is AF_UNIX, connected to the server and
 * given send/receive buffer reservations.  The mount is then attached to
 * a reference counted nfshost entry: an existing one whose cached
 * address compares equal, or else a newly allocated one.
 *
 * nmp   - mount being connected; nm_so and nm_hostinfo are filled in
 * saddr - mbuf holding the server's sockaddr.  On success it has either
 *	   been freed or is kept as the nfshost's nh_sockaddr.
 *
 * Returns 0 on success.  On failure the socket (if any) is closed,
 * nm_so is cleared and the failing call's error (or ENOBUFS) is returned.
 */
nfs_connect(nmp, saddr)
	register struct nfsmount *nmp;
	struct mbuf *saddr;
{
	int s, error, srvaddrlen;
	struct mbuf *m;
	register struct nfshost *nfshp;

	/* Cleared first so that the "bad" exit can tell if a socket exists */
	nmp->nm_so = 0;
	if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family,
				&nmp->nm_so, SOCK_DGRAM, 0))
		goto bad;

	/* Unix sockets do not provide a local bind for server reply */
	if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) {
		struct sockaddr *sa;
		/* The two '#' characters are replaced by a serial number */
		static char client[] = "/tmp/.nfs/nfsclient##";
		static int serial;
		int firstserial;
		m = m_getclr(M_WAIT, MT_SONAME);
		if (m == NULL) {
			error = ENOBUFS;
			goto bad;
		}
		m->m_len = sizeof (client) + 2;
		sa = mtod(m, struct sockaddr *);
		sa->sa_family = AF_UNIX;
#ifdef	MSG_TRUNC	/* Have sa_len to set? */
		sa->sa_len = m->m_len;
#endif
		bcopy(client, sa->sa_data, sizeof(client));
		/*
		 * Try successive two digit serial numbers (00-99) until the
		 * bind stops failing with EADDRINUSE or we have wrapped all
		 * the way around to where we started.
		 */
		firstserial = serial;
		do {
			if (++serial >= 100) serial = 0;
			sa->sa_data[19] = (serial / 10) + '0';
			sa->sa_data[20] = (serial % 10) + '0';
			error = sobind(nmp->nm_so, m);
			if (firstserial == serial) break;
		} while (error == EADDRINUSE);
		m_freem(m);
		if (error)
			goto bad;
	}

	if (error = soconnect(nmp->nm_so, saddr))
		goto bad;
	error = soreserve(nmp->nm_so,	/* get space ! */
				nmp->nm_wsize + 1024,		/* one out */
				(nmp->nm_rsize + 1024) * 4);	/* four in */
	if (error)
		goto bad;

	/*
	 * Search mount list for existing server entry.
	 *
	 * Note, even though we have a sockaddr, it is not quite reliable
	 * enough to bcmp against. For instance, a sockaddr_in has a
	 * sin_zero field which is not reliably zeroed by user code (e.g.
	 * mount). So what we do as an attempt at transport independence
	 * is to get the peeraddr of our connected socket into a zeroed
	 * sockaddr. Then we cache that and compare against it. This is
	 * not exactly perfect. However it is not critical that it be, if
	 * we cannot match the sockaddr we will simply allocate a new nfshp
	 * per mount, which will disable the per-host congestion but
	 * everything else will work as normal.
	 */
	m = m_getclr(M_WAIT, MT_SONAME);
	if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR,
				(struct mbuf *)0, m, (struct mbuf *)0) == 0) {
		/* Peer address obtained: it replaces the caller's sockaddr */
		m_freem(saddr);
		saddr = m;
	} else
		m_freem(m);
	srvaddrlen = saddr->m_len;

	s = splnet();

	/* Look for a host entry with the same address length and bytes */
	for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) {
		if (srvaddrlen != nfshp->nh_salen)
			continue;
		if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t),
				srvaddrlen))
			break;
	}
	if (nfshp)		/* Have an existing mount host */
		m_freem(saddr);
	else {
		MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK);
		bzero((caddr_t)nfshp, sizeof *nfshp);
		/* The new entry takes over ownership of saddr */
		nfshp->nh_sockaddr = saddr;
		nfshp->nh_salen = srvaddrlen;
		/* Initialize other non-zero congestion variables */
		nfshp->nh_currto = NFS_TIMEO;
		nfshp->nh_window = 1;		    /* Initial send window */
		nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */
		if (nfshosth) nfshosth->nh_prev = nfshp;	/* Chain in */
		nfshp->nh_next = nfshosth;
		nfshosth = nfshp;
	}
	nfshp->nh_refcnt++;
	splx(s);
	nmp->nm_hostinfo = nfshp;
	/* A mount still at the default timeout inherits the host's estimate */
	if (nmp->nm_rto == NFS_TIMEO) {
		nmp->nm_rto = nfshp->nh_currto;
		nmp->nm_rttvar = nmp->nm_rto << 1;
	}
	return (0);

bad:
	if (nmp->nm_so) (void) soclose(nmp->nm_so);
	nmp->nm_so = 0;
	return (error);
}

/*
 * NFS disconnect. Clean up and unlink.
 *
 * Closes the mount's socket (if any) and drops the mount's reference on
 * its nfshost entry.  When the last reference goes away the entry is
 * taken off the nfshosth list and freed along with its cached sockaddr.
 */
nfs_disconnect(nmp)
	register struct nfsmount *nmp;
{
	register struct nfshost *host;
	int s;

	if (nmp->nm_so)
		soclose(nmp->nm_so);
	nmp->nm_so = 0;

	host = nmp->nm_hostinfo;
	if (host == 0)
		return;

	s = splnet();
	host->nh_refcnt--;
	if (host->nh_refcnt <= 0) {
		/* Unlink from the doubly linked host list */
		if (host->nh_next)
			host->nh_next->nh_prev = host->nh_prev;
		if (host->nh_prev)
			host->nh_prev->nh_next = host->nh_next;
		else
			nfshosth = host->nh_next;
		/* If unix family, remove the nfsclient from /tmp */
		if (mtod(host->nh_sockaddr,
			struct sockaddr *)->sa_family == AF_UNIX) {
				/* Lookup sa_data, do VOP_REMOVE... */
		}
		m_freem(host->nh_sockaddr);
		FREE(host, M_NFSMNT);
	}
	nmp->nm_hostinfo = 0;
	splx(s);
}

/*
 * This is a stripped down non-interruptible version of sosend().
 */
nfs_send(so, nam, top, flags, siz)
	register struct socket *so;
	struct mbuf *nam;
	struct mbuf *top;
	int flags;
	int siz;
{
	int error, s;

#ifdef MGETHDR
	top->m_pkthdr.len = siz;
#endif
	for (;;) {
		nfs_sblock(&so->so_snd);
		s = splnet();
		if (error = nfs_sockerr(so, 1)) {
			splx(s);
			m_freem(top);
			break;
		}
		if (sbspace(&so->so_snd) < siz) {
			sbunlock(&so->so_snd);
			nfs_sbwait(&so->so_snd);
			splx(s);
			continue;
		}
		error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top,
			(struct mbuf *)nam, (struct mbuf *)0);
		splx(s);
		break;
	}
	sbunlock(&so->so_snd);
	return (error);
}

/*
 * This is a stripped down datagram specific version of soreceive()
 *
 * Waits until the receive buffer of so holds data, then dequeues the
 * first record:
 *  - the leading MT_SONAME mbuf is returned through *aname, or freed
 *    when aname is null;
 *  - one MT_CONTROL mbuf, if present, is discarded;
 *  - the rest of the record is returned through *mp.
 * The msk and mtch arguments are not used here.
 *
 * Returns 0 on success.  If the buffer is empty and nfs_sockerr()
 * reports an error, that error is returned and *mp is not set.
 */
nfs_dgreceive(so, msk, mtch, aname, mp)
	register struct socket *so;
	u_long msk;
	u_long mtch;
	struct mbuf **aname;
	struct mbuf **mp;
{
	register struct mbuf *m;
	int s, error = 0;
	struct mbuf *nextrecord;

	if (aname)
		*aname = 0;

	for (;;) {
		sblock(&so->so_rcv);
		s = splnet();

		if (so->so_rcv.sb_cc == 0) {
			/* Nothing queued: fail on socket error, else wait */
			if (error = nfs_sockerr(so, 0)) {
				so->so_error = 0;
				break;
			}
			sbunlock(&so->so_rcv);
			sbwait(&so->so_rcv);
			splx(s);
			continue;
		}
		m = so->so_rcv.sb_mb;
		if (m == 0)
			panic("nfs_dgreceive 1");
		/* Remember the following record before the chain is cut up */
		nextrecord = m->m_nextpkt;
		/* Save sender's address */
		if (m->m_type != MT_SONAME)
			panic("nfs_dgreceive 1a");
		sbfree(&so->so_rcv, m);
		if (aname) {
			/* Detach the address mbuf and give it to the caller */
			*aname = m;
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = 0;
			m = so->so_rcv.sb_mb;
		} else {
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
		/* Drop control mbuf's */
		if (m && m->m_type == MT_RIGHTS)
			panic("nfs_dgreceive 2");
		if (m && m->m_type == MT_CONTROL) {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
		/* Dequeue packet from sockbuf */
		*mp = m;
		/* Take every data mbuf of the record out of the accounting */
		while (m) {
			if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
				panic("nfs_dgreceive 3");
			sbfree(&so->so_rcv, m);
			m = so->so_rcv.sb_mb = m->m_next;
		}
		so->so_rcv.sb_mb = nextrecord;
		/* Return */
		break;
	}
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

/*
 * The first two words of a received RPC message, as copied out of the
 * mbuf chain by nfs_dgreply().
 */
struct rpc_replyhead {
	u_long	r_xid;		/* transaction id, matched against nfsreq r_xid */
	u_long	r_rep;		/* message type, compared against rpc_reply */
};

/*
 * Implement NFS client side datagram receive.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address, followed by optional MT_CONTROL mbuf
 * and then zero or more mbufs of data.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 *
 * so    - the mount's socket
 * mntp  - the mount the request belongs to
 * myrep - the request whose reply is being waited for
 *
 * Replies for other requests on the same socket are attached to those
 * requests (r_mrep) and their sleepers woken; datagrams that are too
 * short, are not RPC replies, match no request, or duplicate an already
 * answered request are dropped.
 *
 * Returns 0 once myrep->r_mrep holds the reply, ETIMEDOUT when
 * r_rexmit has passed r_retry, EINTR when a signal is pending on a
 * mount with NFSMNT_INT, or the error reported by nfs_sockerr().
 */
nfs_dgreply(so, mntp, myrep)
	register struct socket *so;
	struct nfsmount *mntp;
	struct nfsreq *myrep;
{
	register struct mbuf *m;
	register struct nfsreq *rep;
	register int error = 0, s;
	int logged = 0;		/* non-zero once "not responding" was printed */
	struct mbuf *nextrecord;
	struct rpc_replyhead replyh;

restart:
	nfs_sblock(&so->so_rcv);
	s = splnet();
	/* Already received and queued for us, bye bye */
	if (myrep->r_mrep != NULL) {
		error = 0;
		goto release;
	}
	/* If we have run out of retries (hard mounts have bogus count) */
	if (myrep->r_rexmit > myrep->r_retry) {
		error = ETIMEDOUT;
		nfsstats.rpctimeouts++;
giveup:
		/* Stop timing and un-count the request before bailing out */
		if (myrep->r_flags & R_TIMING) {
			myrep->r_flags &= ~R_TIMING;
			mntp->nm_rtt = -1;
		}
		if (myrep->r_flags & R_SENT) {
			myrep->r_flags &= ~R_SENT;
			--mntp->nm_hostinfo->nh_sent;
			/* If count now 0, want to initiate new req */
		}
		goto release;
	}

	m = so->so_rcv.sb_mb;
	if (m == 0) {
		/* Nothing queued: check for errors and signals, then sleep */
		if (so->so_rcv.sb_cc)
			panic("nfs_soreply 1");
		if (error = nfs_sockerr(so, 0)) {
			so->so_error = 0;
			goto giveup;
		}
		/* Allow signals to interrupt request? (nfs_timer wakes up) */
		if ((mntp->nm_flag & NFSMNT_INT) &&
		    u.u_procp->p_sig & ~u.u_procp->p_sigmask) {
			error = EINTR;
			goto giveup;
		}
		/* Tell the user once per call when the server looks dead */
		if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0)
			uprintf("NFS server %s not responding, retrying\n",
				mntp->nm_host);
		sbunlock(&so->so_rcv);
		nfs_sbwait(&so->so_rcv);
		splx(s);
		goto restart;
	}

	/*
	 * Take off the address, check for rights and ditch any control
	 * mbufs.
	 */
	nextrecord = m->m_nextpkt;
	if (m->m_type != MT_SONAME)
		panic("nfs reply SONAME");
	sbfree(&so->so_rcv, m);
	MFREE(m, so->so_rcv.sb_mb);
	m = so->so_rcv.sb_mb;
	if (m && m->m_type == MT_RIGHTS)
		panic("nfs reply RIGHTS");
	if (m && m->m_type == MT_CONTROL) {
		sbfree(&so->so_rcv, m);
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
	}
	if (m) {
		/* Data mbuf now heads the record, so relink the next record */
		m->m_nextpkt = nextrecord;
	} else {
		/* Record held no data at all, move on to the next one */
		so->so_rcv.sb_mb = nextrecord;
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	/*
	 * Get the xid and check that it is an rpc reply
	 */
	if (m->m_len >= sizeof replyh)
		bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh);
	else {
		/* Header is split over several mbufs, gather it piecewise */
		struct mbuf *mp = m;
		caddr_t cp = (caddr_t)&replyh;
		int cnt = sizeof replyh;
		do {
			if (mp->m_len > 0) {
				int xfer = (mp->m_len >= cnt) ? cnt : mp->m_len;
				bcopy(mtod(mp, caddr_t), cp, xfer);
				cnt -= xfer;
				cp += xfer;
			}
			if (cnt > 0)
				mp = mp->m_next;
		} while (mp && cnt > 0);
		if (mp == NULL) {		/* Insufficient length */
			nfsstats.rpcinvalid++;
			goto dropit;
		}
	}
	if (replyh.r_rep != rpc_reply) {	/* Not a reply */
		nfsstats.rpcinvalid++;
		goto dropit;
	}
	/*
	 * Loop through the request list to match up the reply
	 * If no match, just drop the datagram
	 */
	if (rep = nfsreqh.r_next) {
	    while (rep != &nfsreqh) {
		/* The socket, being connected, will only queue matches */
		if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) {
			/* Found it.. */
			if (rep->r_mrep)	/* Already there - duplicate */
				break;
			rep->r_mrep = m;
			/* Remove the record's data from the sockbuf accounting */
			while (m) {
				if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
					panic("nfs_soreply 3");
				sbfree(&so->so_rcv, m);
				m = so->so_rcv.sb_mb = m->m_next;
			}
			so->so_rcv.sb_mb = nextrecord;
			if (rep->r_flags & R_TIMING) {
				nfs_updatetimer(mntp);
				rep->r_flags &= ~R_TIMING;
				mntp->nm_rtt = -1;	/* re-arm timer */
			}
			if (rep->r_flags & R_SENT) {
				rep->r_flags &= ~R_SENT;
				--mntp->nm_hostinfo->nh_sent;
				/* If count now 0, want to initiate new req */
			}
			if (rep == myrep) {		/* This is success */
				if (logged)
					uprintf("NFS server %s responded\n",
						mntp->nm_host);
				goto release;
			}
			/* Else wake up other sleeper and wait for next */
			sbunlock(&so->so_rcv);
			sorwakeup(so);
			splx(s);
			goto restart;
		}
		rep = rep->r_next;
	    }
	}
	/* If not matched to request, drop it */
	nfsstats.rpcunexpected++;
dropit:
	sbdroprecord(&so->so_rcv);
	sbunlock(&so->so_rcv);
	splx(s);
	goto restart;

release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_dgreply() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 *
 * vp    - vnode the request is for (recorded in the request)
 * mreq  - mbuf chain holding the complete RPC request
 * xid   - transaction id used to match the reply
 * idem  - multiplier of NFS_MINTIMEO by which the request's timer is
 *	   started below zero (0 for idempotent requests)
 * mp    - mount point; its nfsmount supplies socket and timer state
 * mrp, mdp, dposp - on success receive the reply chain, the mbuf being
 *	   parsed and the position in it just past the NFS status word
 *
 * Returns 0 on success; otherwise the error from nfs_dgreply(),
 * EOPNOTSUPP or EACCES for a denied call, EPROTONOSUPPORT for a non-zero
 * accept status, the NFS status carried in the reply, or whatever the
 * nfsm_* parsing macros leave in error when they jump to nfsmout.
 * The reply is freed on the error paths visible here.
 */
nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp)
	struct vnode *vp;
	struct mbuf *mreq;
	u_long xid;
	int idem;
	struct mount *mp;
	struct mbuf **mrp;
	struct mbuf **mdp;
	caddr_t *dposp;
{
	register struct mbuf *m, *mrep;
	register struct nfsreq *rep;
	register u_long *p;
	register int len;
	struct nfsmount *mntp;
	struct mbuf *md;
	struct nfsreq *reph;
	caddr_t dpos;
	char *cp2;
	int t1;
	int s;
	int error;

	mntp = vfs_to_nfs(mp);
	m = mreq;
	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
	rep->r_xid = xid;
	rep->r_mntp = mntp;
	rep->r_vp = vp;
	if (mntp->nm_flag & NFSMNT_SOFT)
		rep->r_retry = mntp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_flags = rep->r_rexmit = 0;
	/* Idempotency: add N * MINTIMEO to requests if not, else use 0 */
	rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO);
	rep->r_mrep = NULL;
	rep->r_mreq = m;
	/* Total up the request length for the send space checks */
	len = 0;
	while (m) {
		len += m->m_len;
		m = m->m_next;
	}
	rep->r_msiz = len;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	s = splnet();
	/* Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first. */
	reph = &nfsreqh;
	if (reph->r_prev == NULL) {
		/* List head never used yet: this is the first element */
		reph->r_next = rep;
		rep->r_prev = reph;
	} else {
		reph->r_prev->r_next = rep;
		rep->r_prev = reph->r_prev;
	}
	reph->r_prev = rep;
	rep->r_next = reph;
	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (mntp->nm_hostinfo->nh_sent > 0 &&
	    (mntp->nm_hostinfo->nh_currexmit != 0 ||
	     mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) {
		splx(s);
		goto skipsend;
	}
	++mntp->nm_hostinfo->nh_sent;	/* Inconsistent if can't NFSMCOPY */
	rep->r_flags |= R_SENT;		/* But not a catastrophe */
	/* nm_rtt of -1 means no request is being timed, so time this one */
	if (mntp->nm_rtt == -1) {
		mntp->nm_rtt = 0;
		rep->r_flags |= R_TIMING;
	}
	splx(s);

	/*
	 * If we can get a packet to send, send it off...
	 * otherwise the timer will retransmit later
	 */
	m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT);
	if (m != NULL)
		(void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len);
	/*
	 * Wait for the reply from our send or the timer's.
	 */
skipsend:
	error = nfs_dgreply(mntp->nm_so, mntp, rep);

	/*
	 * RPC done, unlink the request.
	 */
	s = splnet();
	rep->r_prev->r_next = rep->r_next;
	rep->r_next->r_prev = rep->r_prev;
	splx(s);
	m_freem(rep->r_mreq);
	mrep = md = rep->r_mrep;
	FREE((caddr_t)rep, M_NFSREQ);
	if (error)
		return (error);

	/*
	 * break down the rpc header and check if ok
	 */
	dpos = mtod(md, caddr_t);
	nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED);
	p += 2;		/* skip the xid and message type words */
	if (*p++ == rpc_msgdenied) {
		if (*p == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;
		m_freem(mrep);
		return (error);
	}
	/*
	 * skip over the auth_verf, someday we may want to cache auth_short's
	 * for nfs_reqhead(), but for now just dump it
	 */
	if (*++p != 0) {
		len = nfsm_rndup(fxdr_unsigned(long, *p));
		nfsm_adv(len);
	}
	nfsm_disect(p, u_long *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*p == 0) {
		/* The next word is the NFS status of the call itself */
		nfsm_disect(p, u_long *, NFSX_UNSIGNED);
		if (*p != 0) {
			error = fxdr_unsigned(int, *p);
			m_freem(mrep);
			return (error);
		}
		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		return (0);
	}
	m_freem(mrep);
	return (EPROTONOSUPPORT);
nfsmout:
	/* NOTE(review): reached only from within the nfsm_* macros */
	return (error);
}

/*
 * Get a request for the server main loop
 * - receive a request via nfs_dgreceive()
 * - verify it
 * - fill in the cred struct.
 *
 * so       - socket to receive on
 * prog, vers - expected program and version words (compared as received)
 * maxproc  - highest acceptable procedure number
 * nam      - receives the mbuf holding the sender's address
 * mrp, mdp, dposp - receive the request chain, the mbuf being parsed and
 *	      the position in it just past the RPC header
 * retxid   - receives the transaction id word of the request
 * proc     - receives the procedure number
 * cr       - credential filled in from the AUTH_UNIX credentials
 * msk, mtch - passed through to nfs_dgreceive()
 *
 * Returns 0 on success.  For NFSPROC_NULL only *mrp is set.  On failure
 * the request chain is freed and one of ERPCMISMATCH, EPROGUNAVAIL,
 * EPROGMISMATCH, EPROCUNAVAIL or EBADRPC is returned, or the error from
 * nfs_dgreceive() or from the nfsm_* parsing macros.
 *
 * All lengths taken from the request are range checked before use, since
 * they come straight off the network: a negative or oversized value must
 * not be allowed to move the parse position backwards or to produce a
 * bogus group count.
 */
nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr,
	   msk, mtch)
	struct socket *so;
	u_long prog;
	u_long vers;
	int maxproc;
	struct mbuf **nam;
	struct mbuf **mrp;
	struct mbuf **mdp;
	caddr_t *dposp;
	u_long *retxid;
	u_long *proc;
	register struct ucred *cr;
	u_long msk;
	u_long mtch;
{
	register int i;
	register u_long *p;
	register long t1;
	caddr_t dpos, cp2;
	int error = 0;
	struct mbuf *mrep, *md;
	int len;

	if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep))
		return (error);
	md = mrep;
	dpos = mtod(mrep, caddr_t);
	/*
	 * The ten words are: xid, message type, rpc version, program,
	 * version, procedure, credential flavor, credential length,
	 * stamp and machine name length.
	 */
	nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED);
	*retxid = *p++;
	if (*p++ != rpc_call) {
		m_freem(mrep);
		return (ERPCMISMATCH);
	}
	if (*p++ != rpc_vers) {
		m_freem(mrep);
		return (ERPCMISMATCH);
	}
	if (*p++ != prog) {
		m_freem(mrep);
		return (EPROGUNAVAIL);
	}
	if (*p++ != vers) {
		m_freem(mrep);
		return (EPROGMISMATCH);
	}
	*proc = fxdr_unsigned(u_long, *p++);
	if (*proc == NFSPROC_NULL) {
		*mrp = mrep;
		return (0);
	}
	if (*proc > maxproc || *p++ != rpc_auth_unix) {
		m_freem(mrep);
		return (EPROCUNAVAIL);
	}
	(void) fxdr_unsigned(int, *p++);	/* credential length, unused */
	/* Step over the stamp to the machine name length */
	len = fxdr_unsigned(int, *++p);
	/* RFC 1057 limits the AUTH_UNIX machine name to 255 bytes */
	if (len < 0 || len > 255) {
		m_freem(mrep);
		return (EBADRPC);
	}
	nfsm_adv(nfsm_rndup(len));
	nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED);
	cr->cr_uid = fxdr_unsigned(uid_t, *p++);
	cr->cr_gid = fxdr_unsigned(gid_t, *p++);
	len = fxdr_unsigned(int, *p);
	/* Supplementary group count; a negative count is just as bad */
	if (len < 0 || len > 10) {
		m_freem(mrep);
		return (EBADRPC);
	}
	/* The groups, plus the verifier's flavor and length words */
	nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED);
	for (i = 1; i <= len; i++)
		cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++);
	cr->cr_ngroups = len + 1;
	/*
	 * Do we have any use for the verifier.
	 * According to the "Remote Procedure Call Protocol Spec." it
	 * should be AUTH_NULL, but some clients make it AUTH_UNIX?
	 * For now, just skip over it
	 */
	len = fxdr_unsigned(int, *++p);
	/* RFC 1057 limits an opaque_auth body to 400 bytes */
	if (len < 0 || len > 400) {
		m_freem(mrep);
		return (EBADRPC);
	}
	if (len > 0)
		nfsm_adv(nfsm_rndup(len));
	*mrp = mrep;
	*mdp = md;
	*dposp = dpos;
	return (0);
nfsmout:
	return (error);
}

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 *
 * siz    - expected size of the reply body
 * retxid - transaction id word to place in the reply
 * err    - ERPCMISMATCH builds a "denied" reply; EPROGUNAVAIL,
 *	    EPROGMISMATCH and EPROCUNAVAIL build an "accepted" reply
 *	    carrying the matching RPC error; any other value builds a
 *	    successful RPC reply followed by err as the status word,
 *	    except that VNOVAL suppresses the status word
 * mrq, mbp, bposp - receive the head of the reply chain, the mbuf
 *	    currently being filled and the build position in it
 *
 * Always returns 0.
 */
nfs_rephead(siz, retxid, err, mrq, mbp, bposp)
	int siz;
	u_long retxid;
	int err;
	struct mbuf **mrq;
	struct mbuf **mbp;
	caddr_t *bposp;
{
	register u_long *p;
	register long t1;
	caddr_t bpos;
	struct mbuf *mreq, *mb, *mb2;

	NFSMGETHDR(mreq);
	mb = mreq;
	/* Use a cluster when header plus body will not fit a header mbuf */
	if ((siz+RPC_REPLYSIZ) > MHLEN)
		NFSMCLGET(mreq, M_WAIT);
	p = mtod(mreq, u_long *);
	/* Both forms of reply header built below are six words long */
	mreq->m_len = 6*NFSX_UNSIGNED;
	bpos = ((caddr_t)p)+mreq->m_len;
	*p++ = retxid;
	*p++ = rpc_reply;
	if (err == ERPCMISMATCH) {
		*p++ = rpc_msgdenied;
		*p++ = rpc_mismatch;
		*p++ = txdr_unsigned(2);
		*p = txdr_unsigned(2);
	} else {
		*p++ = rpc_msgaccepted;
		/* Two zero words for the verifier */
		*p++ = 0;
		*p++ = 0;
		switch (err) {
		case EPROGUNAVAIL:
			*p = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*p = txdr_unsigned(RPC_PROGMISMATCH);
			/* Followed by two more words, both 2 for now */
			nfsm_build(p, u_long *, 2*NFSX_UNSIGNED);
			*p++ = txdr_unsigned(2);
			*p = txdr_unsigned(2);	/* someday 3 */
			break;
		case EPROCUNAVAIL:
			*p = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		default:
			*p = 0;
			/* Append the status word unless told not to */
			if (err != VNOVAL) {
				nfsm_build(p, u_long *, NFSX_UNSIGNED);
				*p = txdr_unsigned(err);
			}
			break;
		};
	}
	*mrq = mreq;
	*mbp = mb;
	*bposp = bpos;
	if (err != 0 && err != VNOVAL)
		nfsstats.srvrpc_errs++;
	return (0);
}

/*
 * Nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out
 * To avoid retransmission attempts on STREAM sockets (in the future) make
 * sure to set the r_retry field to 0 (implies nm_retry == 0).
 *
 * For each outstanding request the timer is advanced; once it reaches
 * the mount's nm_rto without a reply the retransmit timeout is backed
 * off, and the request is sent again provided the socket is healthy,
 * has send space, the retry limit has not been passed and the host's
 * send window allows it.  The routine reschedules itself with timeout()
 * every hz/NFS_HZ ticks.
 */
nfs_timer()
{
	register struct nfsreq *rep;
	register struct mbuf *m;
	register struct socket *so;
	register struct nfsmount *mntp;
	int s, error;

	s = splnet();
	rep = nfsreqh.r_next;
	/* r_next is null until the first request has been queued */
	if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) {
		mntp = rep->r_mntp;
		if (rep->r_flags & R_TIMING)	/* update rtt in mount */
			mntp->nm_rtt++;
		/* If not timed out or reply already received, skip */
		if (++rep->r_timer < mntp->nm_rto || rep->r_mrep)
			continue;
		/* Do backoff and save new timeout in mount */
		if (rep->r_flags & R_TIMING) {
			nfs_backofftimer(mntp);
			rep->r_flags &= ~R_TIMING;
			mntp->nm_rtt = -1;
		}
		/* The timed out transmission no longer counts as in flight */
		if (rep->r_flags & R_SENT) {
			rep->r_flags &= ~R_SENT;
			--mntp->nm_hostinfo->nh_sent;
		}
		/* Check state of socket, cf nfs_send */
		so = mntp->nm_so;
		if (error = nfs_sockerr(so, 1))
			goto wakeup;
		if (sbspace(&so->so_snd) < rep->r_msiz)
			goto wakeup;
		/* Check for too many retries, cf nfs_dgreply */
		if (++rep->r_rexmit > NFS_MAXREXMIT)	/* clip */
			rep->r_rexmit = NFS_MAXREXMIT;
		if (rep->r_rexmit > rep->r_retry)	/* too many */
			goto wakeup;
		/* Check for congestion control, cf nfs_request */
		if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)
			goto wakeup;
		/* Send it! */
		m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT);
		if (m == NULL)
			goto wakeup;
		nfsstats.rpcretries++;
#ifdef MGETHDR
		m->m_pkthdr.len = rep->r_msiz;
#endif
		(void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
			(struct mbuf *)0, (struct mbuf *)0);

		/* We need to time the request even though we're
		 * retransmitting, in order to maintain backoff. */
		mntp->nm_rtt = 0;
		++mntp->nm_hostinfo->nh_sent;
		rep->r_flags |= (R_SENT|R_TIMING);
		rep->r_timer = rep->r_timerinit;
wakeup:
		/*
		 * error was set by the nfs_sockerr() call above on every
		 * path that reaches this label.
		 */
		/* If error or interruptible mount, give user a look */
		if (error || (mntp->nm_flag & NFSMNT_INT))
			sorwakeup(so);
	}
	splx(s);
	timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ);
}

/*
 * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is
 * used here. The timer state is held in the nfsmount structure and
 * a single request is used to clock the response. When successful
 * the rtt smoothing in nfs_updatetimer is used, when failed the backoff
 * is done by nfs_backofftimer. We also log failure messages in these
 * routines.
 *
 * Congestion variables are held in the nfshost structure which
 * is referenced by nfsmounts and shared per-server. This separation
 * makes it possible to do per-mount timing which allows varying disk
 * access times to be dealt with, while preserving a network oriented
 * congestion control scheme.
 *
 * The windowing implements the Jacobson/Karels slowstart algorithm
 * with adjusted scaling factors. We start with one request, then send
 * 4 more after each success until the ssthresh limit is reached, then
 * we increment at a rate proportional to the window. On failure, we
 * remember 3/4 the current window and clamp the send limit to 1. Note
 * ICMP source quench is not reflected in so->so_error so we ignore that
 * for now.
 *
 * NFS behaves much more like a transport protocol with these changes,
 * shedding the teenage pedal-to-the-metal tendencies of "other"
 * implementations.
 *
 * Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
 */

/*
 * The TCP algorithm was not forgiving enough. Because the NFS server
 * responds only after performing lookups/diskio/etc, we have to be
 * more prepared to accept a spiky variance. The TCP algorithm is:
 * TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1)
 *
 * NFS_RTO(mntp) instead yields the mount's nm_srtt divided by eight plus
 * the whole of nm_rttvar; callers clip the result themselves.
 */
#define NFS_RTO(mntp)	(((mntp)->nm_srtt >> 3) + (mntp)->nm_rttvar)

/*
 * Called when the reply to the request being timed has arrived: fold the
 * measured round trip (nm_rtt) into the mount's smoothed estimates,
 * recompute the retransmit timeout and open up the per-host send window.
 * If the request had been retransmitted the measurement is discarded and
 * only the retransmit counters are cleared.
 */
nfs_updatetimer(mntp)
	register struct nfsmount *mntp;
{
	register struct nfshost *nfshp = mntp->nm_hostinfo;

	/* If retransmitted, clear and return */
	if (mntp->nm_rexmit || nfshp->nh_currexmit) {
		/* Pairs with the "not responding" message of the backoff */
		if (nfshp->nh_currexmit >= nfsrexmtthresh)
			nfs_log("NFS server %s OK\n", mntp->nm_host);
		mntp->nm_rexmit = nfshp->nh_currexmit = 0;
		return;
	}
	/* If have a measurement, do smoothing */
	if (mntp->nm_srtt) {
		register short delta;
		/* Error between the sample and the estimate (nm_srtt >> 3) */
		delta = mntp->nm_rtt - (mntp->nm_srtt >> 3);
		if ((mntp->nm_srtt += delta) <= 0)
			mntp->nm_srtt = 1;
		if (delta < 0)
			delta = -delta;
		/* Move the variance towards the magnitude of the error */
		delta -= (mntp->nm_rttvar >> 2);
		if ((mntp->nm_rttvar += delta) <= 0)
			mntp->nm_rttvar = 1;
	/* Else initialize */
	} else {
		mntp->nm_rttvar = mntp->nm_rtt << 1;
		if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2;
		mntp->nm_srtt = mntp->nm_rttvar << 2;
	}
	/* Compute new Retransmission TimeOut and clip */
	mntp->nm_rto = NFS_RTO(mntp);
	if (mntp->nm_rto < NFS_MINTIMEO)
		mntp->nm_rto = NFS_MINTIMEO;
	else if (mntp->nm_rto > NFS_MAXTIMEO)
		mntp->nm_rto = NFS_MAXTIMEO;
	nfshp->nh_currto = mntp->nm_rto;

	/* Update window estimate */
	if (nfshp->nh_window < nfshp->nh_ssthresh)	/* quickly */
		nfshp->nh_window += 4;
	else {						/* slowly */
		/*
		 * Count successes in nh_winext; the window grows by one
		 * once the square of that count reaches the window size.
		 */
		register long incr = ++nfshp->nh_winext;
		incr = (incr * incr) / nfshp->nh_window;
		if (incr > 0) {
			nfshp->nh_winext = 0;
			++nfshp->nh_window;
		}
	}
	if (nfshp->nh_window > NFS_MAXWINDOW)
		nfshp->nh_window = NFS_MAXWINDOW;
}

/*
 * Called when the request being timed has timed out: count the
 * retransmission, back the retransmit timeout off exponentially, note
 * (and possibly log) that the server is not answering, and shut the
 * per-host send window down to a single request.
 */
nfs_backofftimer(mntp)
	register struct nfsmount *mntp;
{
	register struct nfshost *host = mntp->nm_hostinfo;
	register unsigned long backoff;

	/* Count the retransmit, keeping it usable as a shift count */
	mntp->nm_rexmit++;
	if (mntp->nm_rexmit > 8 * sizeof mntp->nm_rto)
		mntp->nm_rexmit = 8 * sizeof mntp->nm_rto;

	/* Double the base timeout once per retransmit beyond the first */
	backoff = NFS_RTO(mntp);
	backoff = backoff << (mntp->nm_rexmit - 1);
	if (!(backoff != 0 && backoff <= NFS_MAXTIMEO))
		backoff = NFS_MAXTIMEO;
	host->nh_currto = backoff;
	mntp->nm_rto = host->nh_currto;

	/* If too many retries, message, assume a bogus RTT and re-measure */
	if (host->nh_currexmit < mntp->nm_rexmit) {
		host->nh_currexmit = mntp->nm_rexmit;
		if (host->nh_currexmit == nfsrexmtthresh) {
			nfs_log("NFS server %s not responding\n",
							mntp->nm_host);
			mntp->nm_rttvar += (mntp->nm_srtt >> 2);
			mntp->nm_srtt = 0;
		}
		/* The routing invalidation should be a usrreq PRU */
		if (host->nh_currexmit >= nfsrexmtthresh &&
		    mtod(host->nh_sockaddr,
			struct sockaddr *)->sa_family == AF_INET)
			in_losing(mntp->nm_so->so_pcb);
	}

	/* Close down window but remember this point (3/4 current) for later */
	host->nh_ssthresh = ((host->nh_window << 1) + host->nh_window) >> 2;
	host->nh_winext = 0;
	host->nh_window = 1;
}

/*
 * Not all errors are fatal. The closed checks deal
 * with errors a little strangely.
 *
 * Returns the error to act upon for socket so, or 0 if there is none.
 * sending is non-zero when checking before a send, zero for a receive.
 * Network/host down or unreachable errors are cleared and ignored.
 */

nfs_sockerr(so, sending)
	struct socket *so;
	int sending;
{
	/* A socket that can send no more is a broken pipe for senders */
	if (sending) {
		if (so->so_state & SS_CANTSENDMORE) {
			so->so_error = EPIPE;
			return (EPIPE);
		}
	}

	if (so->so_error == ENETDOWN || so->so_error == ENETUNREACH ||
	    so->so_error == EHOSTDOWN || so->so_error == EHOSTUNREACH) {
		/* inhibit certain errors */
		so->so_error = 0;
	} else if (so->so_error != 0) {
		/* return all others */
		printf("nfs_sockerr: error %d on %s\n", so->so_error,
			sending?"send":"receive");
		return (so->so_error);
	}

	/* A receiver on a socket that can receive no more gets EPIPE */
	if (!sending && (so->so_state & SS_CANTRCVMORE)) {
		so->so_error = 0;		/* (no error) */
		return (EPIPE);
	}
	return (so->so_error);
}