/*
 * Copyright (c) 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley. The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 *	@(#)nfs_socket.c	7.7 (Berkeley) 03/06/90
 */

/*
 * Socket operations for use by nfs (similar to uipc_socket.c, but never
 * with copies to/from a uio vector)
 * NB: For now, they only work for datagram sockets.
2738414Smckusick * (Use on stream sockets would require some record boundary mark in the 2839754Smckusick * stream as defined by "RPC: Remote Procedure Call Protocol 2939754Smckusick * Specification" RFC1057 Section 10) 3038414Smckusick * and different versions of send, receive and reply that do not assume 3138414Smckusick * an atomic protocol 3238414Smckusick */ 3338414Smckusick 3438414Smckusick #include "types.h" 3538414Smckusick #include "param.h" 3638414Smckusick #include "uio.h" 3738414Smckusick #include "user.h" 3840117Smckusick #include "proc.h" 3940117Smckusick #include "signal.h" 4038414Smckusick #include "mount.h" 4138414Smckusick #include "kernel.h" 4238414Smckusick #include "malloc.h" 4338414Smckusick #include "mbuf.h" 4438414Smckusick #include "vnode.h" 4538414Smckusick #include "domain.h" 4638414Smckusick #include "protosw.h" 4738414Smckusick #include "socket.h" 4838414Smckusick #include "socketvar.h" 4938414Smckusick #include "rpcv2.h" 5038414Smckusick #include "nfsv2.h" 5138414Smckusick #include "nfs.h" 5238414Smckusick #include "xdr_subs.h" 5338414Smckusick #include "nfsm_subs.h" 5438414Smckusick #include "nfsmount.h" 5538414Smckusick 5640117Smckusick #include "syslog.h" 5740117Smckusick #define nfs_log(message, host) log(LOG_ERR, message, host) 5840117Smckusick 5938414Smckusick #define TRUE 1 6038414Smckusick 6138414Smckusick /* set lock on sockbuf sb, sleep at neg prio */ 6238414Smckusick #define nfs_sblock(sb) { \ 6338414Smckusick while ((sb)->sb_flags & SB_LOCK) { \ 6438414Smckusick (sb)->sb_flags |= SB_WANT; \ 6538414Smckusick sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \ 6638414Smckusick } \ 6738414Smckusick (sb)->sb_flags |= SB_LOCK; \ 6838414Smckusick } 6940117Smckusick /* 7040117Smckusick * nfs_sbwait() is simply sbwait() but at a negative priority so that it 7140117Smckusick * can not be interrupted by a signal. 
7240117Smckusick */ 7340117Smckusick nfs_sbwait(sb) 7440117Smckusick struct sockbuf *sb; 7540117Smckusick { 7640117Smckusick sb->sb_flags |= SB_WAIT; 7740117Smckusick sleep((caddr_t)&sb->sb_cc, PZERO-2); 7840117Smckusick } 7938414Smckusick 8038414Smckusick /* 8138414Smckusick * External data, mostly RPC constants in XDR form 8238414Smckusick */ 8338414Smckusick extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, 8438414Smckusick rpc_msgaccepted, rpc_call; 8538414Smckusick extern u_long nfs_prog, nfs_vers; 8638414Smckusick int nfsrv_null(), 8738414Smckusick nfsrv_getattr(), 8838414Smckusick nfsrv_setattr(), 8938414Smckusick nfsrv_lookup(), 9038414Smckusick nfsrv_readlink(), 9138414Smckusick nfsrv_read(), 9238414Smckusick nfsrv_write(), 9338414Smckusick nfsrv_create(), 9438414Smckusick nfsrv_remove(), 9538414Smckusick nfsrv_rename(), 9638414Smckusick nfsrv_link(), 9738414Smckusick nfsrv_symlink(), 9838414Smckusick nfsrv_mkdir(), 9938414Smckusick nfsrv_rmdir(), 10038414Smckusick nfsrv_readdir(), 10138414Smckusick nfsrv_statfs(), 10238414Smckusick nfsrv_noop(); 10338414Smckusick 10438414Smckusick int (*nfsrv_procs[NFS_NPROCS])() = { 10538414Smckusick nfsrv_null, 10638414Smckusick nfsrv_getattr, 10738414Smckusick nfsrv_setattr, 10838414Smckusick nfsrv_noop, 10938414Smckusick nfsrv_lookup, 11038414Smckusick nfsrv_readlink, 11138414Smckusick nfsrv_read, 11238414Smckusick nfsrv_noop, 11338414Smckusick nfsrv_write, 11438414Smckusick nfsrv_create, 11538414Smckusick nfsrv_remove, 11638414Smckusick nfsrv_rename, 11738414Smckusick nfsrv_link, 11838414Smckusick nfsrv_symlink, 11938414Smckusick nfsrv_mkdir, 12038414Smckusick nfsrv_rmdir, 12138414Smckusick nfsrv_readdir, 12238414Smckusick nfsrv_statfs, 12338414Smckusick }; 12438414Smckusick 12540117Smckusick struct nfshost *nfshosth; 12640117Smckusick struct nfsreq nfsreqh; 12740117Smckusick int nfsrexmtthresh = NFS_FISHY; 12838414Smckusick 12938414Smckusick /* 13040117Smckusick * Initialize sockets and 
per-host congestion for a new NFS connection. 13140117Smckusick * We do not free the sockaddr if error. 13238414Smckusick */ 13340117Smckusick nfs_connect(nmp, saddr) 13440117Smckusick register struct nfsmount *nmp; 13540117Smckusick struct mbuf *saddr; 13640117Smckusick { 13740117Smckusick int s, error, srvaddrlen; 13840117Smckusick struct mbuf *m; 13940117Smckusick register struct nfshost *nfshp; 14040117Smckusick 14140117Smckusick nmp->nm_so = 0; 14240117Smckusick if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family, 14340117Smckusick &nmp->nm_so, SOCK_DGRAM, 0)) 14440117Smckusick goto bad; 14540117Smckusick 14640117Smckusick /* Unix sockets do not provide a local bind for server reply */ 14740117Smckusick if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) { 14840117Smckusick struct sockaddr *sa; 14940117Smckusick static char client[] = "/tmp/.nfs/nfsclient##"; 15040117Smckusick static int serial; 15140117Smckusick int firstserial; 15240117Smckusick m = m_getclr(M_WAIT, MT_SONAME); 15340117Smckusick if (m == NULL) { 15440117Smckusick error = ENOBUFS; 15540117Smckusick goto bad; 15640117Smckusick } 15740117Smckusick m->m_len = sizeof (client) + 2; 15840117Smckusick sa = mtod(m, struct sockaddr *); 15940117Smckusick sa->sa_family = AF_UNIX; 16040117Smckusick #ifdef MSG_TRUNC /* Have sa_len to set? 
*/ 16140117Smckusick sa->sa_len = m->m_len; 16240117Smckusick #endif 16340117Smckusick bcopy(client, sa->sa_data, sizeof(client)); 16440117Smckusick firstserial = serial; 16540117Smckusick do { 16640117Smckusick if (++serial >= 100) serial = 0; 16740117Smckusick sa->sa_data[19] = (serial / 10) + '0'; 16840117Smckusick sa->sa_data[20] = (serial % 10) + '0'; 16940117Smckusick error = sobind(nmp->nm_so, m); 17040117Smckusick if (firstserial == serial) break; 17140117Smckusick } while (error == EADDRINUSE); 17240117Smckusick m_freem(m); 17340117Smckusick if (error) 17440117Smckusick goto bad; 17540117Smckusick } 17640117Smckusick 17740117Smckusick if (error = soconnect(nmp->nm_so, saddr)) 17840117Smckusick goto bad; 17940117Smckusick error = soreserve(nmp->nm_so, /* get space ! */ 18040117Smckusick nmp->nm_wsize + 1024, /* one out */ 18140117Smckusick (nmp->nm_rsize + 1024) * 4); /* four in */ 18240117Smckusick if (error) 18340117Smckusick goto bad; 18440117Smckusick 18540117Smckusick /* 18640117Smckusick * Search mount list for existing server entry. 18740117Smckusick * 18840117Smckusick * Note, even though we have a sockaddr, it is not quite reliable 18940117Smckusick * enough to bcmp against. For instance, a sockaddr_in has a 19040117Smckusick * sin_zero field which is not reliably zeroed by user code (e.g. 19140117Smckusick * mount). So what we do as an attempt at transport independence 19240117Smckusick * is to get the peeraddr of our connected socket into a zeroed 19340117Smckusick * sockaddr. Then we cache that and compare against it. This is 19440117Smckusick * not exactly perfect. However it is not critical that it be, if 19540117Smckusick * we cannot match the sockaddr we will simply allocate a new nfshp 19640117Smckusick * per mount, which will disable the per-host congestion but 19740117Smckusick * everything else will work as normal. 
19840117Smckusick */ 19940117Smckusick m = m_getclr(M_WAIT, MT_SONAME); 20040117Smckusick if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR, 20140117Smckusick (struct mbuf *)0, m, (struct mbuf *)0) == 0) { 20240117Smckusick m_freem(saddr); 20340117Smckusick saddr = m; 20440117Smckusick } else 20540117Smckusick m_freem(m); 20640117Smckusick srvaddrlen = saddr->m_len; 20740117Smckusick 20840117Smckusick s = splnet(); 20940117Smckusick 21040117Smckusick for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) { 21140117Smckusick if (srvaddrlen != nfshp->nh_salen) 21240117Smckusick continue; 21340117Smckusick if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t), 21440117Smckusick srvaddrlen)) 21540117Smckusick break; 21640117Smckusick } 21740117Smckusick if (nfshp) /* Have an existing mount host */ 21840117Smckusick m_freem(saddr); 21940117Smckusick else { 22040117Smckusick MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK); 22140117Smckusick bzero((caddr_t)nfshp, sizeof *nfshp); 22240117Smckusick nfshp->nh_sockaddr = saddr; 22340117Smckusick nfshp->nh_salen = srvaddrlen; 22440117Smckusick /* Initialize other non-zero congestion variables */ 22540117Smckusick nfshp->nh_currto = NFS_TIMEO; 22640117Smckusick nfshp->nh_window = 1; /* Initial send window */ 22740117Smckusick nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */ 22840117Smckusick if (nfshosth) nfshosth->nh_prev = nfshp; /* Chain in */ 22940117Smckusick nfshp->nh_next = nfshosth; 23040117Smckusick nfshosth = nfshp; 23140117Smckusick } 23240117Smckusick nfshp->nh_refcnt++; 23340117Smckusick splx(s); 23440117Smckusick nmp->nm_hostinfo = nfshp; 23540117Smckusick if (nmp->nm_rto == NFS_TIMEO) { 23640117Smckusick nmp->nm_rto = nfshp->nh_currto; 23740117Smckusick nmp->nm_rttvar = nmp->nm_rto << 1; 23840117Smckusick } 23940117Smckusick return (0); 24040117Smckusick 24140117Smckusick bad: 24240117Smckusick if (nmp->nm_so) (void) soclose(nmp->nm_so); 24340117Smckusick 
nmp->nm_so = 0; 24440117Smckusick return (error); 24540117Smckusick } 24640117Smckusick 24740117Smckusick /* 24840117Smckusick * NFS disconnect. Clean up and unlink. 24940117Smckusick */ 25040117Smckusick nfs_disconnect(nmp) 25140117Smckusick register struct nfsmount *nmp; 25240117Smckusick { 25340117Smckusick register struct nfshost *nfshp; 25440117Smckusick 25540117Smckusick if (nmp->nm_so) 25640117Smckusick soclose(nmp->nm_so); 25740117Smckusick nmp->nm_so = 0; 25840117Smckusick if (nfshp = nmp->nm_hostinfo) { 25940117Smckusick int s = splnet(); 26040117Smckusick if (--nfshp->nh_refcnt <= 0) { 26140117Smckusick if (nfshp->nh_next) 26240117Smckusick nfshp->nh_next->nh_prev = nfshp->nh_prev; 26340117Smckusick if (nfshp->nh_prev) 26440117Smckusick nfshp->nh_prev->nh_next = nfshp->nh_next; 26540117Smckusick else 26640117Smckusick nfshosth = nfshp->nh_next; 26740117Smckusick /* If unix family, remove the nfsclient from /tmp */ 26840117Smckusick if (mtod(nfshp->nh_sockaddr, 26940117Smckusick struct sockaddr *)->sa_family == AF_UNIX) { 27040117Smckusick /* Lookup sa_data, do VOP_REMOVE... */ 27140117Smckusick } 27240117Smckusick m_freem(nfshp->nh_sockaddr); 27340117Smckusick FREE(nfshp, M_NFSMNT); 27440117Smckusick } 27540117Smckusick nmp->nm_hostinfo = 0; 27640117Smckusick splx(s); 27740117Smckusick } 27840117Smckusick } 27940117Smckusick 28040117Smckusick /* 28140117Smckusick * This is a stripped down non-interruptible version of sosend(). 
28240117Smckusick */ 28340117Smckusick nfs_send(so, nam, top, flags, siz) 28438414Smckusick register struct socket *so; 28538414Smckusick struct mbuf *nam; 28638414Smckusick struct mbuf *top; 28738414Smckusick int flags; 28838414Smckusick int siz; 28938414Smckusick { 29040117Smckusick int error, s; 29138414Smckusick 29238414Smckusick #ifdef MGETHDR 29338414Smckusick top->m_pkthdr.len = siz; 29438414Smckusick #endif 29540117Smckusick for (;;) { 29640117Smckusick nfs_sblock(&so->so_snd); 29740117Smckusick s = splnet(); 29840117Smckusick if (error = nfs_sockerr(so, 1)) { 29940117Smckusick splx(s); 30040117Smckusick m_freem(top); 30140117Smckusick break; 30240117Smckusick } 30340117Smckusick if (sbspace(&so->so_snd) < siz) { 30440117Smckusick sbunlock(&so->so_snd); 30540117Smckusick nfs_sbwait(&so->so_snd); 30640117Smckusick splx(s); 30740117Smckusick continue; 30840117Smckusick } 30940117Smckusick error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top, 310*40327Ssklower (struct mbuf *)nam, (struct mbuf *)0); 31138414Smckusick splx(s); 31240117Smckusick break; 31338414Smckusick } 31438414Smckusick sbunlock(&so->so_snd); 31538414Smckusick return (error); 31638414Smckusick } 31738414Smckusick 31838414Smckusick /* 31940117Smckusick * This is a stripped down datagram specific version of soreceive() 32038414Smckusick */ 32140117Smckusick nfs_dgreceive(so, msk, mtch, aname, mp) 32238414Smckusick register struct socket *so; 32339754Smckusick u_long msk; 32439754Smckusick u_long mtch; 32538414Smckusick struct mbuf **aname; 32638414Smckusick struct mbuf **mp; 32738414Smckusick { 32838414Smckusick register struct mbuf *m; 32938414Smckusick int s, error = 0; 33038414Smckusick struct mbuf *nextrecord; 33138414Smckusick 33238414Smckusick if (aname) 33338414Smckusick *aname = 0; 33438414Smckusick 33540117Smckusick for (;;) { 33640117Smckusick sblock(&so->so_rcv); 33740117Smckusick s = splnet(); 33838414Smckusick 33940117Smckusick if (so->so_rcv.sb_cc == 0) { 34040117Smckusick if 
(error = nfs_sockerr(so, 0)) { 34140117Smckusick so->so_error = 0; 34240117Smckusick break; 34340117Smckusick } 34439754Smckusick sbunlock(&so->so_rcv); 34540117Smckusick sbwait(&so->so_rcv); 34639754Smckusick splx(s); 34740117Smckusick continue; 34839754Smckusick } 34938414Smckusick m = so->so_rcv.sb_mb; 35040117Smckusick if (m == 0) 35140117Smckusick panic("nfs_dgreceive 1"); 35240117Smckusick nextrecord = m->m_nextpkt; 35340117Smckusick /* Save sender's address */ 35440117Smckusick if (m->m_type != MT_SONAME) 35540117Smckusick panic("nfs_dgreceive 1a"); 35638414Smckusick sbfree(&so->so_rcv, m); 35740117Smckusick if (aname) { 35840117Smckusick *aname = m; 35940117Smckusick so->so_rcv.sb_mb = m->m_next; 36040117Smckusick m->m_next = 0; 36140117Smckusick m = so->so_rcv.sb_mb; 36240117Smckusick } else { 36340117Smckusick MFREE(m, so->so_rcv.sb_mb); 36440117Smckusick m = so->so_rcv.sb_mb; 36540117Smckusick } 36640117Smckusick /* Drop control mbuf's */ 36740117Smckusick if (m && m->m_type == MT_RIGHTS) 36840117Smckusick panic("nfs_dgreceive 2"); 36940117Smckusick if (m && m->m_type == MT_CONTROL) { 37040117Smckusick sbfree(&so->so_rcv, m); 37140117Smckusick MFREE(m, so->so_rcv.sb_mb); 37240117Smckusick m = so->so_rcv.sb_mb; 37340117Smckusick } 37440117Smckusick /* Dequeue packet from sockbuf */ 37540117Smckusick *mp = m; 37640117Smckusick while (m) { 37740117Smckusick if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 37840117Smckusick panic("nfs_dgreceive 3"); 37940117Smckusick sbfree(&so->so_rcv, m); 38040117Smckusick m = so->so_rcv.sb_mb = m->m_next; 38140117Smckusick } 38240117Smckusick so->so_rcv.sb_mb = nextrecord; 38340117Smckusick /* Return */ 38440117Smckusick break; 38538414Smckusick } 38638414Smckusick sbunlock(&so->so_rcv); 38738414Smckusick splx(s); 38838414Smckusick return (error); 38938414Smckusick } 39038414Smckusick 39138414Smckusick struct rpc_replyhead { 39238414Smckusick u_long r_xid; 39338414Smckusick u_long r_rep; 39438414Smckusick }; 
39538414Smckusick 39638414Smckusick /* 39740117Smckusick * Implement NFS client side datagram receive. 39838414Smckusick * We depend on the way that records are added to the sockbuf 39938414Smckusick * by sbappend*. In particular, each record (mbufs linked through m_next) 40038414Smckusick * must begin with an address, followed by optional MT_CONTROL mbuf 40138414Smckusick * and then zero or more mbufs of data. 40238414Smckusick * We must search through the list of received datagrams matching them 40338414Smckusick * with outstanding requests using the xid, until ours is found. 40438414Smckusick */ 40540117Smckusick nfs_dgreply(so, mntp, myrep) 40638414Smckusick register struct socket *so; 40738414Smckusick struct nfsmount *mntp; 40839344Smckusick struct nfsreq *myrep; 40938414Smckusick { 41038414Smckusick register struct mbuf *m; 41138414Smckusick register struct nfsreq *rep; 41238414Smckusick register int error = 0, s; 41340117Smckusick int logged = 0; 41438414Smckusick struct mbuf *nextrecord; 41538414Smckusick struct rpc_replyhead replyh; 41638414Smckusick 41738414Smckusick restart: 41839344Smckusick nfs_sblock(&so->so_rcv); 41940117Smckusick s = splnet(); 42040117Smckusick /* Already received and queued for us, bye bye */ 42139344Smckusick if (myrep->r_mrep != NULL) { 42240117Smckusick error = 0; 42340117Smckusick goto release; 42439344Smckusick } 42540117Smckusick /* If we have run out of retries (hard mounts have bogus count) */ 42640117Smckusick if (myrep->r_rexmit > myrep->r_retry) { 42740117Smckusick error = ETIMEDOUT; 42840117Smckusick nfsstats.rpctimeouts++; 42940117Smckusick giveup: 43040117Smckusick if (myrep->r_flags & R_TIMING) { 43140117Smckusick myrep->r_flags &= ~R_TIMING; 43240117Smckusick mntp->nm_rtt = -1; 43340117Smckusick } 43440117Smckusick if (myrep->r_flags & R_SENT) { 43540117Smckusick myrep->r_flags &= ~R_SENT; 43640117Smckusick --mntp->nm_hostinfo->nh_sent; 43740117Smckusick /* If count now 0, want to initiate new req */ 
43840117Smckusick } 43940117Smckusick goto release; 44039344Smckusick } 44138414Smckusick 44239344Smckusick m = so->so_rcv.sb_mb; 44339344Smckusick if (m == 0) { 44439344Smckusick if (so->so_rcv.sb_cc) 44539344Smckusick panic("nfs_soreply 1"); 44640117Smckusick if (error = nfs_sockerr(so, 0)) { 44738414Smckusick so->so_error = 0; 44840117Smckusick goto giveup; 44938414Smckusick } 45040117Smckusick /* Allow signals to interrupt request? (nfs_timer wakes up) */ 45140117Smckusick if ((mntp->nm_flag & NFSMNT_INT) && 45240117Smckusick u.u_procp->p_sig & ~u.u_procp->p_sigmask) { 45340117Smckusick error = EINTR; 45440117Smckusick goto giveup; 45540117Smckusick } 45640117Smckusick if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0) 45740117Smckusick uprintf("NFS server %s not responding, retrying\n", 45840117Smckusick mntp->nm_host); 45938414Smckusick sbunlock(&so->so_rcv); 46038414Smckusick nfs_sbwait(&so->so_rcv); 46138414Smckusick splx(s); 46238414Smckusick goto restart; 46338414Smckusick } 46438414Smckusick 46538414Smckusick /* 46638414Smckusick * Take off the address, check for rights and ditch any control 46738414Smckusick * mbufs. 
46838414Smckusick */ 46940117Smckusick nextrecord = m->m_nextpkt; 47038414Smckusick if (m->m_type != MT_SONAME) 47138414Smckusick panic("nfs reply SONAME"); 47238414Smckusick sbfree(&so->so_rcv, m); 47338414Smckusick MFREE(m, so->so_rcv.sb_mb); 47438414Smckusick m = so->so_rcv.sb_mb; 47538414Smckusick if (m && m->m_type == MT_RIGHTS) 47638414Smckusick panic("nfs reply RIGHTS"); 47738414Smckusick if (m && m->m_type == MT_CONTROL) { 47838414Smckusick sbfree(&so->so_rcv, m); 47938414Smckusick MFREE(m, so->so_rcv.sb_mb); 48038414Smckusick m = so->so_rcv.sb_mb; 48138414Smckusick } 48239344Smckusick if (m) { 48338414Smckusick m->m_nextpkt = nextrecord; 48439344Smckusick } else { 48539344Smckusick so->so_rcv.sb_mb = nextrecord; 48638414Smckusick sbunlock(&so->so_rcv); 48738414Smckusick splx(s); 48838414Smckusick goto restart; 48938414Smckusick } 49038414Smckusick 49138414Smckusick /* 49238414Smckusick * Get the xid and check that it is an rpc reply 49338414Smckusick */ 49440117Smckusick if (m->m_len >= sizeof replyh) 49540117Smckusick bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh); 49638414Smckusick else { 49740117Smckusick struct mbuf *mp = m; 49840117Smckusick caddr_t cp = (caddr_t)&replyh; 49940117Smckusick int cnt = sizeof replyh; 50040117Smckusick do { 50138414Smckusick if (mp->m_len > 0) { 50240117Smckusick int xfer = (mp->m_len >= cnt) ? 
cnt : mp->m_len; 50338414Smckusick bcopy(mtod(mp, caddr_t), cp, xfer); 50438414Smckusick cnt -= xfer; 50538414Smckusick cp += xfer; 50638414Smckusick } 50738414Smckusick if (cnt > 0) 50838414Smckusick mp = mp->m_next; 50940117Smckusick } while (mp && cnt > 0); 51040117Smckusick if (mp == NULL) { /* Insufficient length */ 51140117Smckusick nfsstats.rpcinvalid++; 51240117Smckusick goto dropit; 51338414Smckusick } 51438414Smckusick } 51540117Smckusick if (replyh.r_rep != rpc_reply) { /* Not a reply */ 51640117Smckusick nfsstats.rpcinvalid++; 51738414Smckusick goto dropit; 51840117Smckusick } 51938414Smckusick /* 52038414Smckusick * Loop through the request list to match up the reply 52140117Smckusick * If no match, just drop the datagram 52238414Smckusick */ 52340117Smckusick if (rep = nfsreqh.r_next) { 52440117Smckusick while (rep != &nfsreqh) { 52540117Smckusick /* The socket, being connected, will only queue matches */ 52640117Smckusick if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) { 52738414Smckusick /* Found it.. 
*/ 52840117Smckusick if (rep->r_mrep) /* Already there - duplicate */ 52940117Smckusick break; 53038414Smckusick rep->r_mrep = m; 53138414Smckusick while (m) { 53238414Smckusick if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 53338414Smckusick panic("nfs_soreply 3"); 53438414Smckusick sbfree(&so->so_rcv, m); 53538414Smckusick m = so->so_rcv.sb_mb = m->m_next; 53638414Smckusick } 53738414Smckusick so->so_rcv.sb_mb = nextrecord; 53840117Smckusick if (rep->r_flags & R_TIMING) { 53940117Smckusick nfs_updatetimer(mntp); 54040117Smckusick rep->r_flags &= ~R_TIMING; 54140117Smckusick mntp->nm_rtt = -1; /* re-arm timer */ 54240117Smckusick } 54340117Smckusick if (rep->r_flags & R_SENT) { 54440117Smckusick rep->r_flags &= ~R_SENT; 54540117Smckusick --mntp->nm_hostinfo->nh_sent; 54640117Smckusick /* If count now 0, want to initiate new req */ 54740117Smckusick } 54840117Smckusick if (rep == myrep) { /* This is success */ 54940117Smckusick if (logged) 55040117Smckusick uprintf("NFS server %s responded\n", 55140117Smckusick mntp->nm_host); 55238414Smckusick goto release; 55340117Smckusick } 55440117Smckusick /* Else wake up other sleeper and wait for next */ 55540117Smckusick sbunlock(&so->so_rcv); 55640117Smckusick sorwakeup(so); 55740117Smckusick splx(s); 55840117Smckusick goto restart; 55938414Smckusick } 56038414Smckusick rep = rep->r_next; 56140117Smckusick } 56238414Smckusick } 56340117Smckusick /* If not matched to request, drop it */ 56440117Smckusick nfsstats.rpcunexpected++; 56538414Smckusick dropit: 56640117Smckusick sbdroprecord(&so->so_rcv); 56738414Smckusick sbunlock(&so->so_rcv); 56838414Smckusick splx(s); 56938414Smckusick goto restart; 57040117Smckusick 57138414Smckusick release: 57238414Smckusick sbunlock(&so->so_rcv); 57338414Smckusick splx(s); 57438414Smckusick return (error); 57538414Smckusick } 57638414Smckusick 57738414Smckusick /* 57838414Smckusick * nfs_request - goes something like this 57938414Smckusick * - fill in request struct 
58038414Smckusick * - links it into list 58138414Smckusick * - calls nfs_sosend() for first transmit 58238414Smckusick * - calls nfs_soreceive() to get reply 58338414Smckusick * - break down rpc header and return with nfs reply pointed to 58438414Smckusick * by mrep or error 58538414Smckusick * nb: always frees up mreq mbuf list 58638414Smckusick */ 58740117Smckusick nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp) 58838414Smckusick struct vnode *vp; 58938414Smckusick struct mbuf *mreq; 59038414Smckusick u_long xid; 59140117Smckusick int idem; 59238414Smckusick struct mount *mp; 59338414Smckusick struct mbuf **mrp; 59438414Smckusick struct mbuf **mdp; 59538414Smckusick caddr_t *dposp; 59638414Smckusick { 59738414Smckusick register struct mbuf *m, *mrep; 59838414Smckusick register struct nfsreq *rep; 59938414Smckusick register u_long *p; 60038414Smckusick register int len; 60138414Smckusick struct nfsmount *mntp; 60238414Smckusick struct mbuf *md; 60339344Smckusick struct nfsreq *reph; 60438414Smckusick caddr_t dpos; 60538414Smckusick char *cp2; 60638414Smckusick int t1; 60738414Smckusick int s; 60838414Smckusick int error; 60938414Smckusick 61038414Smckusick mntp = vfs_to_nfs(mp); 61138414Smckusick m = mreq; 61238414Smckusick MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); 61338414Smckusick rep->r_xid = xid; 61438414Smckusick rep->r_mntp = mntp; 61538414Smckusick rep->r_vp = vp; 61638414Smckusick if (mntp->nm_flag & NFSMNT_SOFT) 61740117Smckusick rep->r_retry = mntp->nm_retry; 61838414Smckusick else 61940117Smckusick rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ 62040117Smckusick rep->r_flags = rep->r_rexmit = 0; 62140117Smckusick /* Idempotency: add N * MINTIMEO to requests if not, else use 0 */ 62240117Smckusick rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO); 62338414Smckusick rep->r_mrep = NULL; 62438414Smckusick rep->r_mreq = m; 62538414Smckusick len = 0; 62638414Smckusick while (m) { 62738414Smckusick len += 
m->m_len; 62838414Smckusick m = m->m_next; 62938414Smckusick } 63038414Smckusick rep->r_msiz = len; 63138414Smckusick 63240117Smckusick /* 63340117Smckusick * Do the client side RPC. 63440117Smckusick */ 63540117Smckusick nfsstats.rpcrequests++; 63640117Smckusick s = splnet(); 63740117Smckusick /* Chain request into list of outstanding requests. Be sure 63840117Smckusick * to put it LAST so timer finds oldest requests first. */ 63939344Smckusick reph = &nfsreqh; 64039344Smckusick if (reph->r_prev == NULL) { 64139344Smckusick reph->r_next = rep; 64239344Smckusick rep->r_prev = reph; 64339344Smckusick } else { 64439344Smckusick reph->r_prev->r_next = rep; 64539344Smckusick rep->r_prev = reph->r_prev; 64639344Smckusick } 64739344Smckusick reph->r_prev = rep; 64839344Smckusick rep->r_next = reph; 64940117Smckusick /* 65040117Smckusick * If backing off another request or avoiding congestion, don't 65140117Smckusick * send this one now but let timer do it. If not timing a request, 65240117Smckusick * do it now. 65340117Smckusick */ 65440117Smckusick if (mntp->nm_hostinfo->nh_sent > 0 && 65540117Smckusick (mntp->nm_hostinfo->nh_currexmit != 0 || 65640117Smckusick mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) { 65740117Smckusick splx(s); 65840117Smckusick goto skipsend; 65940117Smckusick } 66040117Smckusick ++mntp->nm_hostinfo->nh_sent; /* Inconsistent if can't NFSMCOPY */ 66140117Smckusick rep->r_flags |= R_SENT; /* But not a catastrophe */ 66240117Smckusick if (mntp->nm_rtt == -1) { 66340117Smckusick mntp->nm_rtt = 0; 66440117Smckusick rep->r_flags |= R_TIMING; 66540117Smckusick } 66638414Smckusick splx(s); 66738414Smckusick 66838414Smckusick /* 66940117Smckusick * If we can get a packet to send, send it off... 
67038414Smckusick * otherwise the timer will retransmit later 67138414Smckusick */ 67240117Smckusick m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT); 67338414Smckusick if (m != NULL) 67440117Smckusick (void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len); 67540117Smckusick /* 67640117Smckusick * Wait for the reply from our send or the timer's. 67740117Smckusick */ 67840117Smckusick skipsend: 67940117Smckusick error = nfs_dgreply(mntp->nm_so, mntp, rep); 68038414Smckusick 68140117Smckusick /* 68240117Smckusick * RPC done, unlink the request. 68340117Smckusick */ 68438414Smckusick s = splnet(); 68538414Smckusick rep->r_prev->r_next = rep->r_next; 68639344Smckusick rep->r_next->r_prev = rep->r_prev; 68738414Smckusick splx(s); 68838414Smckusick m_freem(rep->r_mreq); 68938414Smckusick mrep = md = rep->r_mrep; 69038414Smckusick FREE((caddr_t)rep, M_NFSREQ); 69138414Smckusick if (error) 69238414Smckusick return (error); 69338414Smckusick 69438414Smckusick /* 69538414Smckusick * break down the rpc header and check if ok 69638414Smckusick */ 69738414Smckusick dpos = mtod(md, caddr_t); 69838414Smckusick nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED); 69938414Smckusick p += 2; 70038414Smckusick if (*p++ == rpc_msgdenied) { 70138414Smckusick if (*p == rpc_mismatch) 70238414Smckusick error = EOPNOTSUPP; 70338414Smckusick else 70438414Smckusick error = EACCES; 70538414Smckusick m_freem(mrep); 70638414Smckusick return (error); 70738414Smckusick } 70838414Smckusick /* 70938414Smckusick * skip over the auth_verf, someday we may want to cache auth_short's 71038414Smckusick * for nfs_reqhead(), but for now just dump it 71138414Smckusick */ 71238414Smckusick if (*++p != 0) { 71338414Smckusick len = nfsm_rndup(fxdr_unsigned(long, *p)); 71438414Smckusick nfsm_adv(len); 71538414Smckusick } 71638414Smckusick nfsm_disect(p, u_long *, NFSX_UNSIGNED); 71738414Smckusick /* 0 == ok */ 71838414Smckusick if (*p == 0) { 71938414Smckusick nfsm_disect(p, u_long *, NFSX_UNSIGNED); 72038414Smckusick if (*p 
!= 0) { 72138414Smckusick error = fxdr_unsigned(int, *p); 72238414Smckusick m_freem(mrep); 72338414Smckusick return (error); 72438414Smckusick } 72538414Smckusick *mrp = mrep; 72638414Smckusick *mdp = md; 72738414Smckusick *dposp = dpos; 72838414Smckusick return (0); 72938414Smckusick } 73038414Smckusick m_freem(mrep); 73138414Smckusick return (EPROTONOSUPPORT); 73238414Smckusick nfsmout: 73338414Smckusick return (error); 73438414Smckusick } 73538414Smckusick 73638414Smckusick /* 73738414Smckusick * Get a request for the server main loop 73838414Smckusick * - receive a request via. nfs_soreceive() 73938414Smckusick * - verify it 74038414Smckusick * - fill in the cred struct. 74138414Smckusick */ 74239754Smckusick nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr, 74339754Smckusick msk, mtch) 74438414Smckusick struct socket *so; 74538414Smckusick u_long prog; 74638414Smckusick u_long vers; 74738414Smckusick int maxproc; 74838414Smckusick struct mbuf **nam; 74938414Smckusick struct mbuf **mrp; 75038414Smckusick struct mbuf **mdp; 75138414Smckusick caddr_t *dposp; 75238414Smckusick u_long *retxid; 75338414Smckusick u_long *proc; 75438414Smckusick register struct ucred *cr; 75539754Smckusick u_long msk; 75639754Smckusick u_long mtch; 75738414Smckusick { 75838414Smckusick register int i; 75939494Smckusick register u_long *p; 76039494Smckusick register long t1; 76139494Smckusick caddr_t dpos, cp2; 76239494Smckusick int error = 0; 76339494Smckusick struct mbuf *mrep, *md; 76439494Smckusick int len; 76538414Smckusick 76640117Smckusick if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep)) 76738414Smckusick return (error); 76838414Smckusick md = mrep; 76938414Smckusick dpos = mtod(mrep, caddr_t); 77038414Smckusick nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED); 77138414Smckusick *retxid = *p++; 77238414Smckusick if (*p++ != rpc_call) { 77338414Smckusick m_freem(mrep); 77438414Smckusick return (ERPCMISMATCH); 77538414Smckusick } 77638414Smckusick if (*p++ 
!= rpc_vers) { 77738414Smckusick m_freem(mrep); 77838414Smckusick return (ERPCMISMATCH); 77938414Smckusick } 78038414Smckusick if (*p++ != prog) { 78138414Smckusick m_freem(mrep); 78238414Smckusick return (EPROGUNAVAIL); 78338414Smckusick } 78438414Smckusick if (*p++ != vers) { 78538414Smckusick m_freem(mrep); 78638414Smckusick return (EPROGMISMATCH); 78738414Smckusick } 78838414Smckusick *proc = fxdr_unsigned(u_long, *p++); 78938414Smckusick if (*proc == NFSPROC_NULL) { 79038414Smckusick *mrp = mrep; 79138414Smckusick return (0); 79238414Smckusick } 79338414Smckusick if (*proc > maxproc || *p++ != rpc_auth_unix) { 79438414Smckusick m_freem(mrep); 79538414Smckusick return (EPROCUNAVAIL); 79638414Smckusick } 79739494Smckusick (void) fxdr_unsigned(int, *p++); 79839494Smckusick len = fxdr_unsigned(int, *++p); 79939494Smckusick nfsm_adv(nfsm_rndup(len)); 80038414Smckusick nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED); 80138414Smckusick cr->cr_uid = fxdr_unsigned(uid_t, *p++); 80238414Smckusick cr->cr_gid = fxdr_unsigned(gid_t, *p++); 80339494Smckusick len = fxdr_unsigned(int, *p); 80439494Smckusick if (len > 10) { 80538414Smckusick m_freem(mrep); 80638414Smckusick return (EBADRPC); 80738414Smckusick } 80839494Smckusick nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED); 80939494Smckusick for (i = 1; i <= len; i++) 81038414Smckusick cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++); 81139494Smckusick cr->cr_ngroups = len + 1; 81238414Smckusick /* 81338414Smckusick * Do we have any use for the verifier. 81438414Smckusick * According to the "Remote Procedure Call Protocol Spec." it 81538414Smckusick * should be AUTH_NULL, but some clients make it AUTH_UNIX? 
81638414Smckusick * For now, just skip over it 81738414Smckusick */ 81839494Smckusick len = fxdr_unsigned(int, *++p); 81939494Smckusick if (len > 0) 82039494Smckusick nfsm_adv(nfsm_rndup(len)); 82138414Smckusick *mrp = mrep; 82238414Smckusick *mdp = md; 82338414Smckusick *dposp = dpos; 82438414Smckusick return (0); 82538414Smckusick nfsmout: 82638414Smckusick return (error); 82738414Smckusick } 82838414Smckusick 82938414Smckusick /* 83038414Smckusick * Generate the rpc reply header 83138414Smckusick * siz arg. is used to decide if adding a cluster is worthwhile 83238414Smckusick */ 83338414Smckusick nfs_rephead(siz, retxid, err, mrq, mbp, bposp) 83438414Smckusick int siz; 83538414Smckusick u_long retxid; 83638414Smckusick int err; 83738414Smckusick struct mbuf **mrq; 83838414Smckusick struct mbuf **mbp; 83938414Smckusick caddr_t *bposp; 84038414Smckusick { 84139494Smckusick register u_long *p; 84239494Smckusick register long t1; 84339494Smckusick caddr_t bpos; 84439494Smckusick struct mbuf *mreq, *mb, *mb2; 84538414Smckusick 84638414Smckusick NFSMGETHDR(mreq); 84738414Smckusick mb = mreq; 84838414Smckusick if ((siz+RPC_REPLYSIZ) > MHLEN) 84938414Smckusick NFSMCLGET(mreq, M_WAIT); 85038414Smckusick p = mtod(mreq, u_long *); 85138414Smckusick mreq->m_len = 6*NFSX_UNSIGNED; 85238414Smckusick bpos = ((caddr_t)p)+mreq->m_len; 85338414Smckusick *p++ = retxid; 85438414Smckusick *p++ = rpc_reply; 85538414Smckusick if (err == ERPCMISMATCH) { 85638414Smckusick *p++ = rpc_msgdenied; 85738414Smckusick *p++ = rpc_mismatch; 85838414Smckusick *p++ = txdr_unsigned(2); 85938414Smckusick *p = txdr_unsigned(2); 86038414Smckusick } else { 86138414Smckusick *p++ = rpc_msgaccepted; 86238414Smckusick *p++ = 0; 86338414Smckusick *p++ = 0; 86438414Smckusick switch (err) { 86538414Smckusick case EPROGUNAVAIL: 86638414Smckusick *p = txdr_unsigned(RPC_PROGUNAVAIL); 86738414Smckusick break; 86838414Smckusick case EPROGMISMATCH: 86938414Smckusick *p = txdr_unsigned(RPC_PROGMISMATCH); 
87038414Smckusick nfsm_build(p, u_long *, 2*NFSX_UNSIGNED); 87138414Smckusick *p++ = txdr_unsigned(2); 87238414Smckusick *p = txdr_unsigned(2); /* someday 3 */ 87338414Smckusick break; 87438414Smckusick case EPROCUNAVAIL: 87538414Smckusick *p = txdr_unsigned(RPC_PROCUNAVAIL); 87638414Smckusick break; 87738414Smckusick default: 87838414Smckusick *p = 0; 87938414Smckusick if (err != VNOVAL) { 88038414Smckusick nfsm_build(p, u_long *, NFSX_UNSIGNED); 88138414Smckusick *p = txdr_unsigned(err); 88238414Smckusick } 88338414Smckusick break; 88438414Smckusick }; 88538414Smckusick } 88638414Smckusick *mrq = mreq; 88738414Smckusick *mbp = mb; 88838414Smckusick *bposp = bpos; 88938414Smckusick if (err != 0 && err != VNOVAL) 89038414Smckusick nfsstats.srvrpc_errs++; 89138414Smckusick return (0); 89238414Smckusick } 89338414Smckusick 89438414Smckusick /* 89538414Smckusick * Nfs timer routine 89638414Smckusick * Scan the nfsreq list and retranmit any requests that have timed out 89738414Smckusick * To avoid retransmission attempts on STREAM sockets (in the future) make 89840117Smckusick * sure to set the r_retry field to 0 (implies nm_retry == 0). 
89938414Smckusick */ 90038414Smckusick nfs_timer() 90138414Smckusick { 90238414Smckusick register struct nfsreq *rep; 90338414Smckusick register struct mbuf *m; 90438414Smckusick register struct socket *so; 90540117Smckusick register struct nfsmount *mntp; 90640117Smckusick int s, error; 90738414Smckusick 90838414Smckusick s = splnet(); 90938414Smckusick rep = nfsreqh.r_next; 91040117Smckusick if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) { 91140117Smckusick mntp = rep->r_mntp; 91240117Smckusick if (rep->r_flags & R_TIMING) /* update rtt in mount */ 91340117Smckusick mntp->nm_rtt++; 91440117Smckusick /* If not timed out or reply already received, skip */ 91540117Smckusick if (++rep->r_timer < mntp->nm_rto || rep->r_mrep) 91640117Smckusick continue; 91740117Smckusick /* Do backoff and save new timeout in mount */ 91840117Smckusick if (rep->r_flags & R_TIMING) { 91940117Smckusick nfs_backofftimer(mntp); 92040117Smckusick rep->r_flags &= ~R_TIMING; 92140117Smckusick mntp->nm_rtt = -1; 92240117Smckusick } 92340117Smckusick if (rep->r_flags & R_SENT) { 92440117Smckusick rep->r_flags &= ~R_SENT; 92540117Smckusick --mntp->nm_hostinfo->nh_sent; 92640117Smckusick } 92740117Smckusick /* Check state of socket, cf nfs_send */ 92840117Smckusick so = mntp->nm_so; 92940117Smckusick if (error = nfs_sockerr(so, 1)) 93040117Smckusick goto wakeup; 93140117Smckusick if (sbspace(&so->so_snd) < rep->r_msiz) 93240117Smckusick goto wakeup; 93340117Smckusick /* Check for too many retries, cf nfs_dgreply */ 93440117Smckusick if (++rep->r_rexmit > NFS_MAXREXMIT) /* clip */ 93540117Smckusick rep->r_rexmit = NFS_MAXREXMIT; 93640117Smckusick if (rep->r_rexmit > rep->r_retry) /* too many */ 93740117Smckusick goto wakeup; 93840117Smckusick /* Check for congestion control, cf nfs_request */ 93940117Smckusick if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window) 94040117Smckusick goto wakeup; 94140117Smckusick /* Send it! 
*/ 94240117Smckusick m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT); 94340117Smckusick if (m == NULL) 94440117Smckusick goto wakeup; 94540117Smckusick nfsstats.rpcretries++; 94638414Smckusick #ifdef MGETHDR 94740117Smckusick m->m_pkthdr.len = rep->r_msiz; 94838414Smckusick #endif 94940117Smckusick (void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m, 950*40327Ssklower (struct mbuf *)0, (struct mbuf *)0); 95140117Smckusick 95240117Smckusick /* We need to time the request even though we're 95340117Smckusick * retransmitting, in order to maintain backoff. */ 95440117Smckusick mntp->nm_rtt = 0; 95540117Smckusick ++mntp->nm_hostinfo->nh_sent; 95640117Smckusick rep->r_flags |= (R_SENT|R_TIMING); 95740117Smckusick rep->r_timer = rep->r_timerinit; 95840117Smckusick wakeup: 95940117Smckusick /* If error or interruptible mount, give user a look */ 96040117Smckusick if (error || (mntp->nm_flag & NFSMNT_INT)) 96140117Smckusick sorwakeup(so); 96240117Smckusick } 96340117Smckusick splx(s); 96440117Smckusick timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ); 96540117Smckusick } 96640117Smckusick 96740117Smckusick /* 96840117Smckusick * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is 96940117Smckusick * used here. The timer state is held in the nfsmount structure and 97040117Smckusick * a single request is used to clock the response. When successful 97140117Smckusick * the rtt smoothing in nfs_updatetimer is used, when failed the backoff 97240117Smckusick * is done by nfs_backofftimer. We also log failure messages in these 97340117Smckusick * routines. 97440117Smckusick * 97540117Smckusick * Congestion variables are held in the nfshost structure which 97640117Smckusick * is referenced by nfsmounts and shared per-server. This separation 97740117Smckusick * makes it possible to do per-mount timing which allows varying disk 97840117Smckusick * access times to be dealt with, while preserving a network oriented 97940117Smckusick * congestion control scheme. 
98040117Smckusick * 98140117Smckusick * The windowing implements the Jacobson/Karels slowstart algorithm 98240117Smckusick * with adjusted scaling factors. We start with one request, then send 98340117Smckusick * 4 more after each success until the ssthresh limit is reached, then 98440117Smckusick * we increment at a rate proportional to the window. On failure, we 98540117Smckusick * remember 3/4 the current window and clamp the send limit to 1. Note 98640117Smckusick * ICMP source quench is not reflected in so->so_error so we ignore that 98740117Smckusick * for now. 98840117Smckusick * 98940117Smckusick * NFS behaves much more like a transport protocol with these changes, 99040117Smckusick * shedding the teenage pedal-to-the-metal tendencies of "other" 99140117Smckusick * implementations. 99240117Smckusick * 99340117Smckusick * Timers and congestion avoidance by Tom Talpey, Open Software Foundation. 99440117Smckusick */ 99540117Smckusick 99640117Smckusick /* 99740117Smckusick * The TCP algorithm was not forgiving enough. Because the NFS server 99840117Smckusick * responds only after performing lookups/diskio/etc, we have to be 99940117Smckusick * more prepared to accept a spiky variance. 
The TCP algorithm is: 100040117Smckusick * TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1) 100140117Smckusick */ 100240117Smckusick #define NFS_RTO(mntp) (((mntp)->nm_srtt >> 3) + (mntp)->nm_rttvar) 100340117Smckusick 100440117Smckusick nfs_updatetimer(mntp) 100540117Smckusick register struct nfsmount *mntp; 100640117Smckusick { 100740117Smckusick register struct nfshost *nfshp = mntp->nm_hostinfo; 100840117Smckusick 100940117Smckusick /* If retransmitted, clear and return */ 101040117Smckusick if (mntp->nm_rexmit || nfshp->nh_currexmit) { 101140117Smckusick if (nfshp->nh_currexmit >= nfsrexmtthresh) 101240117Smckusick nfs_log("NFS server %s OK\n", mntp->nm_host); 101340117Smckusick mntp->nm_rexmit = nfshp->nh_currexmit = 0; 101440117Smckusick return; 101540117Smckusick } 101640117Smckusick /* If have a measurement, do smoothing */ 101740117Smckusick if (mntp->nm_srtt) { 101840117Smckusick register short delta; 101940117Smckusick delta = mntp->nm_rtt - (mntp->nm_srtt >> 3); 102040117Smckusick if ((mntp->nm_srtt += delta) <= 0) 102140117Smckusick mntp->nm_srtt = 1; 102240117Smckusick if (delta < 0) 102340117Smckusick delta = -delta; 102440117Smckusick delta -= (mntp->nm_rttvar >> 2); 102540117Smckusick if ((mntp->nm_rttvar += delta) <= 0) 102640117Smckusick mntp->nm_rttvar = 1; 102740117Smckusick /* Else initialize */ 102840117Smckusick } else { 102940117Smckusick mntp->nm_rttvar = mntp->nm_rtt << 1; 103040117Smckusick if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2; 103140117Smckusick mntp->nm_srtt = mntp->nm_rttvar << 2; 103240117Smckusick } 103340117Smckusick /* Compute new Retransmission TimeOut and clip */ 103440117Smckusick mntp->nm_rto = NFS_RTO(mntp); 103540117Smckusick if (mntp->nm_rto < NFS_MINTIMEO) 103640117Smckusick mntp->nm_rto = NFS_MINTIMEO; 103740117Smckusick else if (mntp->nm_rto > NFS_MAXTIMEO) 103840117Smckusick mntp->nm_rto = NFS_MAXTIMEO; 103940117Smckusick nfshp->nh_currto = mntp->nm_rto; 104040117Smckusick 104140117Smckusick 
/* Update window estimate */ 104240117Smckusick if (nfshp->nh_window < nfshp->nh_ssthresh) /* quickly */ 104340117Smckusick nfshp->nh_window += 4; 104440117Smckusick else { /* slowly */ 104540117Smckusick register long incr = ++nfshp->nh_winext; 104640117Smckusick incr = (incr * incr) / nfshp->nh_window; 104740117Smckusick if (incr > 0) { 104840117Smckusick nfshp->nh_winext = 0; 104940117Smckusick ++nfshp->nh_window; 105040117Smckusick } 105140117Smckusick } 105240117Smckusick if (nfshp->nh_window > NFS_MAXWINDOW) 105340117Smckusick nfshp->nh_window = NFS_MAXWINDOW; 105440117Smckusick } 105540117Smckusick 105640117Smckusick nfs_backofftimer(mntp) 105740117Smckusick register struct nfsmount *mntp; 105840117Smckusick { 105940117Smckusick register struct nfshost *nfshp = mntp->nm_hostinfo; 106040117Smckusick register unsigned long newrto; 106140117Smckusick 106240117Smckusick /* Clip shift count */ 106340117Smckusick if (++mntp->nm_rexmit > 8 * sizeof mntp->nm_rto) 106440117Smckusick mntp->nm_rexmit = 8 * sizeof mntp->nm_rto; 106540117Smckusick /* Back off RTO exponentially */ 106640117Smckusick newrto = NFS_RTO(mntp); 106740117Smckusick newrto <<= (mntp->nm_rexmit - 1); 106840117Smckusick if (newrto == 0 || newrto > NFS_MAXTIMEO) 106940117Smckusick newrto = NFS_MAXTIMEO; 107040117Smckusick mntp->nm_rto = nfshp->nh_currto = newrto; 107140117Smckusick 107240117Smckusick /* If too many retries, message, assume a bogus RTT and re-measure */ 107340117Smckusick if (nfshp->nh_currexmit < mntp->nm_rexmit) { 107440117Smckusick nfshp->nh_currexmit = mntp->nm_rexmit; 107540117Smckusick if (nfshp->nh_currexmit >= nfsrexmtthresh) { 107640117Smckusick if (nfshp->nh_currexmit == nfsrexmtthresh) { 107740117Smckusick nfs_log("NFS server %s not responding\n", 107840117Smckusick mntp->nm_host); 107940117Smckusick mntp->nm_rttvar += (mntp->nm_srtt >> 2); 108040117Smckusick mntp->nm_srtt = 0; 108138414Smckusick } 108240117Smckusick /* The routing invalidation should be a usrreq PRU */ 
108340117Smckusick if (mtod(nfshp->nh_sockaddr, 108440117Smckusick struct sockaddr *)->sa_family == AF_INET) 108540117Smckusick in_losing(mntp->nm_so->so_pcb); 108638414Smckusick } 108738414Smckusick } 108840117Smckusick /* Close down window but remember this point (3/4 current) for later */ 108940117Smckusick nfshp->nh_ssthresh = ((nfshp->nh_window << 1) + nfshp->nh_window) >> 2; 109040117Smckusick nfshp->nh_window = 1; 109140117Smckusick nfshp->nh_winext = 0; 109238414Smckusick } 109338414Smckusick 109438414Smckusick /* 109540117Smckusick * Not all errors are fatal. The closed checks deal 109640117Smckusick * with errors a little strangely. 109738414Smckusick */ 109840117Smckusick 109940117Smckusick nfs_sockerr(so, sending) 110040117Smckusick struct socket *so; 110140117Smckusick int sending; 110238414Smckusick { 110340117Smckusick if (sending && (so->so_state & SS_CANTSENDMORE)) { 110440117Smckusick so->so_error = EPIPE; 110540117Smckusick return (EPIPE); 110640117Smckusick } 110740117Smckusick 110840117Smckusick switch (so->so_error) { /* inhibit certain errors */ 110940117Smckusick case ENETDOWN: 111040117Smckusick case ENETUNREACH: 111140117Smckusick case EHOSTDOWN: 111240117Smckusick case EHOSTUNREACH: 111340117Smckusick so->so_error = 0; 111440117Smckusick case 0: 111540117Smckusick break; 111640117Smckusick default: /* return all others */ 111740117Smckusick printf("nfs_sockerr: error %d on %s\n", so->so_error, 111840117Smckusick sending?"send":"receive"); 111940117Smckusick return (so->so_error); 112040117Smckusick } 112140117Smckusick 112240117Smckusick if (!sending && (so->so_state & SS_CANTRCVMORE)) { 112340117Smckusick so->so_error = 0; /* (no error) */ 112440117Smckusick return (EPIPE); 112540117Smckusick } 112640117Smckusick return (so->so_error); 112738414Smckusick } 1128