138414Smckusick /* 238414Smckusick * Copyright (c) 1989 The Regents of the University of California. 338414Smckusick * All rights reserved. 438414Smckusick * 538414Smckusick * This code is derived from software contributed to Berkeley by 638414Smckusick * Rick Macklem at The University of Guelph. 738414Smckusick * 838414Smckusick * Redistribution and use in source and binary forms are permitted 938414Smckusick * provided that the above copyright notice and this paragraph are 1038414Smckusick * duplicated in all such forms and that any documentation, 1138414Smckusick * advertising materials, and other materials related to such 1238414Smckusick * distribution and use acknowledge that the software was developed 1338414Smckusick * by the University of California, Berkeley. The name of the 1438414Smckusick * University may not be used to endorse or promote products derived 1538414Smckusick * from this software without specific prior written permission. 1638414Smckusick * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 1738414Smckusick * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 1838414Smckusick * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1938414Smckusick * 20*40761Skarels * @(#)nfs_socket.c 7.10 (Berkeley) 04/04/90 2138414Smckusick */ 2238414Smckusick 2338414Smckusick /* 2438414Smckusick * Socket operations for use by nfs (similar to uipc_socket.c, but never 2538414Smckusick * with copies to/from a uio vector) 2640117Smckusick * NB: For now, they only work for datagram sockets. 
2738414Smckusick * (Use on stream sockets would require some record boundary mark in the 2839754Smckusick * stream as defined by "RPC: Remote Procedure Call Protocol 2939754Smckusick * Specification" RFC1057 Section 10) 3038414Smckusick * and different versions of send, receive and reply that do not assume 3138414Smckusick * an atomic protocol 3238414Smckusick */ 3338414Smckusick 3438414Smckusick #include "types.h" 3538414Smckusick #include "param.h" 3638414Smckusick #include "uio.h" 3738414Smckusick #include "user.h" 3840117Smckusick #include "proc.h" 3940117Smckusick #include "signal.h" 4038414Smckusick #include "mount.h" 4138414Smckusick #include "kernel.h" 4238414Smckusick #include "malloc.h" 4338414Smckusick #include "mbuf.h" 4438414Smckusick #include "vnode.h" 4538414Smckusick #include "domain.h" 4638414Smckusick #include "protosw.h" 4738414Smckusick #include "socket.h" 4838414Smckusick #include "socketvar.h" 4938414Smckusick #include "rpcv2.h" 5038414Smckusick #include "nfsv2.h" 5138414Smckusick #include "nfs.h" 5238414Smckusick #include "xdr_subs.h" 5338414Smckusick #include "nfsm_subs.h" 5438414Smckusick #include "nfsmount.h" 5538414Smckusick 5640117Smckusick #include "syslog.h" 5740117Smckusick #define nfs_log(message, host) log(LOG_ERR, message, host) 5840117Smckusick 5938414Smckusick #define TRUE 1 6038414Smckusick 6138414Smckusick /* set lock on sockbuf sb, sleep at neg prio */ 6238414Smckusick #define nfs_sblock(sb) { \ 6338414Smckusick while ((sb)->sb_flags & SB_LOCK) { \ 6438414Smckusick (sb)->sb_flags |= SB_WANT; \ 6538414Smckusick sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \ 6638414Smckusick } \ 6738414Smckusick (sb)->sb_flags |= SB_LOCK; \ 6838414Smckusick } 6940117Smckusick /* 7040117Smckusick * nfs_sbwait() is simply sbwait() but at a negative priority so that it 7140117Smckusick * can not be interrupted by a signal. 
7240117Smckusick */ 7340117Smckusick nfs_sbwait(sb) 7440117Smckusick struct sockbuf *sb; 7540117Smckusick { 7640117Smckusick sb->sb_flags |= SB_WAIT; 7740117Smckusick sleep((caddr_t)&sb->sb_cc, PZERO-2); 7840117Smckusick } 7938414Smckusick 8038414Smckusick /* 8138414Smckusick * External data, mostly RPC constants in XDR form 8238414Smckusick */ 8338414Smckusick extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, 8438414Smckusick rpc_msgaccepted, rpc_call; 8538414Smckusick extern u_long nfs_prog, nfs_vers; 8638414Smckusick int nfsrv_null(), 8738414Smckusick nfsrv_getattr(), 8838414Smckusick nfsrv_setattr(), 8938414Smckusick nfsrv_lookup(), 9038414Smckusick nfsrv_readlink(), 9138414Smckusick nfsrv_read(), 9238414Smckusick nfsrv_write(), 9338414Smckusick nfsrv_create(), 9438414Smckusick nfsrv_remove(), 9538414Smckusick nfsrv_rename(), 9638414Smckusick nfsrv_link(), 9738414Smckusick nfsrv_symlink(), 9838414Smckusick nfsrv_mkdir(), 9938414Smckusick nfsrv_rmdir(), 10038414Smckusick nfsrv_readdir(), 10138414Smckusick nfsrv_statfs(), 10238414Smckusick nfsrv_noop(); 10338414Smckusick 10438414Smckusick int (*nfsrv_procs[NFS_NPROCS])() = { 10538414Smckusick nfsrv_null, 10638414Smckusick nfsrv_getattr, 10738414Smckusick nfsrv_setattr, 10838414Smckusick nfsrv_noop, 10938414Smckusick nfsrv_lookup, 11038414Smckusick nfsrv_readlink, 11138414Smckusick nfsrv_read, 11238414Smckusick nfsrv_noop, 11338414Smckusick nfsrv_write, 11438414Smckusick nfsrv_create, 11538414Smckusick nfsrv_remove, 11638414Smckusick nfsrv_rename, 11738414Smckusick nfsrv_link, 11838414Smckusick nfsrv_symlink, 11938414Smckusick nfsrv_mkdir, 12038414Smckusick nfsrv_rmdir, 12138414Smckusick nfsrv_readdir, 12238414Smckusick nfsrv_statfs, 12338414Smckusick }; 12438414Smckusick 12540117Smckusick struct nfshost *nfshosth; 12640117Smckusick struct nfsreq nfsreqh; 12740117Smckusick int nfsrexmtthresh = NFS_FISHY; 12838414Smckusick 12938414Smckusick /* 13040117Smckusick * Initialize sockets and 
per-host congestion for a new NFS connection. 13140117Smckusick * We do not free the sockaddr if error. 13238414Smckusick */ 13340117Smckusick nfs_connect(nmp, saddr) 13440117Smckusick register struct nfsmount *nmp; 13540117Smckusick struct mbuf *saddr; 13640117Smckusick { 13740117Smckusick int s, error, srvaddrlen; 13840117Smckusick struct mbuf *m; 13940117Smckusick register struct nfshost *nfshp; 14040117Smckusick 14140117Smckusick nmp->nm_so = 0; 14240117Smckusick if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family, 14340117Smckusick &nmp->nm_so, SOCK_DGRAM, 0)) 14440117Smckusick goto bad; 14540117Smckusick 14640117Smckusick /* Unix sockets do not provide a local bind for server reply */ 14740117Smckusick if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) { 14840117Smckusick struct sockaddr *sa; 14940117Smckusick static char client[] = "/tmp/.nfs/nfsclient##"; 15040117Smckusick static int serial; 15140117Smckusick int firstserial; 15240117Smckusick m = m_getclr(M_WAIT, MT_SONAME); 15340117Smckusick if (m == NULL) { 15440117Smckusick error = ENOBUFS; 15540117Smckusick goto bad; 15640117Smckusick } 15740117Smckusick m->m_len = sizeof (client) + 2; 15840117Smckusick sa = mtod(m, struct sockaddr *); 15940117Smckusick sa->sa_family = AF_UNIX; 16040117Smckusick #ifdef MSG_TRUNC /* Have sa_len to set? 
*/ 16140117Smckusick sa->sa_len = m->m_len; 16240117Smckusick #endif 16340117Smckusick bcopy(client, sa->sa_data, sizeof(client)); 16440117Smckusick firstserial = serial; 16540117Smckusick do { 16640117Smckusick if (++serial >= 100) serial = 0; 16740117Smckusick sa->sa_data[19] = (serial / 10) + '0'; 16840117Smckusick sa->sa_data[20] = (serial % 10) + '0'; 16940117Smckusick error = sobind(nmp->nm_so, m); 17040117Smckusick if (firstserial == serial) break; 17140117Smckusick } while (error == EADDRINUSE); 17240117Smckusick m_freem(m); 17340117Smckusick if (error) 17440117Smckusick goto bad; 17540117Smckusick } 17640117Smckusick 17740117Smckusick if (error = soconnect(nmp->nm_so, saddr)) 17840117Smckusick goto bad; 17940117Smckusick error = soreserve(nmp->nm_so, /* get space ! */ 18040117Smckusick nmp->nm_wsize + 1024, /* one out */ 18140117Smckusick (nmp->nm_rsize + 1024) * 4); /* four in */ 18240117Smckusick if (error) 18340117Smckusick goto bad; 18440117Smckusick 18540117Smckusick /* 18640117Smckusick * Search mount list for existing server entry. 18740117Smckusick * 18840117Smckusick * Note, even though we have a sockaddr, it is not quite reliable 18940117Smckusick * enough to bcmp against. For instance, a sockaddr_in has a 19040117Smckusick * sin_zero field which is not reliably zeroed by user code (e.g. 19140117Smckusick * mount). So what we do as an attempt at transport independence 19240117Smckusick * is to get the peeraddr of our connected socket into a zeroed 19340117Smckusick * sockaddr. Then we cache that and compare against it. This is 19440117Smckusick * not exactly perfect. However it is not critical that it be, if 19540117Smckusick * we cannot match the sockaddr we will simply allocate a new nfshp 19640117Smckusick * per mount, which will disable the per-host congestion but 19740117Smckusick * everything else will work as normal. 
19840117Smckusick */ 19940117Smckusick m = m_getclr(M_WAIT, MT_SONAME); 20040117Smckusick if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR, 20140117Smckusick (struct mbuf *)0, m, (struct mbuf *)0) == 0) { 20240117Smckusick m_freem(saddr); 20340117Smckusick saddr = m; 20440117Smckusick } else 20540117Smckusick m_freem(m); 20640117Smckusick srvaddrlen = saddr->m_len; 20740117Smckusick 20840117Smckusick s = splnet(); 20940117Smckusick 21040117Smckusick for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) { 21140117Smckusick if (srvaddrlen != nfshp->nh_salen) 21240117Smckusick continue; 21340117Smckusick if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t), 21440117Smckusick srvaddrlen)) 21540117Smckusick break; 21640117Smckusick } 21740117Smckusick if (nfshp) /* Have an existing mount host */ 21840117Smckusick m_freem(saddr); 21940117Smckusick else { 22040117Smckusick MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK); 22140117Smckusick bzero((caddr_t)nfshp, sizeof *nfshp); 22240117Smckusick nfshp->nh_sockaddr = saddr; 22340117Smckusick nfshp->nh_salen = srvaddrlen; 22440117Smckusick /* Initialize other non-zero congestion variables */ 22540117Smckusick nfshp->nh_currto = NFS_TIMEO; 22640117Smckusick nfshp->nh_window = 1; /* Initial send window */ 22740117Smckusick nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */ 22840117Smckusick if (nfshosth) nfshosth->nh_prev = nfshp; /* Chain in */ 22940117Smckusick nfshp->nh_next = nfshosth; 23040117Smckusick nfshosth = nfshp; 23140117Smckusick } 23240117Smckusick nfshp->nh_refcnt++; 23340117Smckusick splx(s); 23440117Smckusick nmp->nm_hostinfo = nfshp; 23540117Smckusick if (nmp->nm_rto == NFS_TIMEO) { 23640117Smckusick nmp->nm_rto = nfshp->nh_currto; 23740117Smckusick nmp->nm_rttvar = nmp->nm_rto << 1; 23840117Smckusick } 23940117Smckusick return (0); 24040117Smckusick 24140117Smckusick bad: 24240117Smckusick if (nmp->nm_so) (void) soclose(nmp->nm_so); 24340117Smckusick 
nmp->nm_so = 0; 24440117Smckusick return (error); 24540117Smckusick } 24640117Smckusick 24740117Smckusick /* 24840117Smckusick * NFS disconnect. Clean up and unlink. 24940117Smckusick */ 25040117Smckusick nfs_disconnect(nmp) 25140117Smckusick register struct nfsmount *nmp; 25240117Smckusick { 25340117Smckusick register struct nfshost *nfshp; 25440117Smckusick 25540117Smckusick if (nmp->nm_so) 25640117Smckusick soclose(nmp->nm_so); 25740117Smckusick nmp->nm_so = 0; 25840117Smckusick if (nfshp = nmp->nm_hostinfo) { 25940117Smckusick int s = splnet(); 26040117Smckusick if (--nfshp->nh_refcnt <= 0) { 26140117Smckusick if (nfshp->nh_next) 26240117Smckusick nfshp->nh_next->nh_prev = nfshp->nh_prev; 26340117Smckusick if (nfshp->nh_prev) 26440117Smckusick nfshp->nh_prev->nh_next = nfshp->nh_next; 26540117Smckusick else 26640117Smckusick nfshosth = nfshp->nh_next; 26740117Smckusick /* If unix family, remove the nfsclient from /tmp */ 26840117Smckusick if (mtod(nfshp->nh_sockaddr, 26940117Smckusick struct sockaddr *)->sa_family == AF_UNIX) { 27040117Smckusick /* Lookup sa_data, do VOP_REMOVE... */ 27140117Smckusick } 27240117Smckusick m_freem(nfshp->nh_sockaddr); 27340117Smckusick FREE(nfshp, M_NFSMNT); 27440117Smckusick } 27540117Smckusick nmp->nm_hostinfo = 0; 27640117Smckusick splx(s); 27740117Smckusick } 27840117Smckusick } 27940117Smckusick 28040117Smckusick /* 28140117Smckusick * This is a stripped down non-interruptible version of sosend(). 
28240117Smckusick */ 28340117Smckusick nfs_send(so, nam, top, flags, siz) 28438414Smckusick register struct socket *so; 28538414Smckusick struct mbuf *nam; 28638414Smckusick struct mbuf *top; 28738414Smckusick int flags; 28838414Smckusick int siz; 28938414Smckusick { 29040117Smckusick int error, s; 29138414Smckusick 29238414Smckusick #ifdef MGETHDR 29338414Smckusick top->m_pkthdr.len = siz; 29438414Smckusick #endif 29540117Smckusick for (;;) { 29640117Smckusick nfs_sblock(&so->so_snd); 29740117Smckusick s = splnet(); 29840117Smckusick if (error = nfs_sockerr(so, 1)) { 29940117Smckusick splx(s); 30040117Smckusick m_freem(top); 30140117Smckusick break; 30240117Smckusick } 30340117Smckusick if (sbspace(&so->so_snd) < siz) { 30440117Smckusick sbunlock(&so->so_snd); 30540117Smckusick nfs_sbwait(&so->so_snd); 30640117Smckusick splx(s); 30740117Smckusick continue; 30840117Smckusick } 30940117Smckusick error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top, 31040327Ssklower (struct mbuf *)nam, (struct mbuf *)0); 31138414Smckusick splx(s); 31240117Smckusick break; 31338414Smckusick } 31438414Smckusick sbunlock(&so->so_snd); 31538414Smckusick return (error); 31638414Smckusick } 31738414Smckusick 31838414Smckusick /* 31940117Smckusick * This is a stripped down datagram specific version of soreceive() 32038414Smckusick */ 32140117Smckusick nfs_dgreceive(so, msk, mtch, aname, mp) 32238414Smckusick register struct socket *so; 32339754Smckusick u_long msk; 32439754Smckusick u_long mtch; 32538414Smckusick struct mbuf **aname; 32638414Smckusick struct mbuf **mp; 32738414Smckusick { 32838414Smckusick register struct mbuf *m; 32938414Smckusick int s, error = 0; 33038414Smckusick struct mbuf *nextrecord; 33138414Smckusick 33238414Smckusick if (aname) 33338414Smckusick *aname = 0; 33438414Smckusick 33540117Smckusick for (;;) { 336*40761Skarels if (error = sblock(&so->so_rcv)) 337*40761Skarels return (error); 33840117Smckusick s = splnet(); 33938414Smckusick 34040117Smckusick if 
(so->so_rcv.sb_cc == 0) { 34140117Smckusick if (error = nfs_sockerr(so, 0)) { 34240117Smckusick so->so_error = 0; 34340117Smckusick break; 34440117Smckusick } 34539754Smckusick sbunlock(&so->so_rcv); 346*40761Skarels error = sbwait(&so->so_rcv); 34739754Smckusick splx(s); 348*40761Skarels if (error) 349*40761Skarels return (error); 35040117Smckusick continue; 35139754Smckusick } 35238414Smckusick m = so->so_rcv.sb_mb; 35340117Smckusick if (m == 0) 35440117Smckusick panic("nfs_dgreceive 1"); 35540117Smckusick nextrecord = m->m_nextpkt; 35640117Smckusick /* Save sender's address */ 35740117Smckusick if (m->m_type != MT_SONAME) 35840117Smckusick panic("nfs_dgreceive 1a"); 35938414Smckusick sbfree(&so->so_rcv, m); 36040117Smckusick if (aname) { 36140117Smckusick *aname = m; 36240117Smckusick so->so_rcv.sb_mb = m->m_next; 36340117Smckusick m->m_next = 0; 36440117Smckusick m = so->so_rcv.sb_mb; 36540117Smckusick } else { 36640117Smckusick MFREE(m, so->so_rcv.sb_mb); 36740117Smckusick m = so->so_rcv.sb_mb; 36840117Smckusick } 36940117Smckusick /* Drop control mbuf's */ 37040117Smckusick if (m && m->m_type == MT_RIGHTS) 37140117Smckusick panic("nfs_dgreceive 2"); 37240117Smckusick if (m && m->m_type == MT_CONTROL) { 37340117Smckusick sbfree(&so->so_rcv, m); 37440117Smckusick MFREE(m, so->so_rcv.sb_mb); 37540117Smckusick m = so->so_rcv.sb_mb; 37640117Smckusick } 37740117Smckusick /* Dequeue packet from sockbuf */ 37840117Smckusick *mp = m; 37940117Smckusick while (m) { 38040117Smckusick if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 38140117Smckusick panic("nfs_dgreceive 3"); 38240117Smckusick sbfree(&so->so_rcv, m); 38340117Smckusick m = so->so_rcv.sb_mb = m->m_next; 38440117Smckusick } 38540117Smckusick so->so_rcv.sb_mb = nextrecord; 38640117Smckusick /* Return */ 38740117Smckusick break; 38838414Smckusick } 38938414Smckusick sbunlock(&so->so_rcv); 39038414Smckusick splx(s); 39138414Smckusick return (error); 39238414Smckusick } 39338414Smckusick 39438414Smckusick 
struct rpc_replyhead { 39538414Smckusick u_long r_xid; 39638414Smckusick u_long r_rep; 39738414Smckusick }; 39838414Smckusick 39938414Smckusick /* 40040117Smckusick * Implement NFS client side datagram receive. 40138414Smckusick * We depend on the way that records are added to the sockbuf 40238414Smckusick * by sbappend*. In particular, each record (mbufs linked through m_next) 40338414Smckusick * must begin with an address, followed by optional MT_CONTROL mbuf 40438414Smckusick * and then zero or more mbufs of data. 40538414Smckusick * We must search through the list of received datagrams matching them 40638414Smckusick * with outstanding requests using the xid, until ours is found. 40738414Smckusick */ 40840117Smckusick nfs_dgreply(so, mntp, myrep) 40938414Smckusick register struct socket *so; 41038414Smckusick struct nfsmount *mntp; 41139344Smckusick struct nfsreq *myrep; 41238414Smckusick { 41338414Smckusick register struct mbuf *m; 41438414Smckusick register struct nfsreq *rep; 41538414Smckusick register int error = 0, s; 41640117Smckusick int logged = 0; 41738414Smckusick struct mbuf *nextrecord; 41838414Smckusick struct rpc_replyhead replyh; 41938414Smckusick 42038414Smckusick restart: 42139344Smckusick nfs_sblock(&so->so_rcv); 42240117Smckusick s = splnet(); 42340117Smckusick /* Already received and queued for us, bye bye */ 42439344Smckusick if (myrep->r_mrep != NULL) { 42540117Smckusick error = 0; 42640117Smckusick goto release; 42739344Smckusick } 42840117Smckusick /* If we have run out of retries (hard mounts have bogus count) */ 42940117Smckusick if (myrep->r_rexmit > myrep->r_retry) { 43040117Smckusick error = ETIMEDOUT; 43140117Smckusick nfsstats.rpctimeouts++; 43240117Smckusick giveup: 43340117Smckusick if (myrep->r_flags & R_TIMING) { 43440117Smckusick myrep->r_flags &= ~R_TIMING; 43540117Smckusick mntp->nm_rtt = -1; 43640117Smckusick } 43740117Smckusick if (myrep->r_flags & R_SENT) { 43840117Smckusick myrep->r_flags &= ~R_SENT; 43940117Smckusick 
--mntp->nm_hostinfo->nh_sent; 44040117Smckusick /* If count now 0, want to initiate new req */ 44140117Smckusick } 44240117Smckusick goto release; 44339344Smckusick } 44438414Smckusick 44539344Smckusick m = so->so_rcv.sb_mb; 44639344Smckusick if (m == 0) { 44739344Smckusick if (so->so_rcv.sb_cc) 44839344Smckusick panic("nfs_soreply 1"); 44940117Smckusick if (error = nfs_sockerr(so, 0)) { 45038414Smckusick so->so_error = 0; 45140117Smckusick goto giveup; 45238414Smckusick } 45340117Smckusick /* Allow signals to interrupt request? (nfs_timer wakes up) */ 45440117Smckusick if ((mntp->nm_flag & NFSMNT_INT) && 45540484Smckusick (u.u_sigintr & sigmask(u.u_procp->p_cursig)) != 0) { 45640117Smckusick error = EINTR; 45740117Smckusick goto giveup; 45840117Smckusick } 45940117Smckusick if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0) 46040117Smckusick uprintf("NFS server %s not responding, retrying\n", 46140351Smckusick mntp->nm_mountp->m_stat.f_mntfromname); 46238414Smckusick sbunlock(&so->so_rcv); 46338414Smckusick nfs_sbwait(&so->so_rcv); 46438414Smckusick splx(s); 46538414Smckusick goto restart; 46638414Smckusick } 46738414Smckusick 46838414Smckusick /* 46938414Smckusick * Take off the address, check for rights and ditch any control 47038414Smckusick * mbufs. 
47138414Smckusick */ 47240117Smckusick nextrecord = m->m_nextpkt; 47338414Smckusick if (m->m_type != MT_SONAME) 47438414Smckusick panic("nfs reply SONAME"); 47538414Smckusick sbfree(&so->so_rcv, m); 47638414Smckusick MFREE(m, so->so_rcv.sb_mb); 47738414Smckusick m = so->so_rcv.sb_mb; 47838414Smckusick if (m && m->m_type == MT_RIGHTS) 47938414Smckusick panic("nfs reply RIGHTS"); 48038414Smckusick if (m && m->m_type == MT_CONTROL) { 48138414Smckusick sbfree(&so->so_rcv, m); 48238414Smckusick MFREE(m, so->so_rcv.sb_mb); 48338414Smckusick m = so->so_rcv.sb_mb; 48438414Smckusick } 48539344Smckusick if (m) { 48638414Smckusick m->m_nextpkt = nextrecord; 48739344Smckusick } else { 48839344Smckusick so->so_rcv.sb_mb = nextrecord; 48938414Smckusick sbunlock(&so->so_rcv); 49038414Smckusick splx(s); 49138414Smckusick goto restart; 49238414Smckusick } 49338414Smckusick 49438414Smckusick /* 49538414Smckusick * Get the xid and check that it is an rpc reply 49638414Smckusick */ 49740117Smckusick if (m->m_len >= sizeof replyh) 49840117Smckusick bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh); 49938414Smckusick else { 50040117Smckusick struct mbuf *mp = m; 50140117Smckusick caddr_t cp = (caddr_t)&replyh; 50240117Smckusick int cnt = sizeof replyh; 50340117Smckusick do { 50438414Smckusick if (mp->m_len > 0) { 50540117Smckusick int xfer = (mp->m_len >= cnt) ? 
cnt : mp->m_len; 50638414Smckusick bcopy(mtod(mp, caddr_t), cp, xfer); 50738414Smckusick cnt -= xfer; 50838414Smckusick cp += xfer; 50938414Smckusick } 51038414Smckusick if (cnt > 0) 51138414Smckusick mp = mp->m_next; 51240117Smckusick } while (mp && cnt > 0); 51340117Smckusick if (mp == NULL) { /* Insufficient length */ 51440117Smckusick nfsstats.rpcinvalid++; 51540117Smckusick goto dropit; 51638414Smckusick } 51738414Smckusick } 51840117Smckusick if (replyh.r_rep != rpc_reply) { /* Not a reply */ 51940117Smckusick nfsstats.rpcinvalid++; 52038414Smckusick goto dropit; 52140117Smckusick } 52238414Smckusick /* 52338414Smckusick * Loop through the request list to match up the reply 52440117Smckusick * If no match, just drop the datagram 52538414Smckusick */ 52640117Smckusick if (rep = nfsreqh.r_next) { 52740117Smckusick while (rep != &nfsreqh) { 52840117Smckusick /* The socket, being connected, will only queue matches */ 52940117Smckusick if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) { 53038414Smckusick /* Found it.. 
*/ 53140117Smckusick if (rep->r_mrep) /* Already there - duplicate */ 53240117Smckusick break; 53338414Smckusick rep->r_mrep = m; 53438414Smckusick while (m) { 53538414Smckusick if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 53638414Smckusick panic("nfs_soreply 3"); 53738414Smckusick sbfree(&so->so_rcv, m); 53838414Smckusick m = so->so_rcv.sb_mb = m->m_next; 53938414Smckusick } 54038414Smckusick so->so_rcv.sb_mb = nextrecord; 54140117Smckusick if (rep->r_flags & R_TIMING) { 54240117Smckusick nfs_updatetimer(mntp); 54340117Smckusick rep->r_flags &= ~R_TIMING; 54440117Smckusick mntp->nm_rtt = -1; /* re-arm timer */ 54540117Smckusick } 54640117Smckusick if (rep->r_flags & R_SENT) { 54740117Smckusick rep->r_flags &= ~R_SENT; 54840117Smckusick --mntp->nm_hostinfo->nh_sent; 54940117Smckusick /* If count now 0, want to initiate new req */ 55040117Smckusick } 55140117Smckusick if (rep == myrep) { /* This is success */ 55240117Smckusick if (logged) 55340117Smckusick uprintf("NFS server %s responded\n", 55440351Smckusick mntp->nm_mountp->m_stat.f_mntfromname); 55538414Smckusick goto release; 55640117Smckusick } 55740117Smckusick /* Else wake up other sleeper and wait for next */ 55840117Smckusick sbunlock(&so->so_rcv); 55940117Smckusick sorwakeup(so); 56040117Smckusick splx(s); 56140117Smckusick goto restart; 56238414Smckusick } 56338414Smckusick rep = rep->r_next; 56440117Smckusick } 56538414Smckusick } 56640117Smckusick /* If not matched to request, drop it */ 56740117Smckusick nfsstats.rpcunexpected++; 56838414Smckusick dropit: 56940117Smckusick sbdroprecord(&so->so_rcv); 57038414Smckusick sbunlock(&so->so_rcv); 57138414Smckusick splx(s); 57238414Smckusick goto restart; 57340117Smckusick 57438414Smckusick release: 57538414Smckusick sbunlock(&so->so_rcv); 57638414Smckusick splx(s); 57738414Smckusick return (error); 57838414Smckusick } 57938414Smckusick 58038414Smckusick /* 58138414Smckusick * nfs_request - goes something like this 58238414Smckusick * - fill in request 
struct 58338414Smckusick * - links it into list 58438414Smckusick * - calls nfs_sosend() for first transmit 58538414Smckusick * - calls nfs_soreceive() to get reply 58638414Smckusick * - break down rpc header and return with nfs reply pointed to 58738414Smckusick * by mrep or error 58838414Smckusick * nb: always frees up mreq mbuf list 58938414Smckusick */ 59040117Smckusick nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp) 59138414Smckusick struct vnode *vp; 59238414Smckusick struct mbuf *mreq; 59338414Smckusick u_long xid; 59440117Smckusick int idem; 59538414Smckusick struct mount *mp; 59638414Smckusick struct mbuf **mrp; 59738414Smckusick struct mbuf **mdp; 59838414Smckusick caddr_t *dposp; 59938414Smckusick { 60038414Smckusick register struct mbuf *m, *mrep; 60138414Smckusick register struct nfsreq *rep; 60238414Smckusick register u_long *p; 60338414Smckusick register int len; 60438414Smckusick struct nfsmount *mntp; 60538414Smckusick struct mbuf *md; 60639344Smckusick struct nfsreq *reph; 60738414Smckusick caddr_t dpos; 60838414Smckusick char *cp2; 60938414Smckusick int t1; 61038414Smckusick int s; 61138414Smckusick int error; 61238414Smckusick 61338414Smckusick mntp = vfs_to_nfs(mp); 61438414Smckusick m = mreq; 61538414Smckusick MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); 61638414Smckusick rep->r_xid = xid; 61738414Smckusick rep->r_mntp = mntp; 61838414Smckusick rep->r_vp = vp; 61938414Smckusick if (mntp->nm_flag & NFSMNT_SOFT) 62040117Smckusick rep->r_retry = mntp->nm_retry; 62138414Smckusick else 62240117Smckusick rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ 62340117Smckusick rep->r_flags = rep->r_rexmit = 0; 62440117Smckusick /* Idempotency: add N * MINTIMEO to requests if not, else use 0 */ 62540117Smckusick rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO); 62638414Smckusick rep->r_mrep = NULL; 62738414Smckusick rep->r_mreq = m; 62838414Smckusick len = 0; 62938414Smckusick while (m) { 63038414Smckusick 
len += m->m_len; 63138414Smckusick m = m->m_next; 63238414Smckusick } 63338414Smckusick rep->r_msiz = len; 63438414Smckusick 63540117Smckusick /* 63640117Smckusick * Do the client side RPC. 63740117Smckusick */ 63840117Smckusick nfsstats.rpcrequests++; 63940117Smckusick s = splnet(); 64040117Smckusick /* Chain request into list of outstanding requests. Be sure 64140117Smckusick * to put it LAST so timer finds oldest requests first. */ 64239344Smckusick reph = &nfsreqh; 64339344Smckusick if (reph->r_prev == NULL) { 64439344Smckusick reph->r_next = rep; 64539344Smckusick rep->r_prev = reph; 64639344Smckusick } else { 64739344Smckusick reph->r_prev->r_next = rep; 64839344Smckusick rep->r_prev = reph->r_prev; 64939344Smckusick } 65039344Smckusick reph->r_prev = rep; 65139344Smckusick rep->r_next = reph; 65240117Smckusick /* 65340117Smckusick * If backing off another request or avoiding congestion, don't 65440117Smckusick * send this one now but let timer do it. If not timing a request, 65540117Smckusick * do it now. 65640117Smckusick */ 65740117Smckusick if (mntp->nm_hostinfo->nh_sent > 0 && 65840117Smckusick (mntp->nm_hostinfo->nh_currexmit != 0 || 65940117Smckusick mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) { 66040117Smckusick splx(s); 66140117Smckusick goto skipsend; 66240117Smckusick } 66340117Smckusick ++mntp->nm_hostinfo->nh_sent; /* Inconsistent if can't NFSMCOPY */ 66440117Smckusick rep->r_flags |= R_SENT; /* But not a catastrophe */ 66540117Smckusick if (mntp->nm_rtt == -1) { 66640117Smckusick mntp->nm_rtt = 0; 66740117Smckusick rep->r_flags |= R_TIMING; 66840117Smckusick } 66938414Smckusick splx(s); 67038414Smckusick 67138414Smckusick /* 67240117Smckusick * If we can get a packet to send, send it off... 
67338414Smckusick * otherwise the timer will retransmit later 67438414Smckusick */ 67540117Smckusick m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT); 67638414Smckusick if (m != NULL) 67740117Smckusick (void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len); 67840117Smckusick /* 67940117Smckusick * Wait for the reply from our send or the timer's. 68040117Smckusick */ 68140117Smckusick skipsend: 68240117Smckusick error = nfs_dgreply(mntp->nm_so, mntp, rep); 68338414Smckusick 68440117Smckusick /* 68540117Smckusick * RPC done, unlink the request. 68640117Smckusick */ 68738414Smckusick s = splnet(); 68838414Smckusick rep->r_prev->r_next = rep->r_next; 68939344Smckusick rep->r_next->r_prev = rep->r_prev; 69038414Smckusick splx(s); 69138414Smckusick m_freem(rep->r_mreq); 69238414Smckusick mrep = md = rep->r_mrep; 69338414Smckusick FREE((caddr_t)rep, M_NFSREQ); 69438414Smckusick if (error) 69538414Smckusick return (error); 69638414Smckusick 69738414Smckusick /* 69838414Smckusick * break down the rpc header and check if ok 69938414Smckusick */ 70038414Smckusick dpos = mtod(md, caddr_t); 70138414Smckusick nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED); 70238414Smckusick p += 2; 70338414Smckusick if (*p++ == rpc_msgdenied) { 70438414Smckusick if (*p == rpc_mismatch) 70538414Smckusick error = EOPNOTSUPP; 70638414Smckusick else 70738414Smckusick error = EACCES; 70838414Smckusick m_freem(mrep); 70938414Smckusick return (error); 71038414Smckusick } 71138414Smckusick /* 71238414Smckusick * skip over the auth_verf, someday we may want to cache auth_short's 71338414Smckusick * for nfs_reqhead(), but for now just dump it 71438414Smckusick */ 71538414Smckusick if (*++p != 0) { 71638414Smckusick len = nfsm_rndup(fxdr_unsigned(long, *p)); 71738414Smckusick nfsm_adv(len); 71838414Smckusick } 71938414Smckusick nfsm_disect(p, u_long *, NFSX_UNSIGNED); 72038414Smckusick /* 0 == ok */ 72138414Smckusick if (*p == 0) { 72238414Smckusick nfsm_disect(p, u_long *, NFSX_UNSIGNED); 72338414Smckusick if (*p 
!= 0) { 72438414Smckusick error = fxdr_unsigned(int, *p); 72538414Smckusick m_freem(mrep); 72638414Smckusick return (error); 72738414Smckusick } 72838414Smckusick *mrp = mrep; 72938414Smckusick *mdp = md; 73038414Smckusick *dposp = dpos; 73138414Smckusick return (0); 73238414Smckusick } 73338414Smckusick m_freem(mrep); 73438414Smckusick return (EPROTONOSUPPORT); 73538414Smckusick nfsmout: 73638414Smckusick return (error); 73738414Smckusick } 73838414Smckusick 73938414Smckusick /* 74038414Smckusick * Get a request for the server main loop 74138414Smckusick * - receive a request via. nfs_soreceive() 74238414Smckusick * - verify it 74338414Smckusick * - fill in the cred struct. 74438414Smckusick */ 74539754Smckusick nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr, 74639754Smckusick msk, mtch) 74738414Smckusick struct socket *so; 74838414Smckusick u_long prog; 74938414Smckusick u_long vers; 75038414Smckusick int maxproc; 75138414Smckusick struct mbuf **nam; 75238414Smckusick struct mbuf **mrp; 75338414Smckusick struct mbuf **mdp; 75438414Smckusick caddr_t *dposp; 75538414Smckusick u_long *retxid; 75638414Smckusick u_long *proc; 75738414Smckusick register struct ucred *cr; 75839754Smckusick u_long msk; 75939754Smckusick u_long mtch; 76038414Smckusick { 76138414Smckusick register int i; 76239494Smckusick register u_long *p; 76339494Smckusick register long t1; 76439494Smckusick caddr_t dpos, cp2; 76539494Smckusick int error = 0; 76639494Smckusick struct mbuf *mrep, *md; 76739494Smckusick int len; 76838414Smckusick 76940117Smckusick if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep)) 77038414Smckusick return (error); 77138414Smckusick md = mrep; 77238414Smckusick dpos = mtod(mrep, caddr_t); 77338414Smckusick nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED); 77438414Smckusick *retxid = *p++; 77538414Smckusick if (*p++ != rpc_call) { 77638414Smckusick m_freem(mrep); 77738414Smckusick return (ERPCMISMATCH); 77838414Smckusick } 77938414Smckusick if (*p++ 
!= rpc_vers) { 78038414Smckusick m_freem(mrep); 78138414Smckusick return (ERPCMISMATCH); 78238414Smckusick } 78338414Smckusick if (*p++ != prog) { 78438414Smckusick m_freem(mrep); 78538414Smckusick return (EPROGUNAVAIL); 78638414Smckusick } 78738414Smckusick if (*p++ != vers) { 78838414Smckusick m_freem(mrep); 78938414Smckusick return (EPROGMISMATCH); 79038414Smckusick } 79138414Smckusick *proc = fxdr_unsigned(u_long, *p++); 79238414Smckusick if (*proc == NFSPROC_NULL) { 79338414Smckusick *mrp = mrep; 79438414Smckusick return (0); 79538414Smckusick } 79638414Smckusick if (*proc > maxproc || *p++ != rpc_auth_unix) { 79738414Smckusick m_freem(mrep); 79838414Smckusick return (EPROCUNAVAIL); 79938414Smckusick } 80039494Smckusick (void) fxdr_unsigned(int, *p++); 80139494Smckusick len = fxdr_unsigned(int, *++p); 80239494Smckusick nfsm_adv(nfsm_rndup(len)); 80338414Smckusick nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED); 80438414Smckusick cr->cr_uid = fxdr_unsigned(uid_t, *p++); 80538414Smckusick cr->cr_gid = fxdr_unsigned(gid_t, *p++); 80639494Smckusick len = fxdr_unsigned(int, *p); 80739494Smckusick if (len > 10) { 80838414Smckusick m_freem(mrep); 80938414Smckusick return (EBADRPC); 81038414Smckusick } 81139494Smckusick nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED); 81239494Smckusick for (i = 1; i <= len; i++) 81338414Smckusick cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++); 81439494Smckusick cr->cr_ngroups = len + 1; 81538414Smckusick /* 81638414Smckusick * Do we have any use for the verifier. 81738414Smckusick * According to the "Remote Procedure Call Protocol Spec." it 81838414Smckusick * should be AUTH_NULL, but some clients make it AUTH_UNIX? 
81938414Smckusick * For now, just skip over it 82038414Smckusick */ 82139494Smckusick len = fxdr_unsigned(int, *++p); 82239494Smckusick if (len > 0) 82339494Smckusick nfsm_adv(nfsm_rndup(len)); 82438414Smckusick *mrp = mrep; 82538414Smckusick *mdp = md; 82638414Smckusick *dposp = dpos; 82738414Smckusick return (0); 82838414Smckusick nfsmout: 82938414Smckusick return (error); 83038414Smckusick } 83138414Smckusick 83238414Smckusick /* 83338414Smckusick * Generate the rpc reply header 83438414Smckusick * siz arg. is used to decide if adding a cluster is worthwhile 83538414Smckusick */ 83638414Smckusick nfs_rephead(siz, retxid, err, mrq, mbp, bposp) 83738414Smckusick int siz; 83838414Smckusick u_long retxid; 83938414Smckusick int err; 84038414Smckusick struct mbuf **mrq; 84138414Smckusick struct mbuf **mbp; 84238414Smckusick caddr_t *bposp; 84338414Smckusick { 84439494Smckusick register u_long *p; 84539494Smckusick register long t1; 84639494Smckusick caddr_t bpos; 84739494Smckusick struct mbuf *mreq, *mb, *mb2; 84838414Smckusick 84938414Smckusick NFSMGETHDR(mreq); 85038414Smckusick mb = mreq; 85138414Smckusick if ((siz+RPC_REPLYSIZ) > MHLEN) 85238414Smckusick NFSMCLGET(mreq, M_WAIT); 85338414Smckusick p = mtod(mreq, u_long *); 85438414Smckusick mreq->m_len = 6*NFSX_UNSIGNED; 85538414Smckusick bpos = ((caddr_t)p)+mreq->m_len; 85638414Smckusick *p++ = retxid; 85738414Smckusick *p++ = rpc_reply; 85838414Smckusick if (err == ERPCMISMATCH) { 85938414Smckusick *p++ = rpc_msgdenied; 86038414Smckusick *p++ = rpc_mismatch; 86138414Smckusick *p++ = txdr_unsigned(2); 86238414Smckusick *p = txdr_unsigned(2); 86338414Smckusick } else { 86438414Smckusick *p++ = rpc_msgaccepted; 86538414Smckusick *p++ = 0; 86638414Smckusick *p++ = 0; 86738414Smckusick switch (err) { 86838414Smckusick case EPROGUNAVAIL: 86938414Smckusick *p = txdr_unsigned(RPC_PROGUNAVAIL); 87038414Smckusick break; 87138414Smckusick case EPROGMISMATCH: 87238414Smckusick *p = txdr_unsigned(RPC_PROGMISMATCH); 
87338414Smckusick nfsm_build(p, u_long *, 2*NFSX_UNSIGNED); 87438414Smckusick *p++ = txdr_unsigned(2); 87538414Smckusick *p = txdr_unsigned(2); /* someday 3 */ 87638414Smckusick break; 87738414Smckusick case EPROCUNAVAIL: 87838414Smckusick *p = txdr_unsigned(RPC_PROCUNAVAIL); 87938414Smckusick break; 88038414Smckusick default: 88138414Smckusick *p = 0; 88238414Smckusick if (err != VNOVAL) { 88338414Smckusick nfsm_build(p, u_long *, NFSX_UNSIGNED); 88438414Smckusick *p = txdr_unsigned(err); 88538414Smckusick } 88638414Smckusick break; 88738414Smckusick }; 88838414Smckusick } 88938414Smckusick *mrq = mreq; 89038414Smckusick *mbp = mb; 89138414Smckusick *bposp = bpos; 89238414Smckusick if (err != 0 && err != VNOVAL) 89338414Smckusick nfsstats.srvrpc_errs++; 89438414Smckusick return (0); 89538414Smckusick } 89638414Smckusick 89738414Smckusick /* 89838414Smckusick * Nfs timer routine 89938414Smckusick * Scan the nfsreq list and retranmit any requests that have timed out 90038414Smckusick * To avoid retransmission attempts on STREAM sockets (in the future) make 90140117Smckusick * sure to set the r_retry field to 0 (implies nm_retry == 0). 
90238414Smckusick */ 90338414Smckusick nfs_timer() 90438414Smckusick { 90538414Smckusick register struct nfsreq *rep; 90638414Smckusick register struct mbuf *m; 90738414Smckusick register struct socket *so; 90840117Smckusick register struct nfsmount *mntp; 90940117Smckusick int s, error; 91038414Smckusick 91138414Smckusick s = splnet(); 91238414Smckusick rep = nfsreqh.r_next; 91340117Smckusick if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) { 91440117Smckusick mntp = rep->r_mntp; 91540117Smckusick if (rep->r_flags & R_TIMING) /* update rtt in mount */ 91640117Smckusick mntp->nm_rtt++; 91740117Smckusick /* If not timed out or reply already received, skip */ 91840117Smckusick if (++rep->r_timer < mntp->nm_rto || rep->r_mrep) 91940117Smckusick continue; 92040117Smckusick /* Do backoff and save new timeout in mount */ 92140117Smckusick if (rep->r_flags & R_TIMING) { 92240117Smckusick nfs_backofftimer(mntp); 92340117Smckusick rep->r_flags &= ~R_TIMING; 92440117Smckusick mntp->nm_rtt = -1; 92540117Smckusick } 92640117Smckusick if (rep->r_flags & R_SENT) { 92740117Smckusick rep->r_flags &= ~R_SENT; 92840117Smckusick --mntp->nm_hostinfo->nh_sent; 92940117Smckusick } 93040117Smckusick /* Check state of socket, cf nfs_send */ 93140117Smckusick so = mntp->nm_so; 93240117Smckusick if (error = nfs_sockerr(so, 1)) 93340117Smckusick goto wakeup; 93440117Smckusick if (sbspace(&so->so_snd) < rep->r_msiz) 93540117Smckusick goto wakeup; 93640117Smckusick /* Check for too many retries, cf nfs_dgreply */ 93740117Smckusick if (++rep->r_rexmit > NFS_MAXREXMIT) /* clip */ 93840117Smckusick rep->r_rexmit = NFS_MAXREXMIT; 93940117Smckusick if (rep->r_rexmit > rep->r_retry) /* too many */ 94040117Smckusick goto wakeup; 94140117Smckusick /* Check for congestion control, cf nfs_request */ 94240117Smckusick if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window) 94340117Smckusick goto wakeup; 94440117Smckusick /* Send it! 
*/ 94540117Smckusick m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT); 94640117Smckusick if (m == NULL) 94740117Smckusick goto wakeup; 94840117Smckusick nfsstats.rpcretries++; 94938414Smckusick #ifdef MGETHDR 95040117Smckusick m->m_pkthdr.len = rep->r_msiz; 95138414Smckusick #endif 95240117Smckusick (void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m, 95340327Ssklower (struct mbuf *)0, (struct mbuf *)0); 95440117Smckusick 95540117Smckusick /* We need to time the request even though we're 95640117Smckusick * retransmitting, in order to maintain backoff. */ 95740117Smckusick mntp->nm_rtt = 0; 95840117Smckusick ++mntp->nm_hostinfo->nh_sent; 95940117Smckusick rep->r_flags |= (R_SENT|R_TIMING); 96040117Smckusick rep->r_timer = rep->r_timerinit; 96140117Smckusick wakeup: 96240117Smckusick /* If error or interruptible mount, give user a look */ 96340117Smckusick if (error || (mntp->nm_flag & NFSMNT_INT)) 96440117Smckusick sorwakeup(so); 96540117Smckusick } 96640117Smckusick splx(s); 96740117Smckusick timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ); 96840117Smckusick } 96940117Smckusick 97040117Smckusick /* 97140117Smckusick * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is 97240117Smckusick * used here. The timer state is held in the nfsmount structure and 97340117Smckusick * a single request is used to clock the response. When successful 97440117Smckusick * the rtt smoothing in nfs_updatetimer is used, when failed the backoff 97540117Smckusick * is done by nfs_backofftimer. We also log failure messages in these 97640117Smckusick * routines. 97740117Smckusick * 97840117Smckusick * Congestion variables are held in the nfshost structure which 97940117Smckusick * is referenced by nfsmounts and shared per-server. This separation 98040117Smckusick * makes it possible to do per-mount timing which allows varying disk 98140117Smckusick * access times to be dealt with, while preserving a network oriented 98240117Smckusick * congestion control scheme. 
 *
 * The windowing implements the Jacobson/Karels slowstart algorithm
 * with adjusted scaling factors. We start with one request, then send
 * 4 more after each success until the ssthresh limit is reached, then
 * we increment at a rate proportional to the window. On failure, we
 * remember 3/4 the current window and clamp the send limit to 1. Note
 * ICMP source quench is not reflected in so->so_error so we ignore that
 * for now.
 *
 * NFS behaves much more like a transport protocol with these changes,
 * shedding the teenage pedal-to-the-metal tendencies of "other"
 * implementations.
 *
 * Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
 */

/*
 * The TCP algorithm was not forgiving enough. Because the NFS server
 * responds only after performing lookups/diskio/etc, we have to be
 * more prepared to accept a spiky variance.
The TCP algorithm is: 100340117Smckusick * TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1) 100440117Smckusick */ 100540117Smckusick #define NFS_RTO(mntp) (((mntp)->nm_srtt >> 3) + (mntp)->nm_rttvar) 100640117Smckusick 100740117Smckusick nfs_updatetimer(mntp) 100840117Smckusick register struct nfsmount *mntp; 100940117Smckusick { 101040117Smckusick register struct nfshost *nfshp = mntp->nm_hostinfo; 101140117Smckusick 101240117Smckusick /* If retransmitted, clear and return */ 101340117Smckusick if (mntp->nm_rexmit || nfshp->nh_currexmit) { 101440117Smckusick if (nfshp->nh_currexmit >= nfsrexmtthresh) 101540351Smckusick nfs_log("NFS server %s OK\n", 101640351Smckusick mntp->nm_mountp->m_stat.f_mntfromname); 101740117Smckusick mntp->nm_rexmit = nfshp->nh_currexmit = 0; 101840117Smckusick return; 101940117Smckusick } 102040117Smckusick /* If have a measurement, do smoothing */ 102140117Smckusick if (mntp->nm_srtt) { 102240117Smckusick register short delta; 102340117Smckusick delta = mntp->nm_rtt - (mntp->nm_srtt >> 3); 102440117Smckusick if ((mntp->nm_srtt += delta) <= 0) 102540117Smckusick mntp->nm_srtt = 1; 102640117Smckusick if (delta < 0) 102740117Smckusick delta = -delta; 102840117Smckusick delta -= (mntp->nm_rttvar >> 2); 102940117Smckusick if ((mntp->nm_rttvar += delta) <= 0) 103040117Smckusick mntp->nm_rttvar = 1; 103140117Smckusick /* Else initialize */ 103240117Smckusick } else { 103340117Smckusick mntp->nm_rttvar = mntp->nm_rtt << 1; 103440117Smckusick if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2; 103540117Smckusick mntp->nm_srtt = mntp->nm_rttvar << 2; 103640117Smckusick } 103740117Smckusick /* Compute new Retransmission TimeOut and clip */ 103840117Smckusick mntp->nm_rto = NFS_RTO(mntp); 103940117Smckusick if (mntp->nm_rto < NFS_MINTIMEO) 104040117Smckusick mntp->nm_rto = NFS_MINTIMEO; 104140117Smckusick else if (mntp->nm_rto > NFS_MAXTIMEO) 104240117Smckusick mntp->nm_rto = NFS_MAXTIMEO; 104340117Smckusick nfshp->nh_currto = 
mntp->nm_rto; 104440117Smckusick 104540117Smckusick /* Update window estimate */ 104640117Smckusick if (nfshp->nh_window < nfshp->nh_ssthresh) /* quickly */ 104740117Smckusick nfshp->nh_window += 4; 104840117Smckusick else { /* slowly */ 104940117Smckusick register long incr = ++nfshp->nh_winext; 105040117Smckusick incr = (incr * incr) / nfshp->nh_window; 105140117Smckusick if (incr > 0) { 105240117Smckusick nfshp->nh_winext = 0; 105340117Smckusick ++nfshp->nh_window; 105440117Smckusick } 105540117Smckusick } 105640117Smckusick if (nfshp->nh_window > NFS_MAXWINDOW) 105740117Smckusick nfshp->nh_window = NFS_MAXWINDOW; 105840117Smckusick } 105940117Smckusick 106040117Smckusick nfs_backofftimer(mntp) 106140117Smckusick register struct nfsmount *mntp; 106240117Smckusick { 106340117Smckusick register struct nfshost *nfshp = mntp->nm_hostinfo; 106440117Smckusick register unsigned long newrto; 106540117Smckusick 106640117Smckusick /* Clip shift count */ 106740117Smckusick if (++mntp->nm_rexmit > 8 * sizeof mntp->nm_rto) 106840117Smckusick mntp->nm_rexmit = 8 * sizeof mntp->nm_rto; 106940117Smckusick /* Back off RTO exponentially */ 107040117Smckusick newrto = NFS_RTO(mntp); 107140117Smckusick newrto <<= (mntp->nm_rexmit - 1); 107240117Smckusick if (newrto == 0 || newrto > NFS_MAXTIMEO) 107340117Smckusick newrto = NFS_MAXTIMEO; 107440117Smckusick mntp->nm_rto = nfshp->nh_currto = newrto; 107540117Smckusick 107640117Smckusick /* If too many retries, message, assume a bogus RTT and re-measure */ 107740117Smckusick if (nfshp->nh_currexmit < mntp->nm_rexmit) { 107840117Smckusick nfshp->nh_currexmit = mntp->nm_rexmit; 107940117Smckusick if (nfshp->nh_currexmit >= nfsrexmtthresh) { 108040117Smckusick if (nfshp->nh_currexmit == nfsrexmtthresh) { 108140117Smckusick nfs_log("NFS server %s not responding\n", 108240351Smckusick mntp->nm_mountp->m_stat.f_mntfromname); 108340117Smckusick mntp->nm_rttvar += (mntp->nm_srtt >> 2); 108440117Smckusick mntp->nm_srtt = 0; 108538414Smckusick } 
108640117Smckusick /* The routing invalidation should be a usrreq PRU */ 108740117Smckusick if (mtod(nfshp->nh_sockaddr, 108840117Smckusick struct sockaddr *)->sa_family == AF_INET) 108940117Smckusick in_losing(mntp->nm_so->so_pcb); 109038414Smckusick } 109138414Smckusick } 109240117Smckusick /* Close down window but remember this point (3/4 current) for later */ 109340117Smckusick nfshp->nh_ssthresh = ((nfshp->nh_window << 1) + nfshp->nh_window) >> 2; 109440117Smckusick nfshp->nh_window = 1; 109540117Smckusick nfshp->nh_winext = 0; 109638414Smckusick } 109738414Smckusick 109838414Smckusick /* 109940117Smckusick * Not all errors are fatal. The closed checks deal 110040117Smckusick * with errors a little strangely. 110138414Smckusick */ 110240117Smckusick 110340117Smckusick nfs_sockerr(so, sending) 110440117Smckusick struct socket *so; 110540117Smckusick int sending; 110638414Smckusick { 110740117Smckusick if (sending && (so->so_state & SS_CANTSENDMORE)) { 110840117Smckusick so->so_error = EPIPE; 110940117Smckusick return (EPIPE); 111040117Smckusick } 111140117Smckusick 111240117Smckusick switch (so->so_error) { /* inhibit certain errors */ 111340117Smckusick case ENETDOWN: 111440117Smckusick case ENETUNREACH: 111540117Smckusick case EHOSTDOWN: 111640117Smckusick case EHOSTUNREACH: 111740117Smckusick so->so_error = 0; 111840117Smckusick case 0: 111940117Smckusick break; 112040117Smckusick default: /* return all others */ 112140117Smckusick printf("nfs_sockerr: error %d on %s\n", so->so_error, 112240117Smckusick sending?"send":"receive"); 112340117Smckusick return (so->so_error); 112440117Smckusick } 112540117Smckusick 112640117Smckusick if (!sending && (so->so_state & SS_CANTRCVMORE)) { 112740117Smckusick so->so_error = 0; /* (no error) */ 112840117Smckusick return (EPIPE); 112940117Smckusick } 113040117Smckusick return (so->so_error); 113138414Smckusick } 1132