138414Smckusick /* 238414Smckusick * Copyright (c) 1989 The Regents of the University of California. 338414Smckusick * All rights reserved. 438414Smckusick * 538414Smckusick * This code is derived from software contributed to Berkeley by 638414Smckusick * Rick Macklem at The University of Guelph. 738414Smckusick * 838414Smckusick * Redistribution and use in source and binary forms are permitted 938414Smckusick * provided that the above copyright notice and this paragraph are 1038414Smckusick * duplicated in all such forms and that any documentation, 1138414Smckusick * advertising materials, and other materials related to such 1238414Smckusick * distribution and use acknowledge that the software was developed 1338414Smckusick * by the University of California, Berkeley. The name of the 1438414Smckusick * University may not be used to endorse or promote products derived 1538414Smckusick * from this software without specific prior written permission. 1638414Smckusick * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 1738414Smckusick * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 1838414Smckusick * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 1938414Smckusick * 20*40117Smckusick * @(#)nfs_socket.c 7.6 (Berkeley) 02/16/90 2138414Smckusick */ 2238414Smckusick 2338414Smckusick /* 2438414Smckusick * Socket operations for use by nfs (similar to uipc_socket.c, but never 2538414Smckusick * with copies to/from a uio vector) 26*40117Smckusick * NB: For now, they only work for datagram sockets. 
2738414Smckusick * (Use on stream sockets would require some record boundary mark in the 2839754Smckusick * stream as defined by "RPC: Remote Procedure Call Protocol 2939754Smckusick * Specification" RFC1057 Section 10) 3038414Smckusick * and different versions of send, receive and reply that do not assume 3138414Smckusick * an atomic protocol 3238414Smckusick */ 3338414Smckusick 3438414Smckusick #include "types.h" 3538414Smckusick #include "param.h" 3638414Smckusick #include "uio.h" 3738414Smckusick #include "user.h" 38*40117Smckusick #include "proc.h" 39*40117Smckusick #include "signal.h" 4038414Smckusick #include "mount.h" 4138414Smckusick #include "kernel.h" 4238414Smckusick #include "malloc.h" 4338414Smckusick #include "mbuf.h" 4438414Smckusick #include "vnode.h" 4538414Smckusick #include "domain.h" 4638414Smckusick #include "protosw.h" 4738414Smckusick #include "socket.h" 4838414Smckusick #include "socketvar.h" 4938414Smckusick #include "rpcv2.h" 5038414Smckusick #include "nfsv2.h" 5138414Smckusick #include "nfs.h" 5238414Smckusick #include "xdr_subs.h" 5338414Smckusick #include "nfsm_subs.h" 5438414Smckusick #include "nfsmount.h" 5538414Smckusick 56*40117Smckusick #include "syslog.h" 57*40117Smckusick #define nfs_log(message, host) log(LOG_ERR, message, host) 58*40117Smckusick 5938414Smckusick #define TRUE 1 6038414Smckusick 6138414Smckusick /* set lock on sockbuf sb, sleep at neg prio */ 6238414Smckusick #define nfs_sblock(sb) { \ 6338414Smckusick while ((sb)->sb_flags & SB_LOCK) { \ 6438414Smckusick (sb)->sb_flags |= SB_WANT; \ 6538414Smckusick sleep((caddr_t)&(sb)->sb_flags, PZERO-1); \ 6638414Smckusick } \ 6738414Smckusick (sb)->sb_flags |= SB_LOCK; \ 6838414Smckusick } 69*40117Smckusick /* 70*40117Smckusick * nfs_sbwait() is simply sbwait() but at a negative priority so that it 71*40117Smckusick * can not be interrupted by a signal. 
72*40117Smckusick */ 73*40117Smckusick nfs_sbwait(sb) 74*40117Smckusick struct sockbuf *sb; 75*40117Smckusick { 76*40117Smckusick sb->sb_flags |= SB_WAIT; 77*40117Smckusick sleep((caddr_t)&sb->sb_cc, PZERO-2); 78*40117Smckusick } 7938414Smckusick 8038414Smckusick /* 8138414Smckusick * External data, mostly RPC constants in XDR form 8238414Smckusick */ 8338414Smckusick extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, 8438414Smckusick rpc_msgaccepted, rpc_call; 8538414Smckusick extern u_long nfs_prog, nfs_vers; 8638414Smckusick int nfsrv_null(), 8738414Smckusick nfsrv_getattr(), 8838414Smckusick nfsrv_setattr(), 8938414Smckusick nfsrv_lookup(), 9038414Smckusick nfsrv_readlink(), 9138414Smckusick nfsrv_read(), 9238414Smckusick nfsrv_write(), 9338414Smckusick nfsrv_create(), 9438414Smckusick nfsrv_remove(), 9538414Smckusick nfsrv_rename(), 9638414Smckusick nfsrv_link(), 9738414Smckusick nfsrv_symlink(), 9838414Smckusick nfsrv_mkdir(), 9938414Smckusick nfsrv_rmdir(), 10038414Smckusick nfsrv_readdir(), 10138414Smckusick nfsrv_statfs(), 10238414Smckusick nfsrv_noop(); 10338414Smckusick 10438414Smckusick int (*nfsrv_procs[NFS_NPROCS])() = { 10538414Smckusick nfsrv_null, 10638414Smckusick nfsrv_getattr, 10738414Smckusick nfsrv_setattr, 10838414Smckusick nfsrv_noop, 10938414Smckusick nfsrv_lookup, 11038414Smckusick nfsrv_readlink, 11138414Smckusick nfsrv_read, 11238414Smckusick nfsrv_noop, 11338414Smckusick nfsrv_write, 11438414Smckusick nfsrv_create, 11538414Smckusick nfsrv_remove, 11638414Smckusick nfsrv_rename, 11738414Smckusick nfsrv_link, 11838414Smckusick nfsrv_symlink, 11938414Smckusick nfsrv_mkdir, 12038414Smckusick nfsrv_rmdir, 12138414Smckusick nfsrv_readdir, 12238414Smckusick nfsrv_statfs, 12338414Smckusick }; 12438414Smckusick 125*40117Smckusick struct nfshost *nfshosth; 126*40117Smckusick struct nfsreq nfsreqh; 127*40117Smckusick int nfsrexmtthresh = NFS_FISHY; 12838414Smckusick 12938414Smckusick /* 130*40117Smckusick * Initialize 
sockets and per-host congestion for a new NFS connection. 131*40117Smckusick * We do not free the sockaddr if error. 13238414Smckusick */ 133*40117Smckusick nfs_connect(nmp, saddr) 134*40117Smckusick register struct nfsmount *nmp; 135*40117Smckusick struct mbuf *saddr; 136*40117Smckusick { 137*40117Smckusick int s, error, srvaddrlen; 138*40117Smckusick struct mbuf *m; 139*40117Smckusick register struct nfshost *nfshp; 140*40117Smckusick 141*40117Smckusick nmp->nm_so = 0; 142*40117Smckusick if (error = socreate(mtod(saddr, struct sockaddr *)->sa_family, 143*40117Smckusick &nmp->nm_so, SOCK_DGRAM, 0)) 144*40117Smckusick goto bad; 145*40117Smckusick 146*40117Smckusick /* Unix sockets do not provide a local bind for server reply */ 147*40117Smckusick if (mtod(saddr, struct sockaddr *)->sa_family == AF_UNIX) { 148*40117Smckusick struct sockaddr *sa; 149*40117Smckusick static char client[] = "/tmp/.nfs/nfsclient##"; 150*40117Smckusick static int serial; 151*40117Smckusick int firstserial; 152*40117Smckusick m = m_getclr(M_WAIT, MT_SONAME); 153*40117Smckusick if (m == NULL) { 154*40117Smckusick error = ENOBUFS; 155*40117Smckusick goto bad; 156*40117Smckusick } 157*40117Smckusick m->m_len = sizeof (client) + 2; 158*40117Smckusick sa = mtod(m, struct sockaddr *); 159*40117Smckusick sa->sa_family = AF_UNIX; 160*40117Smckusick #ifdef MSG_TRUNC /* Have sa_len to set? 
*/ 161*40117Smckusick sa->sa_len = m->m_len; 162*40117Smckusick #endif 163*40117Smckusick bcopy(client, sa->sa_data, sizeof(client)); 164*40117Smckusick firstserial = serial; 165*40117Smckusick do { 166*40117Smckusick if (++serial >= 100) serial = 0; 167*40117Smckusick sa->sa_data[19] = (serial / 10) + '0'; 168*40117Smckusick sa->sa_data[20] = (serial % 10) + '0'; 169*40117Smckusick error = sobind(nmp->nm_so, m); 170*40117Smckusick if (firstserial == serial) break; 171*40117Smckusick } while (error == EADDRINUSE); 172*40117Smckusick m_freem(m); 173*40117Smckusick if (error) 174*40117Smckusick goto bad; 175*40117Smckusick } 176*40117Smckusick 177*40117Smckusick if (error = soconnect(nmp->nm_so, saddr)) 178*40117Smckusick goto bad; 179*40117Smckusick error = soreserve(nmp->nm_so, /* get space ! */ 180*40117Smckusick nmp->nm_wsize + 1024, /* one out */ 181*40117Smckusick (nmp->nm_rsize + 1024) * 4); /* four in */ 182*40117Smckusick if (error) 183*40117Smckusick goto bad; 184*40117Smckusick 185*40117Smckusick /* 186*40117Smckusick * Search mount list for existing server entry. 187*40117Smckusick * 188*40117Smckusick * Note, even though we have a sockaddr, it is not quite reliable 189*40117Smckusick * enough to bcmp against. For instance, a sockaddr_in has a 190*40117Smckusick * sin_zero field which is not reliably zeroed by user code (e.g. 191*40117Smckusick * mount). So what we do as an attempt at transport independence 192*40117Smckusick * is to get the peeraddr of our connected socket into a zeroed 193*40117Smckusick * sockaddr. Then we cache that and compare against it. This is 194*40117Smckusick * not exactly perfect. However it is not critical that it be, if 195*40117Smckusick * we cannot match the sockaddr we will simply allocate a new nfshp 196*40117Smckusick * per mount, which will disable the per-host congestion but 197*40117Smckusick * everything else will work as normal. 
198*40117Smckusick */ 199*40117Smckusick m = m_getclr(M_WAIT, MT_SONAME); 200*40117Smckusick if (m && (*(nmp->nm_so->so_proto->pr_usrreq))(nmp->nm_so, PRU_PEERADDR, 201*40117Smckusick (struct mbuf *)0, m, (struct mbuf *)0) == 0) { 202*40117Smckusick m_freem(saddr); 203*40117Smckusick saddr = m; 204*40117Smckusick } else 205*40117Smckusick m_freem(m); 206*40117Smckusick srvaddrlen = saddr->m_len; 207*40117Smckusick 208*40117Smckusick s = splnet(); 209*40117Smckusick 210*40117Smckusick for (nfshp = nfshosth; nfshp; nfshp = nfshp->nh_next) { 211*40117Smckusick if (srvaddrlen != nfshp->nh_salen) 212*40117Smckusick continue; 213*40117Smckusick if (!bcmp(mtod(saddr,caddr_t),mtod(nfshp->nh_sockaddr,caddr_t), 214*40117Smckusick srvaddrlen)) 215*40117Smckusick break; 216*40117Smckusick } 217*40117Smckusick if (nfshp) /* Have an existing mount host */ 218*40117Smckusick m_freem(saddr); 219*40117Smckusick else { 220*40117Smckusick MALLOC(nfshp,struct nfshost *,sizeof *nfshp,M_NFSMNT,M_WAITOK); 221*40117Smckusick bzero((caddr_t)nfshp, sizeof *nfshp); 222*40117Smckusick nfshp->nh_sockaddr = saddr; 223*40117Smckusick nfshp->nh_salen = srvaddrlen; 224*40117Smckusick /* Initialize other non-zero congestion variables */ 225*40117Smckusick nfshp->nh_currto = NFS_TIMEO; 226*40117Smckusick nfshp->nh_window = 1; /* Initial send window */ 227*40117Smckusick nfshp->nh_ssthresh = NFS_MAXWINDOW; /* Slowstart threshold */ 228*40117Smckusick if (nfshosth) nfshosth->nh_prev = nfshp; /* Chain in */ 229*40117Smckusick nfshp->nh_next = nfshosth; 230*40117Smckusick nfshosth = nfshp; 231*40117Smckusick } 232*40117Smckusick nfshp->nh_refcnt++; 233*40117Smckusick splx(s); 234*40117Smckusick nmp->nm_hostinfo = nfshp; 235*40117Smckusick if (nmp->nm_rto == NFS_TIMEO) { 236*40117Smckusick nmp->nm_rto = nfshp->nh_currto; 237*40117Smckusick nmp->nm_rttvar = nmp->nm_rto << 1; 238*40117Smckusick } 239*40117Smckusick return (0); 240*40117Smckusick 241*40117Smckusick bad: 242*40117Smckusick if (nmp->nm_so) 
(void) soclose(nmp->nm_so); 243*40117Smckusick nmp->nm_so = 0; 244*40117Smckusick return (error); 245*40117Smckusick } 246*40117Smckusick 247*40117Smckusick /* 248*40117Smckusick * NFS disconnect. Clean up and unlink. 249*40117Smckusick */ 250*40117Smckusick nfs_disconnect(nmp) 251*40117Smckusick register struct nfsmount *nmp; 252*40117Smckusick { 253*40117Smckusick register struct nfshost *nfshp; 254*40117Smckusick 255*40117Smckusick if (nmp->nm_so) 256*40117Smckusick soclose(nmp->nm_so); 257*40117Smckusick nmp->nm_so = 0; 258*40117Smckusick if (nfshp = nmp->nm_hostinfo) { 259*40117Smckusick int s = splnet(); 260*40117Smckusick if (--nfshp->nh_refcnt <= 0) { 261*40117Smckusick if (nfshp->nh_next) 262*40117Smckusick nfshp->nh_next->nh_prev = nfshp->nh_prev; 263*40117Smckusick if (nfshp->nh_prev) 264*40117Smckusick nfshp->nh_prev->nh_next = nfshp->nh_next; 265*40117Smckusick else 266*40117Smckusick nfshosth = nfshp->nh_next; 267*40117Smckusick /* If unix family, remove the nfsclient from /tmp */ 268*40117Smckusick if (mtod(nfshp->nh_sockaddr, 269*40117Smckusick struct sockaddr *)->sa_family == AF_UNIX) { 270*40117Smckusick /* Lookup sa_data, do VOP_REMOVE... */ 271*40117Smckusick } 272*40117Smckusick m_freem(nfshp->nh_sockaddr); 273*40117Smckusick FREE(nfshp, M_NFSMNT); 274*40117Smckusick } 275*40117Smckusick nmp->nm_hostinfo = 0; 276*40117Smckusick splx(s); 277*40117Smckusick } 278*40117Smckusick } 279*40117Smckusick 280*40117Smckusick /* 281*40117Smckusick * This is a stripped down non-interruptible version of sosend(). 
282*40117Smckusick */ 283*40117Smckusick nfs_send(so, nam, top, flags, siz) 28438414Smckusick register struct socket *so; 28538414Smckusick struct mbuf *nam; 28638414Smckusick struct mbuf *top; 28738414Smckusick int flags; 28838414Smckusick int siz; 28938414Smckusick { 290*40117Smckusick int error, s; 29138414Smckusick 29238414Smckusick #ifdef MGETHDR 29338414Smckusick top->m_pkthdr.len = siz; 29438414Smckusick #endif 295*40117Smckusick for (;;) { 296*40117Smckusick nfs_sblock(&so->so_snd); 297*40117Smckusick s = splnet(); 298*40117Smckusick if (error = nfs_sockerr(so, 1)) { 299*40117Smckusick splx(s); 300*40117Smckusick m_freem(top); 301*40117Smckusick break; 302*40117Smckusick } 303*40117Smckusick if (sbspace(&so->so_snd) < siz) { 304*40117Smckusick sbunlock(&so->so_snd); 305*40117Smckusick nfs_sbwait(&so->so_snd); 306*40117Smckusick splx(s); 307*40117Smckusick continue; 308*40117Smckusick } 309*40117Smckusick error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top, 310*40117Smckusick (struct mbuf *)nam, (struct mbuf *)0, (struct mbuf *)0); 31138414Smckusick splx(s); 312*40117Smckusick break; 31338414Smckusick } 31438414Smckusick sbunlock(&so->so_snd); 31538414Smckusick return (error); 31638414Smckusick } 31738414Smckusick 31838414Smckusick /* 319*40117Smckusick * This is a stripped down datagram specific version of soreceive() 32038414Smckusick */ 321*40117Smckusick nfs_dgreceive(so, msk, mtch, aname, mp) 32238414Smckusick register struct socket *so; 32339754Smckusick u_long msk; 32439754Smckusick u_long mtch; 32538414Smckusick struct mbuf **aname; 32638414Smckusick struct mbuf **mp; 32738414Smckusick { 32838414Smckusick register struct mbuf *m; 32938414Smckusick int s, error = 0; 33038414Smckusick struct mbuf *nextrecord; 33138414Smckusick 33238414Smckusick if (aname) 33338414Smckusick *aname = 0; 33438414Smckusick 335*40117Smckusick for (;;) { 336*40117Smckusick sblock(&so->so_rcv); 337*40117Smckusick s = splnet(); 33838414Smckusick 339*40117Smckusick if 
(so->so_rcv.sb_cc == 0) { 340*40117Smckusick if (error = nfs_sockerr(so, 0)) { 341*40117Smckusick so->so_error = 0; 342*40117Smckusick break; 343*40117Smckusick } 34439754Smckusick sbunlock(&so->so_rcv); 345*40117Smckusick sbwait(&so->so_rcv); 34639754Smckusick splx(s); 347*40117Smckusick continue; 34839754Smckusick } 34938414Smckusick m = so->so_rcv.sb_mb; 350*40117Smckusick if (m == 0) 351*40117Smckusick panic("nfs_dgreceive 1"); 352*40117Smckusick nextrecord = m->m_nextpkt; 353*40117Smckusick /* Save sender's address */ 354*40117Smckusick if (m->m_type != MT_SONAME) 355*40117Smckusick panic("nfs_dgreceive 1a"); 35638414Smckusick sbfree(&so->so_rcv, m); 357*40117Smckusick if (aname) { 358*40117Smckusick *aname = m; 359*40117Smckusick so->so_rcv.sb_mb = m->m_next; 360*40117Smckusick m->m_next = 0; 361*40117Smckusick m = so->so_rcv.sb_mb; 362*40117Smckusick } else { 363*40117Smckusick MFREE(m, so->so_rcv.sb_mb); 364*40117Smckusick m = so->so_rcv.sb_mb; 365*40117Smckusick } 366*40117Smckusick /* Drop control mbuf's */ 367*40117Smckusick if (m && m->m_type == MT_RIGHTS) 368*40117Smckusick panic("nfs_dgreceive 2"); 369*40117Smckusick if (m && m->m_type == MT_CONTROL) { 370*40117Smckusick sbfree(&so->so_rcv, m); 371*40117Smckusick MFREE(m, so->so_rcv.sb_mb); 372*40117Smckusick m = so->so_rcv.sb_mb; 373*40117Smckusick } 374*40117Smckusick /* Dequeue packet from sockbuf */ 375*40117Smckusick *mp = m; 376*40117Smckusick while (m) { 377*40117Smckusick if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 378*40117Smckusick panic("nfs_dgreceive 3"); 379*40117Smckusick sbfree(&so->so_rcv, m); 380*40117Smckusick m = so->so_rcv.sb_mb = m->m_next; 381*40117Smckusick } 382*40117Smckusick so->so_rcv.sb_mb = nextrecord; 383*40117Smckusick /* Return */ 384*40117Smckusick break; 38538414Smckusick } 38638414Smckusick sbunlock(&so->so_rcv); 38738414Smckusick splx(s); 38838414Smckusick return (error); 38938414Smckusick } 39038414Smckusick 39138414Smckusick struct rpc_replyhead { 
39238414Smckusick u_long r_xid; 39338414Smckusick u_long r_rep; 39438414Smckusick }; 39538414Smckusick 39638414Smckusick /* 397*40117Smckusick * Implement NFS client side datagram receive. 39838414Smckusick * We depend on the way that records are added to the sockbuf 39938414Smckusick * by sbappend*. In particular, each record (mbufs linked through m_next) 40038414Smckusick * must begin with an address, followed by optional MT_CONTROL mbuf 40138414Smckusick * and then zero or more mbufs of data. 40238414Smckusick * We must search through the list of received datagrams matching them 40338414Smckusick * with outstanding requests using the xid, until ours is found. 40438414Smckusick */ 405*40117Smckusick nfs_dgreply(so, mntp, myrep) 40638414Smckusick register struct socket *so; 40738414Smckusick struct nfsmount *mntp; 40839344Smckusick struct nfsreq *myrep; 40938414Smckusick { 41038414Smckusick register struct mbuf *m; 41138414Smckusick register struct nfsreq *rep; 41238414Smckusick register int error = 0, s; 413*40117Smckusick int logged = 0; 41438414Smckusick struct mbuf *nextrecord; 41538414Smckusick struct rpc_replyhead replyh; 41638414Smckusick 41738414Smckusick restart: 41839344Smckusick nfs_sblock(&so->so_rcv); 419*40117Smckusick s = splnet(); 420*40117Smckusick /* Already received and queued for us, bye bye */ 42139344Smckusick if (myrep->r_mrep != NULL) { 422*40117Smckusick error = 0; 423*40117Smckusick goto release; 42439344Smckusick } 425*40117Smckusick /* If we have run out of retries (hard mounts have bogus count) */ 426*40117Smckusick if (myrep->r_rexmit > myrep->r_retry) { 427*40117Smckusick error = ETIMEDOUT; 428*40117Smckusick nfsstats.rpctimeouts++; 429*40117Smckusick giveup: 430*40117Smckusick if (myrep->r_flags & R_TIMING) { 431*40117Smckusick myrep->r_flags &= ~R_TIMING; 432*40117Smckusick mntp->nm_rtt = -1; 433*40117Smckusick } 434*40117Smckusick if (myrep->r_flags & R_SENT) { 435*40117Smckusick myrep->r_flags &= ~R_SENT; 436*40117Smckusick 
--mntp->nm_hostinfo->nh_sent; 437*40117Smckusick /* If count now 0, want to initiate new req */ 438*40117Smckusick } 439*40117Smckusick goto release; 44039344Smckusick } 44138414Smckusick 44239344Smckusick m = so->so_rcv.sb_mb; 44339344Smckusick if (m == 0) { 44439344Smckusick if (so->so_rcv.sb_cc) 44539344Smckusick panic("nfs_soreply 1"); 446*40117Smckusick if (error = nfs_sockerr(so, 0)) { 44738414Smckusick so->so_error = 0; 448*40117Smckusick goto giveup; 44938414Smckusick } 450*40117Smckusick /* Allow signals to interrupt request? (nfs_timer wakes up) */ 451*40117Smckusick if ((mntp->nm_flag & NFSMNT_INT) && 452*40117Smckusick u.u_procp->p_sig & ~u.u_procp->p_sigmask) { 453*40117Smckusick error = EINTR; 454*40117Smckusick goto giveup; 455*40117Smckusick } 456*40117Smckusick if (mntp->nm_rexmit >= nfsrexmtthresh && logged++ == 0) 457*40117Smckusick uprintf("NFS server %s not responding, retrying\n", 458*40117Smckusick mntp->nm_host); 45938414Smckusick sbunlock(&so->so_rcv); 46038414Smckusick nfs_sbwait(&so->so_rcv); 46138414Smckusick splx(s); 46238414Smckusick goto restart; 46338414Smckusick } 46438414Smckusick 46538414Smckusick /* 46638414Smckusick * Take off the address, check for rights and ditch any control 46738414Smckusick * mbufs. 
46838414Smckusick */ 469*40117Smckusick nextrecord = m->m_nextpkt; 47038414Smckusick if (m->m_type != MT_SONAME) 47138414Smckusick panic("nfs reply SONAME"); 47238414Smckusick sbfree(&so->so_rcv, m); 47338414Smckusick MFREE(m, so->so_rcv.sb_mb); 47438414Smckusick m = so->so_rcv.sb_mb; 47538414Smckusick if (m && m->m_type == MT_RIGHTS) 47638414Smckusick panic("nfs reply RIGHTS"); 47738414Smckusick if (m && m->m_type == MT_CONTROL) { 47838414Smckusick sbfree(&so->so_rcv, m); 47938414Smckusick MFREE(m, so->so_rcv.sb_mb); 48038414Smckusick m = so->so_rcv.sb_mb; 48138414Smckusick } 48239344Smckusick if (m) { 48338414Smckusick m->m_nextpkt = nextrecord; 48439344Smckusick } else { 48539344Smckusick so->so_rcv.sb_mb = nextrecord; 48638414Smckusick sbunlock(&so->so_rcv); 48738414Smckusick splx(s); 48838414Smckusick goto restart; 48938414Smckusick } 49038414Smckusick 49138414Smckusick /* 49238414Smckusick * Get the xid and check that it is an rpc reply 49338414Smckusick */ 494*40117Smckusick if (m->m_len >= sizeof replyh) 495*40117Smckusick bcopy(mtod(m, caddr_t), (caddr_t)&replyh, sizeof replyh); 49638414Smckusick else { 497*40117Smckusick struct mbuf *mp = m; 498*40117Smckusick caddr_t cp = (caddr_t)&replyh; 499*40117Smckusick int cnt = sizeof replyh; 500*40117Smckusick do { 50138414Smckusick if (mp->m_len > 0) { 502*40117Smckusick int xfer = (mp->m_len >= cnt) ? 
cnt : mp->m_len; 50338414Smckusick bcopy(mtod(mp, caddr_t), cp, xfer); 50438414Smckusick cnt -= xfer; 50538414Smckusick cp += xfer; 50638414Smckusick } 50738414Smckusick if (cnt > 0) 50838414Smckusick mp = mp->m_next; 509*40117Smckusick } while (mp && cnt > 0); 510*40117Smckusick if (mp == NULL) { /* Insufficient length */ 511*40117Smckusick nfsstats.rpcinvalid++; 512*40117Smckusick goto dropit; 51338414Smckusick } 51438414Smckusick } 515*40117Smckusick if (replyh.r_rep != rpc_reply) { /* Not a reply */ 516*40117Smckusick nfsstats.rpcinvalid++; 51738414Smckusick goto dropit; 518*40117Smckusick } 51938414Smckusick /* 52038414Smckusick * Loop through the request list to match up the reply 521*40117Smckusick * If no match, just drop the datagram 52238414Smckusick */ 523*40117Smckusick if (rep = nfsreqh.r_next) { 524*40117Smckusick while (rep != &nfsreqh) { 525*40117Smckusick /* The socket, being connected, will only queue matches */ 526*40117Smckusick if (replyh.r_xid == rep->r_xid && so == rep->r_mntp->nm_so) { 52738414Smckusick /* Found it.. 
*/ 528*40117Smckusick if (rep->r_mrep) /* Already there - duplicate */ 529*40117Smckusick break; 53038414Smckusick rep->r_mrep = m; 53138414Smckusick while (m) { 53238414Smckusick if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 53338414Smckusick panic("nfs_soreply 3"); 53438414Smckusick sbfree(&so->so_rcv, m); 53538414Smckusick m = so->so_rcv.sb_mb = m->m_next; 53638414Smckusick } 53738414Smckusick so->so_rcv.sb_mb = nextrecord; 538*40117Smckusick if (rep->r_flags & R_TIMING) { 539*40117Smckusick nfs_updatetimer(mntp); 540*40117Smckusick rep->r_flags &= ~R_TIMING; 541*40117Smckusick mntp->nm_rtt = -1; /* re-arm timer */ 542*40117Smckusick } 543*40117Smckusick if (rep->r_flags & R_SENT) { 544*40117Smckusick rep->r_flags &= ~R_SENT; 545*40117Smckusick --mntp->nm_hostinfo->nh_sent; 546*40117Smckusick /* If count now 0, want to initiate new req */ 547*40117Smckusick } 548*40117Smckusick if (rep == myrep) { /* This is success */ 549*40117Smckusick if (logged) 550*40117Smckusick uprintf("NFS server %s responded\n", 551*40117Smckusick mntp->nm_host); 55238414Smckusick goto release; 553*40117Smckusick } 554*40117Smckusick /* Else wake up other sleeper and wait for next */ 555*40117Smckusick sbunlock(&so->so_rcv); 556*40117Smckusick sorwakeup(so); 557*40117Smckusick splx(s); 558*40117Smckusick goto restart; 55938414Smckusick } 56038414Smckusick rep = rep->r_next; 561*40117Smckusick } 56238414Smckusick } 563*40117Smckusick /* If not matched to request, drop it */ 564*40117Smckusick nfsstats.rpcunexpected++; 56538414Smckusick dropit: 566*40117Smckusick sbdroprecord(&so->so_rcv); 56738414Smckusick sbunlock(&so->so_rcv); 56838414Smckusick splx(s); 56938414Smckusick goto restart; 570*40117Smckusick 57138414Smckusick release: 57238414Smckusick sbunlock(&so->so_rcv); 57338414Smckusick splx(s); 57438414Smckusick return (error); 57538414Smckusick } 57638414Smckusick 57738414Smckusick /* 57838414Smckusick * nfs_request - goes something like this 57938414Smckusick * - fill in 
request struct 58038414Smckusick * - links it into list 58138414Smckusick * - calls nfs_sosend() for first transmit 58238414Smckusick * - calls nfs_soreceive() to get reply 58338414Smckusick * - break down rpc header and return with nfs reply pointed to 58438414Smckusick * by mrep or error 58538414Smckusick * nb: always frees up mreq mbuf list 58638414Smckusick */ 587*40117Smckusick nfs_request(vp, mreq, xid, idem, mp, mrp, mdp, dposp) 58838414Smckusick struct vnode *vp; 58938414Smckusick struct mbuf *mreq; 59038414Smckusick u_long xid; 591*40117Smckusick int idem; 59238414Smckusick struct mount *mp; 59338414Smckusick struct mbuf **mrp; 59438414Smckusick struct mbuf **mdp; 59538414Smckusick caddr_t *dposp; 59638414Smckusick { 59738414Smckusick register struct mbuf *m, *mrep; 59838414Smckusick register struct nfsreq *rep; 59938414Smckusick register u_long *p; 60038414Smckusick register int len; 60138414Smckusick struct nfsmount *mntp; 60238414Smckusick struct mbuf *md; 60339344Smckusick struct nfsreq *reph; 60438414Smckusick caddr_t dpos; 60538414Smckusick char *cp2; 60638414Smckusick int t1; 60738414Smckusick int s; 60838414Smckusick int error; 60938414Smckusick 61038414Smckusick mntp = vfs_to_nfs(mp); 61138414Smckusick m = mreq; 61238414Smckusick MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); 61338414Smckusick rep->r_xid = xid; 61438414Smckusick rep->r_mntp = mntp; 61538414Smckusick rep->r_vp = vp; 61638414Smckusick if (mntp->nm_flag & NFSMNT_SOFT) 617*40117Smckusick rep->r_retry = mntp->nm_retry; 61838414Smckusick else 619*40117Smckusick rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ 620*40117Smckusick rep->r_flags = rep->r_rexmit = 0; 621*40117Smckusick /* Idempotency: add N * MINTIMEO to requests if not, else use 0 */ 622*40117Smckusick rep->r_timer = rep->r_timerinit = -(idem * NFS_MINTIMEO); 62338414Smckusick rep->r_mrep = NULL; 62438414Smckusick rep->r_mreq = m; 62538414Smckusick len = 0; 62638414Smckusick while (m) { 
62738414Smckusick len += m->m_len; 62838414Smckusick m = m->m_next; 62938414Smckusick } 63038414Smckusick rep->r_msiz = len; 63138414Smckusick 632*40117Smckusick /* 633*40117Smckusick * Do the client side RPC. 634*40117Smckusick */ 635*40117Smckusick nfsstats.rpcrequests++; 636*40117Smckusick s = splnet(); 637*40117Smckusick /* Chain request into list of outstanding requests. Be sure 638*40117Smckusick * to put it LAST so timer finds oldest requests first. */ 63939344Smckusick reph = &nfsreqh; 64039344Smckusick if (reph->r_prev == NULL) { 64139344Smckusick reph->r_next = rep; 64239344Smckusick rep->r_prev = reph; 64339344Smckusick } else { 64439344Smckusick reph->r_prev->r_next = rep; 64539344Smckusick rep->r_prev = reph->r_prev; 64639344Smckusick } 64739344Smckusick reph->r_prev = rep; 64839344Smckusick rep->r_next = reph; 649*40117Smckusick /* 650*40117Smckusick * If backing off another request or avoiding congestion, don't 651*40117Smckusick * send this one now but let timer do it. If not timing a request, 652*40117Smckusick * do it now. 653*40117Smckusick */ 654*40117Smckusick if (mntp->nm_hostinfo->nh_sent > 0 && 655*40117Smckusick (mntp->nm_hostinfo->nh_currexmit != 0 || 656*40117Smckusick mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window)) { 657*40117Smckusick splx(s); 658*40117Smckusick goto skipsend; 659*40117Smckusick } 660*40117Smckusick ++mntp->nm_hostinfo->nh_sent; /* Inconsistent if can't NFSMCOPY */ 661*40117Smckusick rep->r_flags |= R_SENT; /* But not a catastrophe */ 662*40117Smckusick if (mntp->nm_rtt == -1) { 663*40117Smckusick mntp->nm_rtt = 0; 664*40117Smckusick rep->r_flags |= R_TIMING; 665*40117Smckusick } 66638414Smckusick splx(s); 66738414Smckusick 66838414Smckusick /* 669*40117Smckusick * If we can get a packet to send, send it off... 
67038414Smckusick * otherwise the timer will retransmit later 67138414Smckusick */ 672*40117Smckusick m = NFSMCOPY(mreq, 0, M_COPYALL, M_WAIT); 67338414Smckusick if (m != NULL) 674*40117Smckusick (void) nfs_send(mntp->nm_so, (struct mbuf *)0, m, 0, len); 675*40117Smckusick /* 676*40117Smckusick * Wait for the reply from our send or the timer's. 677*40117Smckusick */ 678*40117Smckusick skipsend: 679*40117Smckusick error = nfs_dgreply(mntp->nm_so, mntp, rep); 68038414Smckusick 681*40117Smckusick /* 682*40117Smckusick * RPC done, unlink the request. 683*40117Smckusick */ 68438414Smckusick s = splnet(); 68538414Smckusick rep->r_prev->r_next = rep->r_next; 68639344Smckusick rep->r_next->r_prev = rep->r_prev; 68738414Smckusick splx(s); 68838414Smckusick m_freem(rep->r_mreq); 68938414Smckusick mrep = md = rep->r_mrep; 69038414Smckusick FREE((caddr_t)rep, M_NFSREQ); 69138414Smckusick if (error) 69238414Smckusick return (error); 69338414Smckusick 69438414Smckusick /* 69538414Smckusick * break down the rpc header and check if ok 69638414Smckusick */ 69738414Smckusick dpos = mtod(md, caddr_t); 69838414Smckusick nfsm_disect(p, u_long *, 5*NFSX_UNSIGNED); 69938414Smckusick p += 2; 70038414Smckusick if (*p++ == rpc_msgdenied) { 70138414Smckusick if (*p == rpc_mismatch) 70238414Smckusick error = EOPNOTSUPP; 70338414Smckusick else 70438414Smckusick error = EACCES; 70538414Smckusick m_freem(mrep); 70638414Smckusick return (error); 70738414Smckusick } 70838414Smckusick /* 70938414Smckusick * skip over the auth_verf, someday we may want to cache auth_short's 71038414Smckusick * for nfs_reqhead(), but for now just dump it 71138414Smckusick */ 71238414Smckusick if (*++p != 0) { 71338414Smckusick len = nfsm_rndup(fxdr_unsigned(long, *p)); 71438414Smckusick nfsm_adv(len); 71538414Smckusick } 71638414Smckusick nfsm_disect(p, u_long *, NFSX_UNSIGNED); 71738414Smckusick /* 0 == ok */ 71838414Smckusick if (*p == 0) { 71938414Smckusick nfsm_disect(p, u_long *, NFSX_UNSIGNED); 
72038414Smckusick if (*p != 0) { 72138414Smckusick error = fxdr_unsigned(int, *p); 72238414Smckusick m_freem(mrep); 72338414Smckusick return (error); 72438414Smckusick } 72538414Smckusick *mrp = mrep; 72638414Smckusick *mdp = md; 72738414Smckusick *dposp = dpos; 72838414Smckusick return (0); 72938414Smckusick } 73038414Smckusick m_freem(mrep); 73138414Smckusick return (EPROTONOSUPPORT); 73238414Smckusick nfsmout: 73338414Smckusick return (error); 73438414Smckusick } 73538414Smckusick 73638414Smckusick /* 73738414Smckusick * Get a request for the server main loop 73838414Smckusick * - receive a request via. nfs_soreceive() 73938414Smckusick * - verify it 74038414Smckusick * - fill in the cred struct. 74138414Smckusick */ 74239754Smckusick nfs_getreq(so, prog, vers, maxproc, nam, mrp, mdp, dposp, retxid, proc, cr, 74339754Smckusick msk, mtch) 74438414Smckusick struct socket *so; 74538414Smckusick u_long prog; 74638414Smckusick u_long vers; 74738414Smckusick int maxproc; 74838414Smckusick struct mbuf **nam; 74938414Smckusick struct mbuf **mrp; 75038414Smckusick struct mbuf **mdp; 75138414Smckusick caddr_t *dposp; 75238414Smckusick u_long *retxid; 75338414Smckusick u_long *proc; 75438414Smckusick register struct ucred *cr; 75539754Smckusick u_long msk; 75639754Smckusick u_long mtch; 75738414Smckusick { 75838414Smckusick register int i; 75939494Smckusick register u_long *p; 76039494Smckusick register long t1; 76139494Smckusick caddr_t dpos, cp2; 76239494Smckusick int error = 0; 76339494Smckusick struct mbuf *mrep, *md; 76439494Smckusick int len; 76538414Smckusick 766*40117Smckusick if (error = nfs_dgreceive(so, msk, mtch, nam, &mrep)) 76738414Smckusick return (error); 76838414Smckusick md = mrep; 76938414Smckusick dpos = mtod(mrep, caddr_t); 77038414Smckusick nfsm_disect(p, u_long *, 10*NFSX_UNSIGNED); 77138414Smckusick *retxid = *p++; 77238414Smckusick if (*p++ != rpc_call) { 77338414Smckusick m_freem(mrep); 77438414Smckusick return (ERPCMISMATCH); 77538414Smckusick } 
77638414Smckusick if (*p++ != rpc_vers) { 77738414Smckusick m_freem(mrep); 77838414Smckusick return (ERPCMISMATCH); 77938414Smckusick } 78038414Smckusick if (*p++ != prog) { 78138414Smckusick m_freem(mrep); 78238414Smckusick return (EPROGUNAVAIL); 78338414Smckusick } 78438414Smckusick if (*p++ != vers) { 78538414Smckusick m_freem(mrep); 78638414Smckusick return (EPROGMISMATCH); 78738414Smckusick } 78838414Smckusick *proc = fxdr_unsigned(u_long, *p++); 78938414Smckusick if (*proc == NFSPROC_NULL) { 79038414Smckusick *mrp = mrep; 79138414Smckusick return (0); 79238414Smckusick } 79338414Smckusick if (*proc > maxproc || *p++ != rpc_auth_unix) { 79438414Smckusick m_freem(mrep); 79538414Smckusick return (EPROCUNAVAIL); 79638414Smckusick } 79739494Smckusick (void) fxdr_unsigned(int, *p++); 79839494Smckusick len = fxdr_unsigned(int, *++p); 79939494Smckusick nfsm_adv(nfsm_rndup(len)); 80038414Smckusick nfsm_disect(p, u_long *, 3*NFSX_UNSIGNED); 80138414Smckusick cr->cr_uid = fxdr_unsigned(uid_t, *p++); 80238414Smckusick cr->cr_gid = fxdr_unsigned(gid_t, *p++); 80339494Smckusick len = fxdr_unsigned(int, *p); 80439494Smckusick if (len > 10) { 80538414Smckusick m_freem(mrep); 80638414Smckusick return (EBADRPC); 80738414Smckusick } 80839494Smckusick nfsm_disect(p, u_long *, (len + 2)*NFSX_UNSIGNED); 80939494Smckusick for (i = 1; i <= len; i++) 81038414Smckusick cr->cr_groups[i] = fxdr_unsigned(gid_t, *p++); 81139494Smckusick cr->cr_ngroups = len + 1; 81238414Smckusick /* 81338414Smckusick * Do we have any use for the verifier. 81438414Smckusick * According to the "Remote Procedure Call Protocol Spec." it 81538414Smckusick * should be AUTH_NULL, but some clients make it AUTH_UNIX? 
81638414Smckusick * For now, just skip over it 81738414Smckusick */ 81839494Smckusick len = fxdr_unsigned(int, *++p); 81939494Smckusick if (len > 0) 82039494Smckusick nfsm_adv(nfsm_rndup(len)); 82138414Smckusick *mrp = mrep; 82238414Smckusick *mdp = md; 82338414Smckusick *dposp = dpos; 82438414Smckusick return (0); 82538414Smckusick nfsmout: 82638414Smckusick return (error); 82738414Smckusick } 82838414Smckusick 82938414Smckusick /* 83038414Smckusick * Generate the rpc reply header 83138414Smckusick * siz arg. is used to decide if adding a cluster is worthwhile 83238414Smckusick */ 83338414Smckusick nfs_rephead(siz, retxid, err, mrq, mbp, bposp) 83438414Smckusick int siz; 83538414Smckusick u_long retxid; 83638414Smckusick int err; 83738414Smckusick struct mbuf **mrq; 83838414Smckusick struct mbuf **mbp; 83938414Smckusick caddr_t *bposp; 84038414Smckusick { 84139494Smckusick register u_long *p; 84239494Smckusick register long t1; 84339494Smckusick caddr_t bpos; 84439494Smckusick struct mbuf *mreq, *mb, *mb2; 84538414Smckusick 84638414Smckusick NFSMGETHDR(mreq); 84738414Smckusick mb = mreq; 84838414Smckusick if ((siz+RPC_REPLYSIZ) > MHLEN) 84938414Smckusick NFSMCLGET(mreq, M_WAIT); 85038414Smckusick p = mtod(mreq, u_long *); 85138414Smckusick mreq->m_len = 6*NFSX_UNSIGNED; 85238414Smckusick bpos = ((caddr_t)p)+mreq->m_len; 85338414Smckusick *p++ = retxid; 85438414Smckusick *p++ = rpc_reply; 85538414Smckusick if (err == ERPCMISMATCH) { 85638414Smckusick *p++ = rpc_msgdenied; 85738414Smckusick *p++ = rpc_mismatch; 85838414Smckusick *p++ = txdr_unsigned(2); 85938414Smckusick *p = txdr_unsigned(2); 86038414Smckusick } else { 86138414Smckusick *p++ = rpc_msgaccepted; 86238414Smckusick *p++ = 0; 86338414Smckusick *p++ = 0; 86438414Smckusick switch (err) { 86538414Smckusick case EPROGUNAVAIL: 86638414Smckusick *p = txdr_unsigned(RPC_PROGUNAVAIL); 86738414Smckusick break; 86838414Smckusick case EPROGMISMATCH: 86938414Smckusick *p = txdr_unsigned(RPC_PROGMISMATCH); 
87038414Smckusick nfsm_build(p, u_long *, 2*NFSX_UNSIGNED); 87138414Smckusick *p++ = txdr_unsigned(2); 87238414Smckusick *p = txdr_unsigned(2); /* someday 3 */ 87338414Smckusick break; 87438414Smckusick case EPROCUNAVAIL: 87538414Smckusick *p = txdr_unsigned(RPC_PROCUNAVAIL); 87638414Smckusick break; 87738414Smckusick default: 87838414Smckusick *p = 0; 87938414Smckusick if (err != VNOVAL) { 88038414Smckusick nfsm_build(p, u_long *, NFSX_UNSIGNED); 88138414Smckusick *p = txdr_unsigned(err); 88238414Smckusick } 88338414Smckusick break; 88438414Smckusick }; 88538414Smckusick } 88638414Smckusick *mrq = mreq; 88738414Smckusick *mbp = mb; 88838414Smckusick *bposp = bpos; 88938414Smckusick if (err != 0 && err != VNOVAL) 89038414Smckusick nfsstats.srvrpc_errs++; 89138414Smckusick return (0); 89238414Smckusick } 89338414Smckusick 89438414Smckusick /* 89538414Smckusick * Nfs timer routine 89638414Smckusick * Scan the nfsreq list and retranmit any requests that have timed out 89738414Smckusick * To avoid retransmission attempts on STREAM sockets (in the future) make 898*40117Smckusick * sure to set the r_retry field to 0 (implies nm_retry == 0). 
89938414Smckusick */ 90038414Smckusick nfs_timer() 90138414Smckusick { 90238414Smckusick register struct nfsreq *rep; 90338414Smckusick register struct mbuf *m; 90438414Smckusick register struct socket *so; 905*40117Smckusick register struct nfsmount *mntp; 906*40117Smckusick int s, error; 90738414Smckusick 90838414Smckusick s = splnet(); 90938414Smckusick rep = nfsreqh.r_next; 910*40117Smckusick if (rep) for ( ; rep != &nfsreqh; rep = rep->r_next) { 911*40117Smckusick mntp = rep->r_mntp; 912*40117Smckusick if (rep->r_flags & R_TIMING) /* update rtt in mount */ 913*40117Smckusick mntp->nm_rtt++; 914*40117Smckusick /* If not timed out or reply already received, skip */ 915*40117Smckusick if (++rep->r_timer < mntp->nm_rto || rep->r_mrep) 916*40117Smckusick continue; 917*40117Smckusick /* Do backoff and save new timeout in mount */ 918*40117Smckusick if (rep->r_flags & R_TIMING) { 919*40117Smckusick nfs_backofftimer(mntp); 920*40117Smckusick rep->r_flags &= ~R_TIMING; 921*40117Smckusick mntp->nm_rtt = -1; 922*40117Smckusick } 923*40117Smckusick if (rep->r_flags & R_SENT) { 924*40117Smckusick rep->r_flags &= ~R_SENT; 925*40117Smckusick --mntp->nm_hostinfo->nh_sent; 926*40117Smckusick } 927*40117Smckusick /* Check state of socket, cf nfs_send */ 928*40117Smckusick so = mntp->nm_so; 929*40117Smckusick if (error = nfs_sockerr(so, 1)) 930*40117Smckusick goto wakeup; 931*40117Smckusick if (sbspace(&so->so_snd) < rep->r_msiz) 932*40117Smckusick goto wakeup; 933*40117Smckusick /* Check for too many retries, cf nfs_dgreply */ 934*40117Smckusick if (++rep->r_rexmit > NFS_MAXREXMIT) /* clip */ 935*40117Smckusick rep->r_rexmit = NFS_MAXREXMIT; 936*40117Smckusick if (rep->r_rexmit > rep->r_retry) /* too many */ 937*40117Smckusick goto wakeup; 938*40117Smckusick /* Check for congestion control, cf nfs_request */ 939*40117Smckusick if (mntp->nm_hostinfo->nh_sent >= mntp->nm_hostinfo->nh_window) 940*40117Smckusick goto wakeup; 941*40117Smckusick /* Send it! 
*/ 942*40117Smckusick m = NFSMCOPY(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT); 943*40117Smckusick if (m == NULL) 944*40117Smckusick goto wakeup; 945*40117Smckusick nfsstats.rpcretries++; 94638414Smckusick #ifdef MGETHDR 947*40117Smckusick m->m_pkthdr.len = rep->r_msiz; 94838414Smckusick #endif 949*40117Smckusick (void)(*so->so_proto->pr_usrreq)(so, PRU_SEND, m, 950*40117Smckusick (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); 951*40117Smckusick 952*40117Smckusick /* We need to time the request even though we're 953*40117Smckusick * retransmitting, in order to maintain backoff. */ 954*40117Smckusick mntp->nm_rtt = 0; 955*40117Smckusick ++mntp->nm_hostinfo->nh_sent; 956*40117Smckusick rep->r_flags |= (R_SENT|R_TIMING); 957*40117Smckusick rep->r_timer = rep->r_timerinit; 958*40117Smckusick wakeup: 959*40117Smckusick /* If error or interruptible mount, give user a look */ 960*40117Smckusick if (error || (mntp->nm_flag & NFSMNT_INT)) 961*40117Smckusick sorwakeup(so); 962*40117Smckusick } 963*40117Smckusick splx(s); 964*40117Smckusick timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ); 965*40117Smckusick } 966*40117Smckusick 967*40117Smckusick /* 968*40117Smckusick * NFS timer update and backoff. The "Jacobson/Karels/Karn" scheme is 969*40117Smckusick * used here. The timer state is held in the nfsmount structure and 970*40117Smckusick * a single request is used to clock the response. When successful 971*40117Smckusick * the rtt smoothing in nfs_updatetimer is used, when failed the backoff 972*40117Smckusick * is done by nfs_backofftimer. We also log failure messages in these 973*40117Smckusick * routines. 974*40117Smckusick * 975*40117Smckusick * Congestion variables are held in the nfshost structure which 976*40117Smckusick * is referenced by nfsmounts and shared per-server. 
 * This separation
 * makes it possible to do per-mount timing which allows varying disk
 * access times to be dealt with, while preserving a network-oriented
 * congestion control scheme.
 *
 * The windowing implements the Jacobson/Karels slowstart algorithm
 * with adjusted scaling factors.  We start with one request, then send
 * 4 more after each success until the ssthresh limit is reached, then
 * we increment at a rate proportional to the window.  On failure, we
 * remember 3/4 of the current window and clamp the send limit to 1.  Note
 * that ICMP source quench is not reflected in so->so_error, so we ignore
 * it for now.
 *
 * NFS behaves much more like a transport protocol with these changes,
 * shedding the teenage pedal-to-the-metal tendencies of "other"
 * implementations.
 *
 * Timers and congestion avoidance by Tom Talpey, Open Software Foundation.
 */

/*
 * The TCP algorithm was not forgiving enough.  Because the NFS server
 * responds only after performing lookups/diskio/etc, we have to be
 * more prepared to accept a spiky variance.
The TCP algorithm is: 1000*40117Smckusick * TCP_RTO(mntp) ((((mntp)->nm_srtt >> 2) + (mntp)->nm_rttvar) >> 1) 1001*40117Smckusick */ 1002*40117Smckusick #define NFS_RTO(mntp) (((mntp)->nm_srtt >> 3) + (mntp)->nm_rttvar) 1003*40117Smckusick 1004*40117Smckusick nfs_updatetimer(mntp) 1005*40117Smckusick register struct nfsmount *mntp; 1006*40117Smckusick { 1007*40117Smckusick register struct nfshost *nfshp = mntp->nm_hostinfo; 1008*40117Smckusick 1009*40117Smckusick /* If retransmitted, clear and return */ 1010*40117Smckusick if (mntp->nm_rexmit || nfshp->nh_currexmit) { 1011*40117Smckusick if (nfshp->nh_currexmit >= nfsrexmtthresh) 1012*40117Smckusick nfs_log("NFS server %s OK\n", mntp->nm_host); 1013*40117Smckusick mntp->nm_rexmit = nfshp->nh_currexmit = 0; 1014*40117Smckusick return; 1015*40117Smckusick } 1016*40117Smckusick /* If have a measurement, do smoothing */ 1017*40117Smckusick if (mntp->nm_srtt) { 1018*40117Smckusick register short delta; 1019*40117Smckusick delta = mntp->nm_rtt - (mntp->nm_srtt >> 3); 1020*40117Smckusick if ((mntp->nm_srtt += delta) <= 0) 1021*40117Smckusick mntp->nm_srtt = 1; 1022*40117Smckusick if (delta < 0) 1023*40117Smckusick delta = -delta; 1024*40117Smckusick delta -= (mntp->nm_rttvar >> 2); 1025*40117Smckusick if ((mntp->nm_rttvar += delta) <= 0) 1026*40117Smckusick mntp->nm_rttvar = 1; 1027*40117Smckusick /* Else initialize */ 1028*40117Smckusick } else { 1029*40117Smckusick mntp->nm_rttvar = mntp->nm_rtt << 1; 1030*40117Smckusick if (mntp->nm_rttvar == 0) mntp->nm_rttvar = 2; 1031*40117Smckusick mntp->nm_srtt = mntp->nm_rttvar << 2; 1032*40117Smckusick } 1033*40117Smckusick /* Compute new Retransmission TimeOut and clip */ 1034*40117Smckusick mntp->nm_rto = NFS_RTO(mntp); 1035*40117Smckusick if (mntp->nm_rto < NFS_MINTIMEO) 1036*40117Smckusick mntp->nm_rto = NFS_MINTIMEO; 1037*40117Smckusick else if (mntp->nm_rto > NFS_MAXTIMEO) 1038*40117Smckusick mntp->nm_rto = NFS_MAXTIMEO; 1039*40117Smckusick nfshp->nh_currto = mntp->nm_rto; 
1040*40117Smckusick 1041*40117Smckusick /* Update window estimate */ 1042*40117Smckusick if (nfshp->nh_window < nfshp->nh_ssthresh) /* quickly */ 1043*40117Smckusick nfshp->nh_window += 4; 1044*40117Smckusick else { /* slowly */ 1045*40117Smckusick register long incr = ++nfshp->nh_winext; 1046*40117Smckusick incr = (incr * incr) / nfshp->nh_window; 1047*40117Smckusick if (incr > 0) { 1048*40117Smckusick nfshp->nh_winext = 0; 1049*40117Smckusick ++nfshp->nh_window; 1050*40117Smckusick } 1051*40117Smckusick } 1052*40117Smckusick if (nfshp->nh_window > NFS_MAXWINDOW) 1053*40117Smckusick nfshp->nh_window = NFS_MAXWINDOW; 1054*40117Smckusick } 1055*40117Smckusick 1056*40117Smckusick nfs_backofftimer(mntp) 1057*40117Smckusick register struct nfsmount *mntp; 1058*40117Smckusick { 1059*40117Smckusick register struct nfshost *nfshp = mntp->nm_hostinfo; 1060*40117Smckusick register unsigned long newrto; 1061*40117Smckusick 1062*40117Smckusick /* Clip shift count */ 1063*40117Smckusick if (++mntp->nm_rexmit > 8 * sizeof mntp->nm_rto) 1064*40117Smckusick mntp->nm_rexmit = 8 * sizeof mntp->nm_rto; 1065*40117Smckusick /* Back off RTO exponentially */ 1066*40117Smckusick newrto = NFS_RTO(mntp); 1067*40117Smckusick newrto <<= (mntp->nm_rexmit - 1); 1068*40117Smckusick if (newrto == 0 || newrto > NFS_MAXTIMEO) 1069*40117Smckusick newrto = NFS_MAXTIMEO; 1070*40117Smckusick mntp->nm_rto = nfshp->nh_currto = newrto; 1071*40117Smckusick 1072*40117Smckusick /* If too many retries, message, assume a bogus RTT and re-measure */ 1073*40117Smckusick if (nfshp->nh_currexmit < mntp->nm_rexmit) { 1074*40117Smckusick nfshp->nh_currexmit = mntp->nm_rexmit; 1075*40117Smckusick if (nfshp->nh_currexmit >= nfsrexmtthresh) { 1076*40117Smckusick if (nfshp->nh_currexmit == nfsrexmtthresh) { 1077*40117Smckusick nfs_log("NFS server %s not responding\n", 1078*40117Smckusick mntp->nm_host); 1079*40117Smckusick mntp->nm_rttvar += (mntp->nm_srtt >> 2); 1080*40117Smckusick mntp->nm_srtt = 0; 
108138414Smckusick } 1082*40117Smckusick /* The routing invalidation should be a usrreq PRU */ 1083*40117Smckusick if (mtod(nfshp->nh_sockaddr, 1084*40117Smckusick struct sockaddr *)->sa_family == AF_INET) 1085*40117Smckusick in_losing(mntp->nm_so->so_pcb); 108638414Smckusick } 108738414Smckusick } 1088*40117Smckusick /* Close down window but remember this point (3/4 current) for later */ 1089*40117Smckusick nfshp->nh_ssthresh = ((nfshp->nh_window << 1) + nfshp->nh_window) >> 2; 1090*40117Smckusick nfshp->nh_window = 1; 1091*40117Smckusick nfshp->nh_winext = 0; 109238414Smckusick } 109338414Smckusick 109438414Smckusick /* 1095*40117Smckusick * Not all errors are fatal. The closed checks deal 1096*40117Smckusick * with errors a little strangely. 109738414Smckusick */ 1098*40117Smckusick 1099*40117Smckusick nfs_sockerr(so, sending) 1100*40117Smckusick struct socket *so; 1101*40117Smckusick int sending; 110238414Smckusick { 1103*40117Smckusick if (sending && (so->so_state & SS_CANTSENDMORE)) { 1104*40117Smckusick so->so_error = EPIPE; 1105*40117Smckusick return (EPIPE); 1106*40117Smckusick } 1107*40117Smckusick 1108*40117Smckusick switch (so->so_error) { /* inhibit certain errors */ 1109*40117Smckusick case ENETDOWN: 1110*40117Smckusick case ENETUNREACH: 1111*40117Smckusick case EHOSTDOWN: 1112*40117Smckusick case EHOSTUNREACH: 1113*40117Smckusick so->so_error = 0; 1114*40117Smckusick case 0: 1115*40117Smckusick break; 1116*40117Smckusick default: /* return all others */ 1117*40117Smckusick printf("nfs_sockerr: error %d on %s\n", so->so_error, 1118*40117Smckusick sending?"send":"receive"); 1119*40117Smckusick return (so->so_error); 1120*40117Smckusick } 1121*40117Smckusick 1122*40117Smckusick if (!sending && (so->so_state & SS_CANTRCVMORE)) { 1123*40117Smckusick so->so_error = 0; /* (no error) */ 1124*40117Smckusick return (EPIPE); 1125*40117Smckusick } 1126*40117Smckusick return (so->so_error); 112738414Smckusick } 1128