1 /* 2 * Copyright (c) 1989, 1991 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Rick Macklem at The University of Guelph. 7 * 8 * %sccs.include.redist.c% 9 * 10 * @(#)nfs_socket.c 7.37 (Berkeley) 09/16/92 11 */ 12 13 /* 14 * Socket operations for use by nfs 15 */ 16 17 #include <sys/param.h> 18 #include <sys/systm.h> 19 #include <sys/proc.h> 20 #include <sys/mount.h> 21 #include <sys/kernel.h> 22 #include <sys/mbuf.h> 23 #include <sys/vnode.h> 24 #include <sys/domain.h> 25 #include <sys/protosw.h> 26 #include <sys/socket.h> 27 #include <sys/socketvar.h> 28 #include <sys/syslog.h> 29 #include <sys/tprintf.h> 30 #include <netinet/in.h> 31 #include <netinet/tcp.h> 32 #include <nfs/rpcv2.h> 33 #include <nfs/nfsv2.h> 34 #include <nfs/nfs.h> 35 #include <nfs/xdr_subs.h> 36 #include <nfs/nfsm_subs.h> 37 #include <nfs/nfsmount.h> 38 #include <nfs/nfsnode.h> 39 #include <nfs/nfsrtt.h> 40 #include <nfs/nqnfs.h> 41 42 #define TRUE 1 43 #define FALSE 0 44 45 /* 46 * Estimate rto for an nfs rpc sent via. an unreliable datagram. 47 * Use the mean and mean deviation of rtt for the appropriate type of rpc 48 * for the frequent rpcs and a default for the others. 49 * The justification for doing "other" this way is that these rpcs 50 * happen so infrequently that timer est. would probably be stale. 51 * Also, since many of these rpcs are 52 * non-idempotent, a conservative timeout is desired. 53 * getattr, lookup - A+2D 54 * read, write - A+4D 55 * other - nm_timeo 56 */ 57 #define NFS_RTO(n, t) \ 58 ((t) == 0 ? (n)->nm_timeo : \ 59 ((t) < 3 ? 
\ 60 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \ 61 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1))) 62 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1] 63 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1] 64 /* 65 * External data, mostly RPC constants in XDR form 66 */ 67 extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, 68 rpc_msgaccepted, rpc_call, rpc_autherr, rpc_rejectedcred, 69 rpc_auth_kerb; 70 extern u_long nfs_prog, nfs_vers, nqnfs_prog, nqnfs_vers; 71 extern time_t nqnfsstarttime; 72 extern int nonidempotent[NFS_NPROCS]; 73 74 /* 75 * Maps errno values to nfs error numbers. 76 * Use NFSERR_IO as the catch all for ones not specifically defined in 77 * RFC 1094. 78 */ 79 static int nfsrv_errmap[ELAST] = { 80 NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, 81 NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 82 NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, 83 NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, 84 NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 85 NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, 86 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 87 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 88 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 89 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 90 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 91 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 92 NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, 93 NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, 94 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 95 NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, 96 NFSERR_IO, 97 }; 98 99 /* 100 * Defines which timer to use for the procnum. 
101 * 0 - default 102 * 1 - getattr 103 * 2 - lookup 104 * 3 - read 105 * 4 - write 106 */ 107 static int proct[NFS_NPROCS] = { 108 0, 1, 0, 0, 2, 3, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 109 }; 110 111 /* 112 * There is a congestion window for outstanding rpcs maintained per mount 113 * point. The cwnd size is adjusted in roughly the way that: 114 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of 115 * SIGCOMM '88". ACM, August 1988. 116 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout 117 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd 118 * of rpcs is in progress. 119 * (The sent count and cwnd are scaled for integer arith.) 120 * Variants of "slow start" were tried and were found to be too much of a 121 * performance hit (ave. rtt 3 times larger), 122 * I suspect due to the large rtt that nfs rpcs have. 123 */ 124 #define NFS_CWNDSCALE 256 125 #define NFS_MAXCWND (NFS_CWNDSCALE * 32) 126 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; 127 int nfs_sbwait(); 128 void nfs_disconnect(), nfs_realign(), nfsrv_wakenfsd(), nfs_sndunlock(); 129 void nfs_rcvunlock(), nqnfs_serverd(), nqnfs_clientlease(); 130 struct mbuf *nfsm_rpchead(); 131 int nfsrtton = 0; 132 struct nfsrtt nfsrtt; 133 struct nfsd nfsd_head; 134 135 int nfsrv_null(), 136 nfsrv_getattr(), 137 nfsrv_setattr(), 138 nfsrv_lookup(), 139 nfsrv_readlink(), 140 nfsrv_read(), 141 nfsrv_write(), 142 nfsrv_create(), 143 nfsrv_remove(), 144 nfsrv_rename(), 145 nfsrv_link(), 146 nfsrv_symlink(), 147 nfsrv_mkdir(), 148 nfsrv_rmdir(), 149 nfsrv_readdir(), 150 nfsrv_statfs(), 151 nfsrv_noop(), 152 nqnfsrv_readdirlook(), 153 nqnfsrv_getlease(), 154 nqnfsrv_vacated(); 155 156 int (*nfsrv_procs[NFS_NPROCS])() = { 157 nfsrv_null, 158 nfsrv_getattr, 159 nfsrv_setattr, 160 nfsrv_noop, 161 nfsrv_lookup, 162 nfsrv_readlink, 163 nfsrv_read, 164 nfsrv_noop, 165 nfsrv_write, 166 nfsrv_create, 167 nfsrv_remove, 168 nfsrv_rename, 
169 nfsrv_link, 170 nfsrv_symlink, 171 nfsrv_mkdir, 172 nfsrv_rmdir, 173 nfsrv_readdir, 174 nfsrv_statfs, 175 nqnfsrv_readdirlook, 176 nqnfsrv_getlease, 177 nqnfsrv_vacated, 178 }; 179 180 struct nfsreq nfsreqh; 181 182 /* 183 * Initialize sockets and congestion for a new NFS connection. 184 * We do not free the sockaddr if error. 185 */ 186 nfs_connect(nmp, rep) 187 register struct nfsmount *nmp; 188 struct nfsreq *rep; 189 { 190 register struct socket *so; 191 int s, error, rcvreserve, sndreserve; 192 struct sockaddr *saddr; 193 struct sockaddr_in *sin; 194 struct mbuf *m; 195 u_short tport; 196 197 nmp->nm_so = (struct socket *)0; 198 saddr = mtod(nmp->nm_nam, struct sockaddr *); 199 if (error = socreate(saddr->sa_family, 200 &nmp->nm_so, nmp->nm_sotype, nmp->nm_soproto)) 201 goto bad; 202 so = nmp->nm_so; 203 nmp->nm_soflags = so->so_proto->pr_flags; 204 205 /* 206 * Some servers require that the client port be a reserved port number. 207 */ 208 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { 209 MGET(m, M_WAIT, MT_SONAME); 210 sin = mtod(m, struct sockaddr_in *); 211 sin->sin_len = m->m_len = sizeof (struct sockaddr_in); 212 sin->sin_family = AF_INET; 213 sin->sin_addr.s_addr = INADDR_ANY; 214 tport = IPPORT_RESERVED - 1; 215 sin->sin_port = htons(tport); 216 while ((error = sobind(so, m)) == EADDRINUSE && 217 --tport > IPPORT_RESERVED / 2) 218 sin->sin_port = htons(tport); 219 m_freem(m); 220 if (error) 221 goto bad; 222 } 223 224 /* 225 * Protocols that do not require connections may be optionally left 226 * unconnected for servers that reply from a port other than NFS_PORT. 227 */ 228 if (nmp->nm_flag & NFSMNT_NOCONN) { 229 if (nmp->nm_soflags & PR_CONNREQUIRED) { 230 error = ENOTCONN; 231 goto bad; 232 } 233 } else { 234 if (error = soconnect(so, nmp->nm_nam)) 235 goto bad; 236 237 /* 238 * Wait for the connection to complete. 
Cribbed from the 239 * connect system call but with the wait timing out so 240 * that interruptible mounts don't hang here for a long time. 241 */ 242 s = splnet(); 243 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 244 (void) tsleep((caddr_t)&so->so_timeo, PSOCK, 245 "nfscon", 2 * hz); 246 if ((so->so_state & SS_ISCONNECTING) && 247 so->so_error == 0 && rep && 248 (error = nfs_sigintr(nmp, rep, rep->r_procp))) { 249 so->so_state &= ~SS_ISCONNECTING; 250 splx(s); 251 goto bad; 252 } 253 } 254 if (so->so_error) { 255 error = so->so_error; 256 so->so_error = 0; 257 splx(s); 258 goto bad; 259 } 260 splx(s); 261 } 262 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) { 263 so->so_rcv.sb_timeo = (5 * hz); 264 so->so_snd.sb_timeo = (5 * hz); 265 } else { 266 so->so_rcv.sb_timeo = 0; 267 so->so_snd.sb_timeo = 0; 268 } 269 if (nmp->nm_sotype == SOCK_DGRAM) { 270 sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR; 271 rcvreserve = nmp->nm_rsize + NFS_MAXPKTHDR; 272 } else if (nmp->nm_sotype == SOCK_SEQPACKET) { 273 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; 274 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; 275 } else { 276 if (nmp->nm_sotype != SOCK_STREAM) 277 panic("nfscon sotype"); 278 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 279 MGET(m, M_WAIT, MT_SOOPTS); 280 *mtod(m, int *) = 1; 281 m->m_len = sizeof(int); 282 sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); 283 } 284 if (so->so_proto->pr_protocol == IPPROTO_TCP) { 285 MGET(m, M_WAIT, MT_SOOPTS); 286 *mtod(m, int *) = 1; 287 m->m_len = sizeof(int); 288 sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); 289 } 290 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) 291 * 2; 292 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) 293 * 2; 294 } 295 if (error = soreserve(so, sndreserve, rcvreserve)) 296 goto bad; 297 so->so_rcv.sb_flags |= SB_NOINTR; 298 so->so_snd.sb_flags |= SB_NOINTR; 299 300 /* Initialize other non-zero congestion variables */ 301 nmp->nm_srtt[0] = nmp->nm_srtt[1] = 
nmp->nm_srtt[2] = nmp->nm_srtt[3] = 302 nmp->nm_srtt[4] = (NFS_TIMEO << 3); 303 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = 304 nmp->nm_sdrtt[3] = nmp->nm_sdrtt[4] = 0; 305 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ 306 nmp->nm_sent = 0; 307 nmp->nm_timeouts = 0; 308 return (0); 309 310 bad: 311 nfs_disconnect(nmp); 312 return (error); 313 } 314 315 /* 316 * Reconnect routine: 317 * Called when a connection is broken on a reliable protocol. 318 * - clean up the old socket 319 * - nfs_connect() again 320 * - set R_MUSTRESEND for all outstanding requests on mount point 321 * If this fails the mount point is DEAD! 322 * nb: Must be called with the nfs_sndlock() set on the mount point. 323 */ 324 nfs_reconnect(rep) 325 register struct nfsreq *rep; 326 { 327 register struct nfsreq *rp; 328 register struct nfsmount *nmp = rep->r_nmp; 329 int error; 330 331 nfs_disconnect(nmp); 332 while (error = nfs_connect(nmp, rep)) { 333 if (error == EINTR || error == ERESTART) 334 return (EINTR); 335 (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); 336 } 337 338 /* 339 * Loop through outstanding request list and fix up all requests 340 * on old socket. 341 */ 342 rp = nfsreqh.r_next; 343 while (rp != &nfsreqh) { 344 if (rp->r_nmp == nmp) 345 rp->r_flags |= R_MUSTRESEND; 346 rp = rp->r_next; 347 } 348 return (0); 349 } 350 351 /* 352 * NFS disconnect. Clean up and unlink. 353 */ 354 void 355 nfs_disconnect(nmp) 356 register struct nfsmount *nmp; 357 { 358 register struct socket *so; 359 360 if (nmp->nm_so) { 361 so = nmp->nm_so; 362 nmp->nm_so = (struct socket *)0; 363 soshutdown(so, 2); 364 soclose(so); 365 } 366 } 367 368 /* 369 * This is the nfs send routine. For connection based socket types, it 370 * must be called with an nfs_sndlock() on the socket. 371 * "rep == NULL" indicates that it has been called from a server. 
372 * For the client side: 373 * - return EINTR if the RPC is terminated, 0 otherwise 374 * - set R_MUSTRESEND if the send fails for any reason 375 * - do any cleanup required by recoverable socket errors (???) 376 * For the server side: 377 * - return EINTR or ERESTART if interrupted by a signal 378 * - return EPIPE if a connection is lost for connection based sockets (TCP...) 379 * - do any cleanup required by recoverable socket errors (???) 380 */ 381 nfs_send(so, nam, top, rep) 382 register struct socket *so; 383 struct mbuf *nam; 384 register struct mbuf *top; 385 struct nfsreq *rep; 386 { 387 struct mbuf *sendnam; 388 int error, soflags, flags; 389 390 if (rep) { 391 if (rep->r_flags & R_SOFTTERM) { 392 m_freem(top); 393 return (EINTR); 394 } 395 if ((so = rep->r_nmp->nm_so) == NULL) { 396 rep->r_flags |= R_MUSTRESEND; 397 m_freem(top); 398 return (0); 399 } 400 rep->r_flags &= ~R_MUSTRESEND; 401 soflags = rep->r_nmp->nm_soflags; 402 } else 403 soflags = so->so_proto->pr_flags; 404 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) 405 sendnam = (struct mbuf *)0; 406 else 407 sendnam = nam; 408 if (so->so_type == SOCK_SEQPACKET) 409 flags = MSG_EOR; 410 else 411 flags = 0; 412 413 error = sosend(so, sendnam, (struct uio *)0, top, 414 (struct mbuf *)0, flags); 415 if (error) { 416 if (rep) { 417 log(LOG_INFO, "nfs send error %d for server %s\n",error, 418 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 419 /* 420 * Deal with errors for the client side. 421 */ 422 if (rep->r_flags & R_SOFTTERM) 423 error = EINTR; 424 else 425 rep->r_flags |= R_MUSTRESEND; 426 } else 427 log(LOG_INFO, "nfsd send error %d\n", error); 428 429 /* 430 * Handle any recoverable (soft) socket errors here. (???) 431 */ 432 if (error != EINTR && error != ERESTART && 433 error != EWOULDBLOCK && error != EPIPE) 434 error = 0; 435 } 436 return (error); 437 } 438 439 /* 440 * Receive a Sun RPC Request/Reply. 
For SOCK_DGRAM, the work is all 441 * done by soreceive(), but for SOCK_STREAM we must deal with the Record 442 * Mark and consolidate the data into a new mbuf list. 443 * nb: Sometimes TCP passes the data up to soreceive() in long lists of 444 * small mbufs. 445 * For SOCK_STREAM we must be very careful to read an entire record once 446 * we have read any of it, even if the system call has been interrupted. 447 */ 448 nfs_receive(rep, aname, mp) 449 register struct nfsreq *rep; 450 struct mbuf **aname; 451 struct mbuf **mp; 452 { 453 register struct socket *so; 454 struct uio auio; 455 struct iovec aio; 456 register struct mbuf *m; 457 struct mbuf *control; 458 u_long len; 459 struct mbuf **getnam; 460 int error, sotype, rcvflg; 461 struct proc *p = curproc; /* XXX */ 462 463 /* 464 * Set up arguments for soreceive() 465 */ 466 *mp = (struct mbuf *)0; 467 *aname = (struct mbuf *)0; 468 sotype = rep->r_nmp->nm_sotype; 469 470 /* 471 * For reliable protocols, lock against other senders/receivers 472 * in case a reconnect is necessary. 473 * For SOCK_STREAM, first get the Record Mark to find out how much 474 * more there is to get. 475 * We must lock the socket against other receivers 476 * until we have an entire rpc request/reply. 477 */ 478 if (sotype != SOCK_DGRAM) { 479 if (error = nfs_sndlock(&rep->r_nmp->nm_flag, rep)) 480 return (error); 481 tryagain: 482 /* 483 * Check for fatal errors and resending request. 484 */ 485 /* 486 * Ugh: If a reconnect attempt just happened, nm_so 487 * would have changed. NULL indicates a failed 488 * attempt that has essentially shut down this 489 * mount point. 
490 */ 491 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { 492 nfs_sndunlock(&rep->r_nmp->nm_flag); 493 return (EINTR); 494 } 495 if ((so = rep->r_nmp->nm_so) == NULL) { 496 if (error = nfs_reconnect(rep)) { 497 nfs_sndunlock(&rep->r_nmp->nm_flag); 498 return (error); 499 } 500 goto tryagain; 501 } 502 while (rep->r_flags & R_MUSTRESEND) { 503 m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); 504 nfsstats.rpcretries++; 505 if (error = nfs_send(so, rep->r_nmp->nm_nam, m, rep)) { 506 if (error == EINTR || error == ERESTART || 507 (error = nfs_reconnect(rep))) { 508 nfs_sndunlock(&rep->r_nmp->nm_flag); 509 return (error); 510 } 511 goto tryagain; 512 } 513 } 514 nfs_sndunlock(&rep->r_nmp->nm_flag); 515 if (sotype == SOCK_STREAM) { 516 aio.iov_base = (caddr_t) &len; 517 aio.iov_len = sizeof(u_long); 518 auio.uio_iov = &aio; 519 auio.uio_iovcnt = 1; 520 auio.uio_segflg = UIO_SYSSPACE; 521 auio.uio_rw = UIO_READ; 522 auio.uio_offset = 0; 523 auio.uio_resid = sizeof(u_long); 524 auio.uio_procp = p; 525 do { 526 rcvflg = MSG_WAITALL; 527 error = soreceive(so, (struct mbuf **)0, &auio, 528 (struct mbuf **)0, (struct mbuf **)0, &rcvflg); 529 if (error == EWOULDBLOCK && rep) { 530 if (rep->r_flags & R_SOFTTERM) 531 return (EINTR); 532 } 533 } while (error == EWOULDBLOCK); 534 if (!error && auio.uio_resid > 0) { 535 log(LOG_INFO, 536 "short receive (%d/%d) from nfs server %s\n", 537 sizeof(u_long) - auio.uio_resid, 538 sizeof(u_long), 539 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 540 error = EPIPE; 541 } 542 if (error) 543 goto errout; 544 len = ntohl(len) & ~0x80000000; 545 /* 546 * This is SERIOUS! We are out of sync with the sender 547 * and forcing a disconnect/reconnect is all I can do. 
548 */ 549 if (len > NFS_MAXPACKET) { 550 log(LOG_ERR, "%s (%d) from nfs server %s\n", 551 "impossible packet length", 552 len, 553 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 554 error = EFBIG; 555 goto errout; 556 } 557 auio.uio_resid = len; 558 do { 559 rcvflg = MSG_WAITALL; 560 error = soreceive(so, (struct mbuf **)0, 561 &auio, mp, (struct mbuf **)0, &rcvflg); 562 } while (error == EWOULDBLOCK || error == EINTR || 563 error == ERESTART); 564 if (!error && auio.uio_resid > 0) { 565 log(LOG_INFO, 566 "short receive (%d/%d) from nfs server %s\n", 567 len - auio.uio_resid, len, 568 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 569 error = EPIPE; 570 } 571 } else { 572 /* 573 * NB: Since uio_resid is big, MSG_WAITALL is ignored 574 * and soreceive() will return when it has either a 575 * control msg or a data msg. 576 * We have no use for control msg., but must grab them 577 * and then throw them away so we know what is going 578 * on. 579 */ 580 auio.uio_resid = len = 100000000; /* Anything Big */ 581 auio.uio_procp = p; 582 do { 583 rcvflg = 0; 584 error = soreceive(so, (struct mbuf **)0, 585 &auio, mp, &control, &rcvflg); 586 if (control) 587 m_freem(control); 588 if (error == EWOULDBLOCK && rep) { 589 if (rep->r_flags & R_SOFTTERM) 590 return (EINTR); 591 } 592 } while (error == EWOULDBLOCK || 593 (!error && *mp == NULL && control)); 594 if ((rcvflg & MSG_EOR) == 0) 595 printf("Egad!!\n"); 596 if (!error && *mp == NULL) 597 error = EPIPE; 598 len -= auio.uio_resid; 599 } 600 errout: 601 if (error && error != EINTR && error != ERESTART) { 602 m_freem(*mp); 603 *mp = (struct mbuf *)0; 604 if (error != EPIPE) 605 log(LOG_INFO, 606 "receive error %d from nfs server %s\n", 607 error, 608 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 609 error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); 610 if (!error) 611 error = nfs_reconnect(rep); 612 if (!error) 613 goto tryagain; 614 } 615 } else { 616 if ((so = rep->r_nmp->nm_so) == NULL) 617 return (EACCES); 618 if 
(so->so_state & SS_ISCONNECTED) 619 getnam = (struct mbuf **)0; 620 else 621 getnam = aname; 622 auio.uio_resid = len = 1000000; 623 auio.uio_procp = p; 624 do { 625 rcvflg = 0; 626 error = soreceive(so, getnam, &auio, mp, 627 (struct mbuf **)0, &rcvflg); 628 if (error == EWOULDBLOCK && 629 (rep->r_flags & R_SOFTTERM)) 630 return (EINTR); 631 } while (error == EWOULDBLOCK); 632 len -= auio.uio_resid; 633 } 634 if (error) { 635 m_freem(*mp); 636 *mp = (struct mbuf *)0; 637 } 638 /* 639 * Search for any mbufs that are not a multiple of 4 bytes long 640 * or with m_data not longword aligned. 641 * These could cause pointer alignment problems, so copy them to 642 * well aligned mbufs. 643 */ 644 nfs_realign(*mp, 5 * NFSX_UNSIGNED); 645 return (error); 646 } 647 648 /* 649 * Implement receipt of reply on a socket. 650 * We must search through the list of received datagrams matching them 651 * with outstanding requests using the xid, until ours is found. 652 */ 653 /* ARGSUSED */ 654 nfs_reply(myrep) 655 struct nfsreq *myrep; 656 { 657 register struct nfsreq *rep; 658 register struct nfsmount *nmp = myrep->r_nmp; 659 register long t1; 660 struct mbuf *mrep, *nam, *md; 661 u_long rxid, *tl; 662 caddr_t dpos, cp2; 663 int error; 664 665 /* 666 * Loop around until we get our own reply 667 */ 668 for (;;) { 669 /* 670 * Lock against other receivers so that I don't get stuck in 671 * sbwait() after someone else has received my reply for me. 672 * Also necessary for connection based protocols to avoid 673 * race conditions during a reconnect. 
674 */ 675 if (error = nfs_rcvlock(myrep)) 676 return (error); 677 /* Already received, bye bye */ 678 if (myrep->r_mrep != NULL) { 679 nfs_rcvunlock(&nmp->nm_flag); 680 return (0); 681 } 682 /* 683 * Get the next Rpc reply off the socket 684 */ 685 error = nfs_receive(myrep, &nam, &mrep); 686 nfs_rcvunlock(&nmp->nm_flag); 687 if (error) printf("rcv err=%d\n",error); 688 if (error) { 689 690 /* 691 * Ignore routing errors on connectionless protocols?? 692 */ 693 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { 694 nmp->nm_so->so_error = 0; 695 continue; 696 } 697 return (error); 698 } 699 if (nam) 700 m_freem(nam); 701 702 /* 703 * Get the xid and check that it is an rpc reply 704 */ 705 md = mrep; 706 dpos = mtod(md, caddr_t); 707 nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); 708 rxid = *tl++; 709 if (*tl != rpc_reply) { 710 if (nmp->nm_flag & NFSMNT_NQNFS) { 711 if (nqnfs_callback(nmp, mrep, md, dpos)) 712 nfsstats.rpcinvalid++; 713 } else { 714 nfsstats.rpcinvalid++; 715 m_freem(mrep); 716 } 717 nfsmout: 718 continue; 719 } 720 721 /* 722 * Loop through the request list to match up the reply 723 * Iff no match, just drop the datagram 724 */ 725 rep = nfsreqh.r_next; 726 while (rep != &nfsreqh) { 727 if (rep->r_mrep == NULL && rxid == rep->r_xid) { 728 /* Found it.. */ 729 rep->r_mrep = mrep; 730 rep->r_md = md; 731 rep->r_dpos = dpos; 732 if (nfsrtton) { 733 struct rttl *rt; 734 735 rt = &nfsrtt.rttl[nfsrtt.pos]; 736 rt->proc = rep->r_procnum; 737 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]); 738 rt->sent = nmp->nm_sent; 739 rt->cwnd = nmp->nm_cwnd; 740 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; 741 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; 742 rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; 743 rt->tstamp = time; 744 if (rep->r_flags & R_TIMING) 745 rt->rtt = rep->r_rtt; 746 else 747 rt->rtt = 1000000; 748 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ; 749 } 750 /* 751 * Update congestion window. 
752 * Do the additive increase of 753 * one rpc/rtt. 754 */ 755 if (nmp->nm_cwnd <= nmp->nm_sent) { 756 nmp->nm_cwnd += 757 (NFS_CWNDSCALE * NFS_CWNDSCALE + 758 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; 759 if (nmp->nm_cwnd > NFS_MAXCWND) 760 nmp->nm_cwnd = NFS_MAXCWND; 761 } 762 nmp->nm_sent -= NFS_CWNDSCALE; 763 /* 764 * Update rtt using a gain of 0.125 on the mean 765 * and a gain of 0.25 on the deviation. 766 */ 767 if (rep->r_flags & R_TIMING) { 768 /* 769 * Since the timer resolution of 770 * NFS_HZ is so course, it can often 771 * result in r_rtt == 0. Since 772 * r_rtt == N means that the actual 773 * rtt is between N+dt and N+2-dt ticks, 774 * add 1. 775 */ 776 t1 = rep->r_rtt + 1; 777 t1 -= (NFS_SRTT(rep) >> 3); 778 NFS_SRTT(rep) += t1; 779 if (t1 < 0) 780 t1 = -t1; 781 t1 -= (NFS_SDRTT(rep) >> 2); 782 NFS_SDRTT(rep) += t1; 783 } 784 nmp->nm_timeouts = 0; 785 break; 786 } 787 rep = rep->r_next; 788 } 789 /* 790 * If not matched to a request, drop it. 791 * If it's mine, get out. 792 */ 793 if (rep == &nfsreqh) { 794 nfsstats.rpcunexpected++; 795 m_freem(mrep); 796 } else if (rep == myrep) { 797 if (rep->r_mrep == NULL) 798 panic("nfsreply nil"); 799 return (0); 800 } 801 } 802 } 803 804 /* 805 * nfs_request - goes something like this 806 * - fill in request struct 807 * - links it into list 808 * - calls nfs_send() for first transmit 809 * - calls nfs_receive() to get reply 810 * - break down rpc header and return with nfs reply pointed to 811 * by mrep or error 812 * nb: always frees up mreq mbuf list 813 */ 814 nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp) 815 struct vnode *vp; 816 struct mbuf *mrest; 817 int procnum; 818 struct proc *procp; 819 struct ucred *cred; 820 struct mbuf **mrp; 821 struct mbuf **mdp; 822 caddr_t *dposp; 823 { 824 register struct mbuf *m, *mrep; 825 register struct nfsreq *rep; 826 register u_long *tl; 827 register int i; 828 struct nfsmount *nmp; 829 struct mbuf *md, *mheadend; 830 struct nfsreq *reph; 831 struct 
nfsnode *tp, *np; 832 time_t reqtime, waituntil; 833 caddr_t dpos, cp2; 834 int t1, nqlflag, cachable, s, error = 0, mrest_len, auth_len, auth_type; 835 int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0, failed_auth = 0; 836 u_long xid; 837 u_quad_t frev; 838 char *auth_str; 839 840 nmp = VFSTONFS(vp->v_mount); 841 MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); 842 rep->r_nmp = nmp; 843 rep->r_vp = vp; 844 rep->r_procp = procp; 845 rep->r_procnum = procnum; 846 i = 0; 847 m = mrest; 848 while (m) { 849 i += m->m_len; 850 m = m->m_next; 851 } 852 mrest_len = i; 853 854 /* 855 * Get the RPC header with authorization. 856 */ 857 kerbauth: 858 auth_str = (char *)0; 859 if (nmp->nm_flag & NFSMNT_KERB) { 860 if (failed_auth) { 861 error = nfs_getauth(nmp, rep, cred, &auth_type, 862 &auth_str, &auth_len); 863 if (error) { 864 free((caddr_t)rep, M_NFSREQ); 865 m_freem(mrest); 866 return (error); 867 } 868 } else { 869 auth_type = RPCAUTH_UNIX; 870 auth_len = 5 * NFSX_UNSIGNED; 871 } 872 } else { 873 auth_type = RPCAUTH_UNIX; 874 if (cred->cr_ngroups < 1) 875 panic("nfsreq nogrps"); 876 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? 877 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + 878 5 * NFSX_UNSIGNED; 879 } 880 m = nfsm_rpchead(cred, (nmp->nm_flag & NFSMNT_NQNFS), procnum, 881 auth_type, auth_len, auth_str, mrest, mrest_len, &mheadend, &xid); 882 if (auth_str) 883 free(auth_str, M_TEMP); 884 885 /* 886 * For stream protocols, insert a Sun RPC Record Mark. 
887 */ 888 if (nmp->nm_sotype == SOCK_STREAM) { 889 M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); 890 *mtod(m, u_long *) = htonl(0x80000000 | 891 (m->m_pkthdr.len - NFSX_UNSIGNED)); 892 } 893 rep->r_mreq = m; 894 rep->r_xid = xid; 895 tryagain: 896 if (nmp->nm_flag & NFSMNT_SOFT) 897 rep->r_retry = nmp->nm_retry; 898 else 899 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ 900 rep->r_rtt = rep->r_rexmit = 0; 901 if (proct[procnum] > 0) 902 rep->r_flags = R_TIMING; 903 else 904 rep->r_flags = 0; 905 rep->r_mrep = NULL; 906 907 /* 908 * Do the client side RPC. 909 */ 910 nfsstats.rpcrequests++; 911 /* 912 * Chain request into list of outstanding requests. Be sure 913 * to put it LAST so timer finds oldest requests first. 914 */ 915 s = splsoftclock(); 916 reph = &nfsreqh; 917 reph->r_prev->r_next = rep; 918 rep->r_prev = reph->r_prev; 919 reph->r_prev = rep; 920 rep->r_next = reph; 921 922 /* Get send time for nqnfs */ 923 reqtime = time.tv_sec; 924 925 /* 926 * If backing off another request or avoiding congestion, don't 927 * send this one now but let timer do it. If not timing a request, 928 * do it now. 929 */ 930 if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || 931 (nmp->nm_flag & NFSMNT_DUMBTIMR) || 932 nmp->nm_sent < nmp->nm_cwnd)) { 933 splx(s); 934 if (nmp->nm_soflags & PR_CONNREQUIRED) 935 error = nfs_sndlock(&nmp->nm_flag, rep); 936 if (!error) { 937 m = m_copym(m, 0, M_COPYALL, M_WAIT); 938 error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep); 939 if (nmp->nm_soflags & PR_CONNREQUIRED) 940 nfs_sndunlock(&nmp->nm_flag); 941 } 942 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) { 943 nmp->nm_sent += NFS_CWNDSCALE; 944 rep->r_flags |= R_SENT; 945 } 946 } else { 947 splx(s); 948 rep->r_rtt = -1; 949 } 950 951 /* 952 * Wait for the reply from our send or the timer's. 953 */ 954 if (!error || error == EPIPE) 955 error = nfs_reply(rep); 956 957 /* 958 * RPC done, unlink the request. 
959 */ 960 s = splsoftclock(); 961 rep->r_prev->r_next = rep->r_next; 962 rep->r_next->r_prev = rep->r_prev; 963 splx(s); 964 965 /* 966 * If there was a successful reply and a tprintf msg. 967 * tprintf a response. 968 */ 969 if (!error && (rep->r_flags & R_TPRINTFMSG)) 970 nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, 971 "is alive again"); 972 mrep = rep->r_mrep; 973 md = rep->r_md; 974 dpos = rep->r_dpos; 975 if (error) { 976 m_freem(rep->r_mreq); 977 free((caddr_t)rep, M_NFSREQ); 978 return (error); 979 } 980 981 /* 982 * break down the rpc header and check if ok 983 */ 984 nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED); 985 if (*tl++ == rpc_msgdenied) { 986 if (*tl == rpc_mismatch) 987 error = EOPNOTSUPP; 988 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) { 989 if (*tl == rpc_rejectedcred && failed_auth == 0) { 990 failed_auth++; 991 mheadend->m_next = (struct mbuf *)0; 992 m_freem(mrep); 993 m_freem(rep->r_mreq); 994 goto kerbauth; 995 } else 996 error = EAUTH; 997 } else 998 error = EACCES; 999 m_freem(mrep); 1000 m_freem(rep->r_mreq); 1001 free((caddr_t)rep, M_NFSREQ); 1002 return (error); 1003 } 1004 1005 /* 1006 * skip over the auth_verf, someday we may want to cache auth_short's 1007 * for nfs_reqhead(), but for now just dump it 1008 */ 1009 if (*++tl != 0) { 1010 i = nfsm_rndup(fxdr_unsigned(long, *tl)); 1011 nfsm_adv(i); 1012 } 1013 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); 1014 /* 0 == ok */ 1015 if (*tl == 0) { 1016 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); 1017 if (*tl != 0) { 1018 error = fxdr_unsigned(int, *tl); 1019 m_freem(mrep); 1020 if ((nmp->nm_flag & NFSMNT_NQNFS) && 1021 error == NQNFS_TRYLATER) { 1022 error = 0; 1023 waituntil = time.tv_sec + trylater_delay; 1024 while (time.tv_sec < waituntil) 1025 (void) tsleep((caddr_t)&lbolt, 1026 PSOCK, "nqnfstry", 0); 1027 trylater_delay *= nfs_backoff[trylater_cnt]; 1028 if (trylater_cnt < 7) 1029 trylater_cnt++; 1030 goto tryagain; 1031 } 1032 1033 /* 1034 * If the 
File Handle was stale, invalidate the 1035 * lookup cache, just in case. 1036 */ 1037 if (error == ESTALE) 1038 cache_purge(vp); 1039 m_freem(rep->r_mreq); 1040 free((caddr_t)rep, M_NFSREQ); 1041 return (error); 1042 } 1043 1044 /* 1045 * For nqnfs, get any lease in reply 1046 */ 1047 if (nmp->nm_flag & NFSMNT_NQNFS) { 1048 nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); 1049 if (*tl) { 1050 np = VTONFS(vp); 1051 nqlflag = fxdr_unsigned(int, *tl); 1052 nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); 1053 cachable = fxdr_unsigned(int, *tl++); 1054 reqtime += fxdr_unsigned(int, *tl++); 1055 if (reqtime > time.tv_sec) { 1056 fxdr_hyper(tl, &frev); 1057 nqnfs_clientlease(nmp, np, nqlflag, 1058 cachable, reqtime, frev); 1059 } 1060 } 1061 } 1062 *mrp = mrep; 1063 *mdp = md; 1064 *dposp = dpos; 1065 m_freem(rep->r_mreq); 1066 FREE((caddr_t)rep, M_NFSREQ); 1067 return (0); 1068 } 1069 m_freem(mrep); 1070 m_freem(rep->r_mreq); 1071 free((caddr_t)rep, M_NFSREQ); 1072 error = EPROTONOSUPPORT; 1073 nfsmout: 1074 return (error); 1075 } 1076 1077 /* 1078 * Generate the rpc reply header 1079 * siz arg. is used to decide if adding a cluster is worthwhile 1080 */ 1081 nfs_rephead(siz, nd, err, cache, frev, mrq, mbp, bposp) 1082 int siz; 1083 struct nfsd *nd; 1084 int err; 1085 int cache; 1086 u_quad_t *frev; 1087 struct mbuf **mrq; 1088 struct mbuf **mbp; 1089 caddr_t *bposp; 1090 { 1091 register u_long *tl; 1092 register struct mbuf *mreq; 1093 caddr_t bpos; 1094 struct mbuf *mb, *mb2; 1095 1096 MGETHDR(mreq, M_WAIT, MT_DATA); 1097 mb = mreq; 1098 /* 1099 * If this is a big reply, use a cluster else 1100 * try and leave leading space for the lower level headers. 
1101 */ 1102 siz += RPC_REPLYSIZ; 1103 if (siz >= MINCLSIZE) { 1104 MCLGET(mreq, M_WAIT); 1105 } else 1106 mreq->m_data += max_hdr; 1107 tl = mtod(mreq, u_long *); 1108 mreq->m_len = 6*NFSX_UNSIGNED; 1109 bpos = ((caddr_t)tl)+mreq->m_len; 1110 *tl++ = nd->nd_retxid; 1111 *tl++ = rpc_reply; 1112 if (err == ERPCMISMATCH || err == NQNFS_AUTHERR) { 1113 *tl++ = rpc_msgdenied; 1114 if (err == NQNFS_AUTHERR) { 1115 *tl++ = rpc_autherr; 1116 *tl = rpc_rejectedcred; 1117 mreq->m_len -= NFSX_UNSIGNED; 1118 bpos -= NFSX_UNSIGNED; 1119 } else { 1120 *tl++ = rpc_mismatch; 1121 *tl++ = txdr_unsigned(2); 1122 *tl = txdr_unsigned(2); 1123 } 1124 } else { 1125 *tl++ = rpc_msgaccepted; 1126 *tl++ = 0; 1127 *tl++ = 0; 1128 switch (err) { 1129 case EPROGUNAVAIL: 1130 *tl = txdr_unsigned(RPC_PROGUNAVAIL); 1131 break; 1132 case EPROGMISMATCH: 1133 *tl = txdr_unsigned(RPC_PROGMISMATCH); 1134 nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); 1135 *tl++ = txdr_unsigned(2); 1136 *tl = txdr_unsigned(2); /* someday 3 */ 1137 break; 1138 case EPROCUNAVAIL: 1139 *tl = txdr_unsigned(RPC_PROCUNAVAIL); 1140 break; 1141 default: 1142 *tl = 0; 1143 if (err != VNOVAL) { 1144 nfsm_build(tl, u_long *, NFSX_UNSIGNED); 1145 if (err) 1146 *tl = txdr_unsigned(nfsrv_errmap[err - 1]); 1147 else 1148 *tl = 0; 1149 } 1150 break; 1151 }; 1152 } 1153 1154 /* 1155 * For nqnfs, piggyback lease as requested. 
1156 */ 1157 if (nd->nd_nqlflag != NQL_NOVAL && err == 0) { 1158 if (nd->nd_nqlflag) { 1159 nfsm_build(tl, u_long *, 5*NFSX_UNSIGNED); 1160 *tl++ = txdr_unsigned(nd->nd_nqlflag); 1161 *tl++ = txdr_unsigned(cache); 1162 *tl++ = txdr_unsigned(nd->nd_duration); 1163 txdr_hyper(frev, tl); 1164 } else { 1165 if (nd->nd_nqlflag != 0) 1166 panic("nqreph"); 1167 nfsm_build(tl, u_long *, NFSX_UNSIGNED); 1168 *tl = 0; 1169 } 1170 } 1171 *mrq = mreq; 1172 *mbp = mb; 1173 *bposp = bpos; 1174 if (err != 0 && err != VNOVAL) 1175 nfsstats.srvrpc_errs++; 1176 return (0); 1177 } 1178 1179 /* 1180 * Nfs timer routine 1181 * Scan the nfsreq list and retranmit any requests that have timed out 1182 * To avoid retransmission attempts on STREAM sockets (in the future) make 1183 * sure to set the r_retry field to 0 (implies nm_retry == 0). 1184 */ 1185 void 1186 nfs_timer(arg) 1187 void *arg; 1188 { 1189 register struct nfsreq *rep; 1190 register struct mbuf *m; 1191 register struct socket *so; 1192 register struct nfsmount *nmp; 1193 register int timeo; 1194 static long lasttime = 0; 1195 int s, error; 1196 1197 s = splnet(); 1198 for (rep = nfsreqh.r_next; rep != &nfsreqh; rep = rep->r_next) { 1199 nmp = rep->r_nmp; 1200 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) 1201 continue; 1202 if (nfs_sigintr(nmp, rep, rep->r_procp)) { 1203 rep->r_flags |= R_SOFTTERM; 1204 continue; 1205 } 1206 if (rep->r_rtt >= 0) { 1207 rep->r_rtt++; 1208 if (nmp->nm_flag & NFSMNT_DUMBTIMR) 1209 timeo = nmp->nm_timeo; 1210 else 1211 timeo = NFS_RTO(nmp, proct[rep->r_procnum]); 1212 if (nmp->nm_timeouts > 0) 1213 timeo *= nfs_backoff[nmp->nm_timeouts - 1]; 1214 if (rep->r_rtt <= timeo) 1215 continue; 1216 if (nmp->nm_timeouts < 8) 1217 nmp->nm_timeouts++; 1218 } 1219 /* 1220 * Check for server not responding 1221 */ 1222 if ((rep->r_flags & R_TPRINTFMSG) == 0 && 1223 rep->r_rexmit > nmp->nm_deadthresh) { 1224 nfs_msg(rep->r_procp, 1225 nmp->nm_mountp->mnt_stat.f_mntfromname, 1226 "not responding"); 1227 
rep->r_flags |= R_TPRINTFMSG; 1228 } 1229 if (rep->r_rexmit >= rep->r_retry) { /* too many */ 1230 nfsstats.rpctimeouts++; 1231 rep->r_flags |= R_SOFTTERM; 1232 continue; 1233 } 1234 if (nmp->nm_sotype != SOCK_DGRAM) { 1235 if (++rep->r_rexmit > NFS_MAXREXMIT) 1236 rep->r_rexmit = NFS_MAXREXMIT; 1237 continue; 1238 } 1239 if ((so = nmp->nm_so) == NULL) 1240 continue; 1241 1242 /* 1243 * If there is enough space and the window allows.. 1244 * Resend it 1245 * Set r_rtt to -1 in case we fail to send it now. 1246 */ 1247 rep->r_rtt = -1; 1248 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && 1249 ((nmp->nm_flag & NFSMNT_DUMBTIMR) || 1250 (rep->r_flags & R_SENT) || 1251 nmp->nm_sent < nmp->nm_cwnd) && 1252 (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){ 1253 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) 1254 error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m, 1255 (struct mbuf *)0, (struct mbuf *)0); 1256 else 1257 error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m, 1258 nmp->nm_nam, (struct mbuf *)0); 1259 if (error) { 1260 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) 1261 so->so_error = 0; 1262 } else { 1263 /* 1264 * Iff first send, start timing 1265 * else turn timing off, backoff timer 1266 * and divide congestion window by 2. 1267 */ 1268 if (rep->r_flags & R_SENT) { 1269 rep->r_flags &= ~R_TIMING; 1270 if (++rep->r_rexmit > NFS_MAXREXMIT) 1271 rep->r_rexmit = NFS_MAXREXMIT; 1272 nmp->nm_cwnd >>= 1; 1273 if (nmp->nm_cwnd < NFS_CWNDSCALE) 1274 nmp->nm_cwnd = NFS_CWNDSCALE; 1275 nfsstats.rpcretries++; 1276 } else { 1277 rep->r_flags |= R_SENT; 1278 nmp->nm_sent += NFS_CWNDSCALE; 1279 } 1280 rep->r_rtt = 0; 1281 } 1282 } 1283 } 1284 1285 /* 1286 * Call the nqnfs server timer once a second to handle leases. 1287 */ 1288 if (lasttime != time.tv_sec) { 1289 lasttime = time.tv_sec; 1290 nqnfs_serverd(); 1291 } 1292 splx(s); 1293 timeout(nfs_timer, (caddr_t)0, hz/NFS_HZ); 1294 } 1295 1296 /* 1297 * Test for a termination condition pending on the process. 
1298 * This is used for NFSMNT_INT mounts. 1299 */ 1300 nfs_sigintr(nmp, rep, p) 1301 struct nfsmount *nmp; 1302 struct nfsreq *rep; 1303 register struct proc *p; 1304 { 1305 1306 if (rep && (rep->r_flags & R_SOFTTERM)) 1307 return (EINTR); 1308 if (!(nmp->nm_flag & NFSMNT_INT)) 1309 return (0); 1310 if (p && p->p_sig && (((p->p_sig &~ p->p_sigmask) &~ p->p_sigignore) & 1311 NFSINT_SIGMASK)) 1312 return (EINTR); 1313 return (0); 1314 } 1315 1316 /* 1317 * Lock a socket against others. 1318 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply 1319 * and also to avoid race conditions between the processes with nfs requests 1320 * in progress when a reconnect is necessary. 1321 */ 1322 nfs_sndlock(flagp, rep) 1323 register int *flagp; 1324 struct nfsreq *rep; 1325 { 1326 struct proc *p; 1327 1328 if (rep) 1329 p = rep->r_procp; 1330 else 1331 p = (struct proc *)0; 1332 while (*flagp & NFSMNT_SNDLOCK) { 1333 if (nfs_sigintr(rep->r_nmp, rep, p)) 1334 return (EINTR); 1335 *flagp |= NFSMNT_WANTSND; 1336 (void) tsleep((caddr_t)flagp, PZERO-1, "nfsndlck", 0); 1337 } 1338 *flagp |= NFSMNT_SNDLOCK; 1339 return (0); 1340 } 1341 1342 /* 1343 * Unlock the stream socket for others. 1344 */ 1345 void 1346 nfs_sndunlock(flagp) 1347 register int *flagp; 1348 { 1349 1350 if ((*flagp & NFSMNT_SNDLOCK) == 0) 1351 panic("nfs sndunlock"); 1352 *flagp &= ~NFSMNT_SNDLOCK; 1353 if (*flagp & NFSMNT_WANTSND) { 1354 *flagp &= ~NFSMNT_WANTSND; 1355 wakeup((caddr_t)flagp); 1356 } 1357 } 1358 1359 nfs_rcvlock(rep) 1360 register struct nfsreq *rep; 1361 { 1362 register int *flagp = &rep->r_nmp->nm_flag; 1363 1364 while (*flagp & NFSMNT_RCVLOCK) { 1365 if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp)) 1366 return (EINTR); 1367 *flagp |= NFSMNT_WANTRCV; 1368 (void) tsleep((caddr_t)flagp, PZERO-1, "nfsrcvlck", 0); 1369 } 1370 *flagp |= NFSMNT_RCVLOCK; 1371 return (0); 1372 } 1373 1374 /* 1375 * Unlock the stream socket for others. 
 * Releases the receive lock: panics if NFSMNT_RCVLOCK is not set, clears
 * it and wakes any sleeper that set NFSMNT_WANTRCV.
 */
void
nfs_rcvunlock(flagp)
	register int *flagp;
{

	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup((caddr_t)flagp);
	}
}

/*
 * Check for badly aligned mbuf data areas and
 * realign data in an mbuf list by copying the data areas up, as required.
 *
 *	m	head of the mbuf chain to check
 *	hsiz	byte count used as the fill limit for the first rewritten
 *		mbuf when that mbuf held no more than hsiz bytes
 *
 * The chain is scanned for the first mbuf whose length or data address is
 * not a multiple of 4.  From that mbuf to the end of the chain the data is
 * repacked in place, each rewritten mbuf starting at the beginning of its
 * own storage.  Mbufs before that point are left untouched.  M_PKTHDR is
 * cleared on every mbuf that is rewritten.
 */
void
nfs_realign(m, hsiz)
	register struct mbuf *m;
	int hsiz;
{
	register struct mbuf *m2;
	register int siz, mlen, olen;
	register caddr_t tcp, fcp;
	struct mbuf *mnew;

	while (m) {
		/*
		 * This never happens for UDP, rarely happens for TCP
		 * but frequently happens for iso transport.
		 */
		if ((m->m_len & 0x3) || (mtod(m, int) & 0x3)) {
			/*
			 * Remember where this mbuf's data is (fcp/olen),
			 * then reset it to empty at the start of its
			 * storage; it becomes the first destination
			 * (mnew/tcp).  m2 is the next mbuf that will be
			 * used as a destination.
			 */
			olen = m->m_len;
			fcp = mtod(m, caddr_t);
			m->m_flags &= ~M_PKTHDR;
			if (m->m_flags & M_EXT)
				m->m_data = m->m_ext.ext_buf;
			else
				m->m_data = m->m_dat;
			m->m_len = 0;
			tcp = mtod(m, caddr_t);
			mnew = m;
			m2 = m->m_next;

			/*
			 * If possible, only put the first invariant part
			 * of the RPC header in the first mbuf.
			 * NOTE(review): mlen is set to hsiz without
			 * checking that this mbuf really has hsiz bytes
			 * of room -- confirm callers guarantee that.
			 */
			if (olen <= hsiz)
				mlen = hsiz;
			else
				mlen = M_TRAILINGSPACE(m);

			/*
			 * Loop through the mbuf list consolidating data.
			 * m walks the source mbufs; mnew/tcp/mlen describe
			 * the destination being filled.
			 */
			while (m) {
				while (olen > 0) {
					if (mlen == 0) {
						/*
						 * Destination full: reset the
						 * next mbuf and fill that.
						 * NOTE(review): m2 is not
						 * checked for null here.
						 */
						m2->m_flags &= ~M_PKTHDR;
						if (m2->m_flags & M_EXT)
							m2->m_data = m2->m_ext.ext_buf;
						else
							m2->m_data = m2->m_dat;
						m2->m_len = 0;
						mlen = M_TRAILINGSPACE(m2);
						tcp = mtod(m2, caddr_t);
						mnew = m2;
						m2 = m2->m_next;
					}
					siz = min(mlen, olen);
					/* nothing to move if already in place */
					if (tcp != fcp)
						bcopy(fcp, tcp, siz);
					mnew->m_len += siz;
					mlen -= siz;
					olen -= siz;
					tcp += siz;
					fcp += siz;
				}
				m = m->m_next;
				if (m) {
					olen = m->m_len;
					fcp = mtod(m, caddr_t);
				}
			}

			/*
			 * Finally, set m_len == 0 for any trailing mbufs that have
			 * been copied out of.
			 */
			while (m2) {
				m2->m_len = 0;
				m2 = m2->m_next;
			}
			return;
		}
		m = m->m_next;
	}
}

/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 *
 * Stream sockets: received data is appended to the raw chain
 * (ns_raw/ns_rawend, byte count in ns_cc) and nfsrv_getstream() is asked
 * to cut records out of it.
 * Other sockets: each soreceive() result is queued as one record on
 * ns_rec/ns_recend, with the sender's address mbuf (if any) in front.
 * SLP_NEEDQ and SLP_DISCONN are set in ns_flag to tell the nfsds that
 * more receiving is needed or that the connection should be dropped.
 */
void
nfsrv_rcv(so, arg, waitflag)
	struct socket *so;
	caddr_t arg;
	int waitflag;
{
	register struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	register struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;
#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ; goto dorecs;
	}
#endif
	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == M_DONTWAIT) {
			slp->ns_flag |= SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().
		 * uio_resid is preset to a very large count; the number of
		 * bytes actually received is that count minus what is left.
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, (struct mbuf **)0, &flags);
		if (error || mp == (struct mbuf *)0) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 * EPERM means a bad record length, so disconnect; any
		 * other error means try again later from an nfsd.
		 */
		if (error = nfsrv_getstream(slp, waitflag)) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		/* Keep receiving until soreceive() hands back no data. */
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
						(struct mbuf **)0, &flags);
			if (mp) {
				nfs_realign(mp, 10 * NFSX_UNSIGNED);
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = (struct mbuf *)0;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
					&& error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
		(slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket.
The "waitflag" argument indicates whether or not it
 * can sleep.
 *
 * Works on the raw chain slp->ns_raw (ns_cc bytes).  Each pass first
 * reads a four byte record mark to learn the record length (kept in
 * ns_reclen so that a partly received record can be resumed on a later
 * call), then, once ns_cc covers the whole record, unlinks the record,
 * realigns it and appends it to the ns_rec/ns_recend queue.
 *
 * Returns 0 when no further complete record is available, EPERM when a
 * record length is outside NFS_MINPACKET..NFS_MAXPACKET, and EWOULDBLOCK
 * when m_copym() could not get an mbuf.  SLP_GETSTREAM guards against
 * re-entry (panic) and is cleared on every return.
 */
nfsrv_getstream(slp, waitflag)
	register struct nfssvc_sock *slp;
	int waitflag;
{
	register struct mbuf *m;
	register char *cp1, *cp2;
	register int len;
	struct mbuf *om, *m2, *recm;
	u_long recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		/* ns_reclen == 0: the next thing in the stream is a mark. */
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				/*
				 * The mark straddles mbufs: gather it a byte
				 * at a time, skipping empty mbufs.
				 */
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			/*
			 * The top bit of the mark is masked off and otherwise
			 * ignored (in RPC record marking it flags the last
			 * fragment), so each fragment is taken to be a
			 * complete record.
			 */
			slp->ns_reclen = ntohl(recmark) & ~0x80000000;
			if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		if (slp->ns_cc == slp->ns_reclen) {
			/* The raw chain is exactly one record: take it all. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/*
			 * More than one record's worth is queued: walk the
			 * chain and cut after ns_reclen bytes.  om trails m
			 * as the last mbuf wholly inside the record.
			 */
			len = 0;
			m = slp->ns_raw;
			om = (struct mbuf *)0;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					/*
					 * The record ends inside this mbuf:
					 * copy its leading part onto the
					 * record and trim it from the mbuf,
					 * which stays on the raw chain.
					 */
					m2 = m_copym(m, 0, slp->ns_reclen - len,
						waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						/*
						 * Nothing has been changed;
						 * ns_reclen is kept so a
						 * later call can retry.
						 */
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					/* Record ends on an mbuf boundary. */
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = (struct mbuf *)0;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			/* Record not complete yet; wait for more data. */
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}
		nfs_realign(recm, 10 * NFSX_UNSIGNED);
		if (slp->ns_recend)
			slp->ns_recend->m_nextpkt = recm;
		else
			slp->ns_rec = recm;
		slp->ns_recend = recm;
	}
}

/*
 * Parse an RPC header.
 * Takes the first record off the slp->ns_rec queue, splits off a leading
 * MT_SONAME mbuf (the sender's address) into nd->nd_nam when there is
 * one, points nd_mrep/nd_md/nd_dpos at the request data and hands the
 * descriptor to nfs_getreq().
 * Returns ENOBUFS when the socket is not valid or no record is queued,
 * otherwise the result of nfs_getreq(); on failure nd_nam is freed.
 */
nfsrv_dorec(slp, nd)
	register struct nfssvc_sock *slp;
	register struct nfsd *nd;
{
	register struct mbuf *m;
	int error;

	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == (struct mbuf *)0)
		return (ENOBUFS);
	/* Unlink the record; an emptied queue also clears ns_recend. */
	if (slp->ns_rec = m->m_nextpkt)
		m->m_nextpkt = (struct mbuf *)0;
	else
		slp->ns_recend = (struct mbuf *)0;
	if (m->m_type == MT_SONAME) {
		nd->nd_nam = m;
		nd->nd_md = nd->nd_mrep = m->m_next;
		m->m_next = (struct mbuf *)0;
	} else {
		nd->nd_nam = (struct mbuf *)0;
		nd->nd_md = nd->nd_mrep = m;
	}
	nd->nd_dpos = mtod(nd->nd_md, caddr_t);
	if (error = nfs_getreq(nd, TRUE)) {
		m_freem(nd->nd_nam);
		return (error);
	}
	return (0);
}

/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 *
 *	nd		descriptor; nd_mrep/nd_md/nd_dpos locate the request
 *			on entry
 *	has_header	non-zero when the xid and message type words are
 *			still in front of the rpc version word
 *
 * Returns 0 when a reply should be generated: either the request parsed
 * (nd_procnum set, nd_md/nd_dpos advanced past the rpc header) or an rpc
 * level rejection was recorded in nd_repstat with nd_procnum set to
 * NFSPROC_NOOP.  Returns EBADRPC, after freeing the request, when a
 * length field is out of range or the message is not a call.
 * NOTE(review): the nfsm_dissect/nfsm_adv/nfsm_mtouio macros presumably
 * set error and jump to nfsmout on failure, and use the locals t1, cp2,
 * md, dpos and mrep -- confirm against nfsm_subs.h.
 */
nfs_getreq(nd, has_header)
	register struct nfsd *nd;
	int has_header;
{
	register int len, i;
	register u_long *tl;
	register long t1;
	struct uio uio;
	struct iovec iov;
	caddr_t dpos, cp2;
	u_long nfsvers, auth_type;
	int error = 0, nqnfs = 0;
	struct mbuf *mrep, *md;

	mrep = nd->nd_mrep;
	md = nd->nd_md;
	dpos = nd->nd_dpos;
	/*
	 * Words consumed below: [xid, msg type,] rpc version, program,
	 * program version, procedure, credential flavor, credential
	 * length, plus the first two words of the credential body.
	 */
	if (has_header) {
		nfsm_dissect(tl, u_long *, 10*NFSX_UNSIGNED);
		nd->nd_retxid = *tl++;
		if (*tl++ != rpc_call) {
			m_freem(mrep);
			return (EBADRPC);
		}
	} else {
		nfsm_dissect(tl, u_long *, 8*NFSX_UNSIGNED);
	}
	nd->nd_repstat = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Accept either the nfs or the nqnfs program number. */
	nfsvers = nfs_vers;
	if (*tl != nfs_prog) {
		if (*tl == nqnfs_prog) {
			nqnfs++;
			nfsvers = nqnfs_vers;
		} else {
			nd->nd_repstat = EPROGUNAVAIL;
			nd->nd_procnum = NFSPROC_NOOP;
			return (0);
		}
	}
	tl++;
	if (*tl++ != nfsvers) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
	/* The null procedure is accepted without looking at credentials. */
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	/*
	 * Procedures above NFSPROC_STATFS are only accepted from nqnfs.
	 * An unsupported credential flavor is also reported as
	 * EPROCUNAVAIL.
	 */
	if (nd->nd_procnum >= NFS_NPROCS ||
		(!nqnfs && nd->nd_procnum > NFSPROC_STATFS) ||
		(*tl != rpc_auth_unix && *tl != rpc_auth_kerb)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(mrep);
		return (EBADRPC);
	}

	/*
	 * Handle auth_unix or auth_kerb.
	 */
	if (auth_type == rpc_auth_unix) {
		/*
		 * Skip one word (the stamp, per the auth_unix layout) and
		 * read the length of the following string, which is
		 * stepped over without being looked at.
		 */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED);
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(mrep);
			return (EBADRPC);
		}
		/* len group ids plus the two verifier header words */
		nfsm_dissect(tl, u_long *, (len + 2)*NFSX_UNSIGNED);
		/*
		 * Supplementary groups go in cr_groups[1..]; ids that do
		 * not fit below NGROUPS are skipped.
		 */
		for (i = 1; i <= len; i++)
			if (i < NGROUPS)
				nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
	} else if (auth_type == rpc_auth_kerb) {
		/*
		 * uid, then a counted string copied into nd_authstr; the
		 * descriptor is flagged NFSD_NEEDAUTH.
		 */
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_authlen = fxdr_unsigned(int, *tl);
		iov.iov_len = uio.uio_resid = nfsm_rndup(nd->nd_authlen);
		/* must fit inside the credential length checked above */
		if (uio.uio_resid > (len - 2*NFSX_UNSIGNED)) {
			m_freem(mrep);
			return (EBADRPC);
		}
		uio.uio_offset = 0;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		iov.iov_base = (caddr_t)nd->nd_authstr;
		nfsm_mtouio(&uio, uio.uio_resid);
		nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED);
		nd->nd_flag |= NFSD_NEEDAUTH;
	}

	/*
	 * Do we have any use for the verifier.
	 * According to the "Remote Procedure Call Protocol Spec." it
	 * should be AUTH_NULL, but some clients make it AUTH_UNIX?
	 * For now, just skip over it
	 * (tl is on the verifier flavor word; the length is the next one)
	 */
	len = fxdr_unsigned(int, *++tl);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(mrep);
		return (EBADRPC);
	}
	if (len > 0) {
		nfsm_adv(nfsm_rndup(len));
	}

	/*
	 * For nqnfs, get piggybacked lease request.
	 * Without one, nd_nqlflag is NQL_NOVAL and the duration defaults
	 * to NQ_MINLEASE.
	 */
	if (nqnfs && nd->nd_procnum != NQNFSPROC_EVICTED) {
		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
		nd->nd_nqlflag = fxdr_unsigned(int, *tl);
		if (nd->nd_nqlflag) {
			nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
			nd->nd_duration = fxdr_unsigned(int, *tl);
		} else
			nd->nd_duration = NQ_MINLEASE;
	} else {
		nd->nd_nqlflag = NQL_NOVAL;
		nd->nd_duration = NQ_MINLEASE;
	}
	/* Leave the parse position just past the rpc header. */
	nd->nd_md = md;
	nd->nd_dpos = dpos;
	return (0);
nfsmout:
	return (error);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
1887 */ 1888 void 1889 nfsrv_wakenfsd(slp) 1890 struct nfssvc_sock *slp; 1891 { 1892 register struct nfsd *nd = nfsd_head.nd_next; 1893 1894 if ((slp->ns_flag & SLP_VALID) == 0) 1895 return; 1896 while (nd != (struct nfsd *)&nfsd_head) { 1897 if (nd->nd_flag & NFSD_WAITING) { 1898 nd->nd_flag &= ~NFSD_WAITING; 1899 if (nd->nd_slp) 1900 panic("nfsd wakeup"); 1901 slp->ns_sref++; 1902 nd->nd_slp = slp; 1903 wakeup((caddr_t)nd); 1904 return; 1905 } 1906 nd = nd->nd_next; 1907 } 1908 slp->ns_flag |= SLP_DOREC; 1909 nfsd_head.nd_flag |= NFSD_CHECKSLP; 1910 } 1911 1912 nfs_msg(p, server, msg) 1913 struct proc *p; 1914 char *server, *msg; 1915 { 1916 tpr_t tpr; 1917 1918 if (p) 1919 tpr = tprintf_open(p); 1920 else 1921 tpr = NULL; 1922 tprintf(tpr, "nfs server %s: %s\n", server, msg); 1923 tprintf_close(tpr); 1924 } 1925