1 /* $OpenBSD: nfs_socket.c,v 1.95 2009/08/25 13:41:29 thib Exp $ */ 2 /* $NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $ */ 3 4 /* 5 * Copyright (c) 1989, 1991, 1993, 1995 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Rick Macklem at The University of Guelph. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 36 */ 37 38 /* 39 * Socket operations for use by nfs 40 */ 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/proc.h> 45 #include <sys/mount.h> 46 #include <sys/kernel.h> 47 #include <sys/mbuf.h> 48 #include <sys/vnode.h> 49 #include <sys/domain.h> 50 #include <sys/protosw.h> 51 #include <sys/socket.h> 52 #include <sys/socketvar.h> 53 #include <sys/syslog.h> 54 #include <sys/tprintf.h> 55 #include <sys/namei.h> 56 #include <sys/pool.h> 57 #include <sys/queue.h> 58 59 #include <netinet/in.h> 60 #include <netinet/tcp.h> 61 62 #include <nfs/rpcv2.h> 63 #include <nfs/nfsproto.h> 64 #include <nfs/nfs.h> 65 #include <nfs/xdr_subs.h> 66 #include <nfs/nfsm_subs.h> 67 #include <nfs/nfsmount.h> 68 #include <nfs/nfsnode.h> 69 #include <nfs/nfs_var.h> 70 71 /* External data, mostly RPC constants in XDR form. */ 72 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, 73 rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr; 74 extern u_int32_t nfs_prog; 75 extern struct nfsstats nfsstats; 76 extern int nfsv3_procid[NFS_NPROCS]; 77 extern int nfs_ticks; 78 79 extern struct pool nfsrv_descript_pl; 80 81 /* 82 * There is a congestion window for outstanding rpcs maintained per mount 83 * point. The cwnd size is adjusted in roughly the way that: 84 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of 85 * SIGCOMM '88". ACM, August 1988. 86 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout 87 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd 88 * of rpcs is in progress. 89 * (The sent count and cwnd are scaled for integer arith.) 90 * Variants of "slow start" were tried and were found to be too much of a 91 * performance hit (ave. rtt 3 times larger), 92 * I suspect due to the large rtt that nfs rpcs have. 
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
/* Retransmit backoff multipliers, indexed by nm_timeouts - 1 (see nfs_timer()). */
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };

/* RTT estimator */
/* Map each NFS procedure number to the RTO timer class used for its RTT. */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void nfs_init_rtt(struct nfsmount *);
void nfs_update_rtt(struct nfsreq *);
int nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void nfs_realign(struct mbuf **, int);
void nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);
/* Instrumentation counters for nfs_realign(). */
unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	/* Start every timer class at the default initial RTT... */
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	/* ...with no measured deviation yet. */
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so course, it can often
 * result in r_rtt == 0.  Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	/*
	 * Timer classes are 1-based; callers only invoke this for
	 * requests with R_TIMING set (non-default timer), so index >= 0.
	 */
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	/* t1 becomes the error between the sample and the smoothed mean. */
	t1 -= *srtt >> 3;
	*srtt += t1;		/* update smoothed mean with gain 0.125 */
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;		/* update smoothed deviation with gain 0.25 */
}

/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that timer est. would probably be stale.
 * Also, since many of these RPCs are non-idempotent, a conservative
 * timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		/* Infrequent RPC: use the mount's static timeout, unclamped. */
		rto = nmp->nm_timeo;
		return (rto);
	}

	/* Clamp the estimate into the legal RTO range. */
	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}



/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(nmp, rep)
	struct nfsmount *nmp;
	struct nfsreq *rep;
{
	struct socket *so;
	int s, error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *m;

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port number.
	 * We always allocate a reserved port, as this prevents filehandle
	 * disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		struct mbuf *mopt;
		int *ip;

		/* Ask IP to allocate our local port from the low range... */
		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		/* ...bind to any address, letting the kernel pick the port... */
		MGET(m, M_WAIT, MT_SONAME);
		sin = mtod(m, struct sockaddr_in *);
		sin->sin_len = m->m_len = sizeof (struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, m, &proc0);
		m_freem(m);
		if (error)
			goto bad;

		/* ...and restore the default port range afterwards. */
		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete. Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		s = splsoftnet();
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			(void) tsleep((caddr_t)&so->so_timeo, PSOCK,
			    "nfscon", 2 * hz);
			/* Still connecting: give interruptible mounts a way out. */
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){
				so->so_state &= ~SS_ISCONNECTING;
				splx(s);
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto bad;
		}
		splx(s);
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo = (5 * hz);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo = (5 * hz);
	else
		so->so_snd.sb_timeo = 0;
	/* Size the socket buffers according to socket type and transfer sizes. */
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
		}
		/* Streams carry an extra u_int32_t RPC record mark per message. */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	}
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;

	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(rep)
	struct nfsreq *rep;
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int s, error;

	nfs_disconnect(nmp);
	/* Retry until connected; only interruption gets us out. */
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		(void) tsleep((caddr_t)&lbolt, PSOCK, "nfsrecon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	s = splsoftnet();
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	splx(s);
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(nmp)
	struct nfsmount *nmp;
{
	struct socket *so;

	if (nmp->nm_so) {
		/* Clear nm_so first so nobody else uses the dying socket. */
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(so, nam, top, rep)
	struct socket *so;
	struct mbuf *nam;
	struct mbuf *top;
	struct nfsreq *rep;
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		/* Client call: bail out early if the RPC was terminated. */
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		/* No socket: flag for resend later and report success. */
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	/* Connected (or connection-oriented) sockets take no destination. */
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	if (so->so_type == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(rep, aname, mp)
	struct nfsreq *rep;
	struct mbuf **aname;
	struct mbuf **mp;
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			/* Socket gone: try to reconnect, then start over. */
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		/* Retransmit our request on the (possibly new) socket. */
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			/* First read the 32-bit RPC record mark. */
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >= rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%d/%d) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			/* Strip the high (last-fragment) bit of the record mark. */
			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%d) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			/* Now read the whole record; never give up part way. */
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%d/%d) from nfs server %s\n",
				    len - auio.uio_resid, len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msg., but must grab them
			 * and then throw them away so we know what is going
			 * on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp, &control,
				    &rcvflg, 0);
				if (control)
					m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			/* Force a reconnect and retry the receive. */
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		/* SOCK_DGRAM: a single soreceive() returns a whole reply. */
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error) {
		m_freem(*mp);
		*mp = NULL;
	}
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(myrep)
	struct nfsreq *myrep;
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl, t1;	/* t1/cp2: scratch for nfsm_* macros */
	caddr_t cp2;
	int s, error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {

			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		if (nam)
			m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		s = splsoftnet();
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				/* Feed the RTT sample to the estimator. */
				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		splx(s);
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == 0) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	struct timeval tv;
	caddr_t cp2;		/* t1/cp2: scratch for nfsm_* macros */
	int t1, i, s, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	int mrest_len;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	/* Total length of the caller-built request mbuf chain. */
	mrest_len = 0;
	m = infop->nmi_mreq;
	while (m) {
		mrest_len += m->m_len;
		m = m->m_next;
	}

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_pkthdr.len = mrest_len;

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization.
 */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	/* Only time RPCs that feed a per-class RTT estimator. */
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	s = splsoftnet();
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		splx(s);
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			/* Send a copy; r_mreq is kept for retransmits. */
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT),
			    rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		splx(s);
		/* Timer will send it; r_rtt == -1 means "not sent yet". */
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	s = splsoftnet();
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);
	splx(s);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifer type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifer type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				/* Server busy: back off and redo the RPC. */
				m_freem(info.nmi_mrep);
				error = 0;
				tv.tv_sec = time_second + trylater_delay;
				tv.tv_usec = 0;
				tsleep(&tv, PSOCK, "nfsretry", hzto(&tv));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp)
	int siz;
	struct nfsrv_descript *nd;
	struct nfssvc_sock *slp;
	int err;
	struct mbuf **mrq;
	struct mbuf **mbp;
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= max_datalen) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		/* RPC-level rejection (MSG_DENIED). */
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			/* Auth errors use one word less than the mismatch case. */
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		/* MSG_ACCEPTED; the accept status follows the verifier. */
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			/* Advertise the range of NFS versions we speak. */
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		};
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}

/*
 * nfs timer routine
 * Scan the nfsreq list and retranmit any requests that have timed out.
 */
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;	/* per-mount timer (timeout(9) arg) */
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, s, error;

	s = splsoftnet();
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		/* Skip requests already answered or already terminated. */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			/* Request is being timed; see if it has expired yet. */
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			/* Back off further for consecutive timeouts. */
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}

		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		/*
		 * Non-datagram (stream) sockets: the transport does its own
		 * retransmission, so only bump the rexmit counter here.
		 */
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		   ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		   (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
				    NULL, NULL, curproc);
			else
				error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
				    nmp->nm_nam, NULL, curproc);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	splx(s);
	/* Rearm: this timer fires every nfs_ticks per mount. */
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 * Returns EINTR if the request should be aborted, 0 otherwise.
 */
int
nfs_sigintr(nmp, rep, p)
	struct nfsmount *nmp;
	struct nfsreq *rep;
	struct proc *p;
{

	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	/*
	 * Only pending signals in NFSINT_SIGMASK that are neither blocked
	 * nor ignored interrupt the request.
	 */
	if (p && p->p_siglist &&
	    (((p->p_siglist & ~p->p_sigmask) & ~p->p_sigignore) &
	    NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(flagp, rep)
	int *flagp;
	struct nfsreq *rep;
{
	struct proc *p;
	int slpflag = 0, slptimeo = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;	/* interruptible mount */
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck",
			slptimeo);
		/*
		 * After one signal-interruptible sleep, switch to polling
		 * with a 2 second timeout instead of catching signals.
		 */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 * Wakes anyone recorded as waiting for the send lock.
 */
void
nfs_sndunlock(flagp)
	int *flagp;
{

	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

/*
 * Acquire the receive lock for the mount of the given request.
 * Returns 0 with the lock held, EINTR if interrupted, or EALREADY if the
 * reply arrived while waiting (so taking the lock is pointless).
 */
int
nfs_rcvlock(rep)
	struct nfsreq *rep;
{
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag, slptimeo = 0;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk",
			slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		/* Same signal/poll switch-over as in nfs_sndlock(). */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 * Wakes anyone recorded as waiting for the receive lock.
 */
void
nfs_rcvunlock(flagp)
	int *flagp;
{

	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup((caddr_t)flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with m_copyback().
 * m is the source mbuf being copied from, n the head of the destination
 * chain; *off (running copy offset) is advanced by the bytes consumed.
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realign happens at the chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > M_TRAILINGSPACE(n))
			panic("nfs_realign_fixup: no memory to pad to");

		/* Steal 'padding' bytes from the source into the dest tail. */
		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;

	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure these
 * parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	/*
	 * Walk the chain; stop at the first mbuf whose data pointer or
	 * length is misaligned and allocate a replacement chain head.
	 */
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
			if (ALIGN(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t));

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freem(*pm);
		*pm = n;
	}
}


/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 *
 * Returns 0 on success (possibly with nd->nd_repstat set so the caller
 * replies with an RPC-level error), or EBADRPC if the message is garbled
 * and the mbufs have been freed.
 */
int
nfs_getreq(nd, nfsd, has_header)
	struct nfsrv_descript *nd;
	struct nfsd *nfsd;
	int has_header;
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;	/* t1/cp2 are used internally by the nfsm_* macros */
	caddr_t cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	/* Reject procedure numbers out of range for the spoken version. */
	if (nd->nd_procnum >= NFS_NPROCS ||
		(nd->nd_procnum > NFSPROC_COMMIT) ||
		(!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Map v2 procedure numbers onto the v3 procedure table. */
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		/* Machine-name length; skip over the name itself. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
		nd->nd_cr.cr_ref = 1;
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		/* Copy at most NGROUPS gids; silently drop the rest. */
		for (i = 0; i < len; i++)
			if (i < NGROUPS)
				nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		nd->nd_cr.cr_ngroups = (len > NGROUPS) ? NGROUPS : len;
		/* Skip the verifier opaque body, if any. */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

/*
 * Print "nfs server not responding"-style messages to the controlling
 * terminal of the process issuing the request (or the console if none).
 */
void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(so, arg, waitflag)
	struct socket *so;
	caddr_t arg;
	int waitflag;
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;
#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ; goto dorecs;
	}
#endif
	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == M_DONTWAIT) {
			slp->ns_flag |= SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().  The huge uio_resid just means
		 * "take everything available".
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL,
		    &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		/* Append the received data to the raw (unparsed) stream. */
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		/*
		 * Datagram socket: each soreceive() yields a whole record;
		 * prepend the sender address (nam) and queue it.
		 */
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    NULL, &flags, 0);
			if (mp) {
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(slp, waitflag)
	struct nfssvc_sock *slp;
	int waitflag;
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	/* Only one thread may parse this socket's stream at a time. */
	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			/* Need the 4-byte RPC record mark first. */
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				/* Record mark straddles mbufs; gather it bytewise. */
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			/* High bit flags the last fragment of a record. */
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			/* An oversized record means the stream is hosed. */
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			/* Exact fit: the whole raw chain is the fragment. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/* Split the raw chain at the fragment boundary. */
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					/* Boundary falls inside this mbuf: copy the head. */
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					/* Boundary at an mbuf edge: detach cleanly. */
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			/* Not enough data for the fragment yet. */
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			/* Complete record: move it onto the request queue. */
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}

/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(slp, nfsd, ndp)
	struct nfssvc_sock *slp;
	struct nfsd *nfsd;
	struct nfsrv_descript **ndp;
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	/* Dequeue the first queued record. */
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	/*
	 * A leading MT_SONAME mbuf carries the sender's address (prepended
	 * by nfsrv_rcv() for datagram sockets); split it off.
	 */
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		/* nfs_getreq() already freed the request mbufs on error. */
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}


/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			/* NOTE(review): ns_sref presumably refcounts slp — verify. */
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	/* No idle nfsd: flag the socket so a running nfsd will find it. */
	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */