/*	$OpenBSD: nfs_socket.c,v 1.106 2014/11/14 23:01:44 tedu Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

/* External data, mostly RPC constants in XDR form. */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
	rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

extern struct pool nfsrv_descript_pl;

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };
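
/*
 * Note on the scaling above: nm_sent and nm_cwnd count outstanding rpcs
 * in units of NFS_CWNDSCALE, so one rpc "slot" is 256 and the window can
 * hold at most 32 rpcs.  Keeping the window in fixed point lets the
 * additive increase done in nfs_reply() grow the window by a fraction of
 * a slot per reply without any floating point arithmetic.
 */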

/* RTT estimator */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void nfs_init_rtt(struct nfsmount *);
void nfs_update_rtt(struct nfsreq *);
int  nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void nfs_realign(struct mbuf **, int);
void nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);
unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}
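
/*
 * The updates above are the usual Jacobson fixed-point filters: nm_srtt
 * holds roughly 8 times the smoothed RTT (gain 1/8 via the >> 3) and
 * nm_sdrtt roughly 4 times the smoothed mean deviation (gain 1/4 via the
 * >> 2), so the whole estimator runs in integer arithmetic.
 */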

/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that the timer estimate would probably be
 * stale.  Also, since many of these RPCs are non-idempotent, a
 * conservative timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}
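
/*
 * With nm_srtt scaled by 8 and nm_sdrtt by 4 (see nfs_update_rtt()
 * above), the shifts here unscale the estimates; the read/write case,
 * for instance, works out to roughly A + 4D as the table in the comment
 * says.  The result is clamped to [NFS_MINRTO, NFS_MAXRTO] so a noisy
 * estimate cannot produce an absurd retransmit timer.
 */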

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int s, error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *m;

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port
	 * number.  We always allocate a reserved port, as this prevents
	 * filehandle disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		struct mbuf *mopt;
		int *ip;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		MGET(m, M_WAIT, MT_SONAME);
		sin = mtod(m, struct sockaddr_in *);
		sin->sin_len = m->m_len = sizeof (struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, m, &proc0);
		m_freem(m);
		if (error)
			goto bad;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than
	 * NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete. Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long
		 * time.
		 */
		s = splsoftnet();
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			(void) tsleep((caddr_t)&so->so_timeo, PSOCK,
			    "nfscon", 2 * hz);
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep,
			    rep->r_procp)) != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				splx(s);
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto bad;
		}
		splx(s);
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo = (5 * hz);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo = (5 * hz);
	else
		so->so_snd.sb_timeo = 0;
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	}
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;

	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	/* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}
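
/*
 * Note: the IP_PORTRANGE dance above temporarily switches the socket to
 * the low (reserved, < 1024) port range, binds a wildcard port so the
 * kernel picks a reserved one, and then restores the default range so
 * nothing else about the socket is affected.
 */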

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int s, error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		(void) tsleep((caddr_t)&lbolt, PSOCK, "nfsrecon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	s = splsoftnet();
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	splx(s);
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	if (so->so_type == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}
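
/*
 * For SOCK_STREAM transports, each RPC is framed with a Sun RPC record
 * mark: a 32-bit big-endian word whose low 31 bits give the fragment
 * length and whose high bit marks the last fragment of a record.
 * nfs_receive() below reads this word first to learn how many bytes
 * make up the reply.
 */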

#ifdef NFSCLIENT
/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 * small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >=
					    rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%u) from nfs server %s\n",
				    len - auio.uio_resid, len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msg., but must grab them
			 * and then throw them away so we know what is going
			 * on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp,
				    &control, &rcvflg, 0);
				if (control)
					m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error) {
		m_freem(*mp);
		*mp = NULL;
	}
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}
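
/*
 * Replies are matched to outstanding requests purely by RPC xid, so
 * whichever process wins the receive lock may pick up replies belonging
 * to other processes' requests; those are filled in on the matching
 * nfsreq, and the owner finds its r_mrep already set when it next gets
 * the lock.
 */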

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl, t1;
	caddr_t cp2;
	int s, error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {
			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		if (nam)
			m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		s = splsoftnet();
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		splx(s);
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == 0) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}
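
/*
 * The increase above adds roughly NFS_CWNDSCALE*NFS_CWNDSCALE/cwnd per
 * reply (the cwnd/2 term just rounds), so a full window's worth of
 * replies grows the window by about one rpc slot: classic additive
 * increase.  The matching multiplicative decrease (cwnd >>= 1) happens
 * on retransmit in nfs_timer().
 */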

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	struct timeval tv;
	caddr_t cp2;
	int t1, i, s, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	int mrest_len;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	mrest_len = 0;
	m = infop->nmi_mreq;
	while (m) {
		mrest_len += m->m_len;
		m = m->m_next;
	}

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_pkthdr.len = mrest_len;

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	s = splsoftnet();
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		splx(s);
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT), rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		splx(s);
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	s = splsoftnet();
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);
	splx(s);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifier type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(info.nmi_mrep);
				error = 0;
				tv.tv_sec = time_second + trylater_delay;
				tv.tv_usec = 0;
				tsleep(&tv, PSOCK, "nfsretry", hzto(&tv));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */
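
/*
 * nfs_request()'s NFSERR_TRYLATER handling sleeps for trylater_delay
 * seconds and then retries the whole RPC, multiplying the delay by
 * NFS_TIMEOUTMUL each round up to NFS_MAXTIMEO, so a server that keeps
 * answering "try later" is polled at an exponentially decreasing rate.
 */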

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
    int err, struct mbuf **mrq, struct mbuf **mbp)
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= MHLEN - max_hdr) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}
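
/*
 * One retransmit timeout is armed per mount (nm_rtimeout): nfs_request()
 * starts it when the request queue goes non-empty, nfs_timer() rearms
 * itself every nfs_ticks while it runs, and nfs_request() tears it down
 * again when the last outstanding request is removed.
 */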

/*
 * nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out.
 */
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, s, error;

	s = splsoftnet();
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}

		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))) {
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = (*so->so_proto->pr_usrreq)(so,
				    PRU_SEND, m, NULL, NULL, curproc);
			else
				error = (*so->so_proto->pr_usrreq)(so,
				    PRU_SEND, m, nmp->nm_nam, NULL, curproc);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	splx(s);
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}
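
/*
 * Per-mount backoff: each consecutive timeout bumps nm_timeouts, and the
 * next deadline is the estimated RTO multiplied by the matching
 * nfs_backoff[] entry (2, 4, ... 256), so retransmit intervals grow
 * exponentially while a server stays silent; any reply in nfs_reply()
 * resets nm_timeouts to 0.
 */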

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && p->p_siglist &&
	    (((p->p_siglist & ~p->p_sigmask) &
	    ~p->p_p->ps_sigacts->ps_sigignore) & NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	struct proc *p;
	int slpflag = 0, slptimeo = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1),
		    "nfsndlck", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the send socket for others.
 */
void
nfs_sndunlock(int *flagp)
{
	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

int
nfs_rcvlock(struct nfsreq *rep)
{
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag, slptimeo = 0;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1),
		    "nfsrcvlk", slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the receive socket for others.
 */
void
nfs_rcvunlock(int *flagp)
{
	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup((caddr_t)flagp);
	}
}
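
/*
 * These locks are just flag bits in nm_flag guarded by tsleep()/wakeup():
 * a waiter sets the WANT bit and sleeps on the flag word, and the holder
 * wakes all sleepers when it clears the LOCK bit.  Interruptible mounts
 * sleep with PCATCH once, then fall back to polling every 2 seconds so a
 * missed wakeup cannot hang the process forever.
 */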

/*
 * Auxiliary routine to align the length of mbuf copies made with
 * m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realign happens at the chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > M_TRAILINGSPACE(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;
	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure these
 * parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freem(*pm);
		*pm = n;
	}
}
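
/*
 * nfs_realign_test counts every chain inspected and nfs_realign_count
 * those that actually had to be copied, so the ratio shows how often
 * the network driver hands up misaligned chains.  The copy is only
 * needed because the XDR parsers dereference 32-bit words in place.
 */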

/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;
	caddr_t cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		memset(&nd->nd_cr, 0, sizeof (struct ucred));
		nd->nd_cr.cr_ref = 1;
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		for (i = 0; i < len; i++)
			if (i < NGROUPS)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		nd->nd_cr.cr_ngroups = (len > NGROUPS) ? NGROUPS : len;
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}
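
/*
 * The AUTH_UNIX body parsed above is machine name, uid, gid, then up to
 * RPCAUTH_UNIXGIDS supplementary gids; anything beyond NGROUPS is read
 * and discarded so the credential can never overflow cr_groups[].  Any
 * other flavor is bounced with AUTH_REJECTCRED.
 */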

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;
#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}
#endif
	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == M_DONTWAIT) {
			slp->ns_flag |= SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL, &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    NULL, &flags, 0);
			if (mp) {
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 <
				    ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0,
					    slp->ns_reclen - len, waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data +=
						    slp->ns_reclen - len;
						m->m_len -=
						    slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) ==
				    slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
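
/*
 * The stream state machine above keeps three chains per socket: ns_raw
 * holds unparsed bytes straight off TCP, ns_frag collects the fragments
 * of the record being assembled, and complete records are appended to
 * the ns_rec list (linked via m_nextpkt) for the nfsds to dequeue in
 * nfsrv_dorec() below.
 */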

/*
 * Dequeue the next record off a server socket and parse its RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */