/*	$OpenBSD: nfs_socket.c,v 1.143 2022/08/13 21:01:46 mvs Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfs_var.h>

/* External data, mostly RPC constants in XDR form. */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
    rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

extern struct pool nfsrv_descript_pl;

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };
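
/*
 * Worked example of the scaling above (an explanatory note, not new
 * policy): with NFS_CWNDSCALE == 256, one outstanding RPC is counted as
 * 256 in nm_sent, so NFS_MAXCWND == 256 * 32 allows at most 32 RPCs in
 * flight.  Keeping cwnd in units of 1/256 RPC lets the additive-increase
 * step in nfs_reply() stay in integer arithmetic.
 */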

/* RTT estimator */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void nfs_init_rtt(struct nfsmount *);
void nfs_update_rtt(struct nfsreq *);
int nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void nfs_realign(struct mbuf **, int);
void nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);

int nfs_rcvlock(struct nfsreq *);
int nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **);
int nfs_reconnect(struct nfsreq *);
int nfs_reply(struct nfsreq *);
void nfs_msg(struct nfsreq *, char *);
void nfs_rcvunlock(int *);

int nfsrv_getstream(struct nfssvc_sock *, int);

unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}
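
/*
 * Explanatory note on the fixed-point arithmetic above: nm_srtt stores
 * 8 * SRTT, so "t1 -= *srtt >> 3; *srtt += t1" amounts to
 * SRTT += (rtt - SRTT) / 8, i.e. the 0.125 gain, and nm_sdrtt stores
 * 4 * SDRTT with the 0.25 gain applied to |rtt - SRTT|, mirroring the
 * classic TCP srtt/rttvar estimator.
 */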

/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that timer est. would probably be stale.
 * Also, since many of these RPCs are non-idempotent, a conservative
 * timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}
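
/*
 * Explanatory note on the shifts above: with the scaling used by
 * nfs_update_rtt() (nm_srtt holds 8 * SRTT, nm_sdrtt holds 4 * SDRTT),
 * the read/write branch works out to roughly SRTT + 4 * SDRTT, while the
 * getattr/lookup branch computes nm_srtt / 4 + nm_sdrtt / 2; the "+3",
 * "+7" and "+1" terms just round up before shifting.
 */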

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *nam = NULL, *mopt = NULL;

	if (!(nmp->nm_sotype == SOCK_DGRAM || nmp->nm_sotype == SOCK_STREAM))
		return (EINVAL);

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error) {
		nfs_disconnect(nmp);
		return (error);
	}

	/* Allocate mbufs possibly waiting before grabbing the socket lock. */
	if (nmp->nm_sotype == SOCK_STREAM || saddr->sa_family == AF_INET)
		MGET(mopt, M_WAIT, MT_SOOPTS);
	if (saddr->sa_family == AF_INET)
		MGET(nam, M_WAIT, MT_SONAME);

	so = nmp->nm_so;
	solock(so);
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port
	 * number. We always allocate a reserved port, as this prevents
	 * filehandle disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		int *ip;

		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		sin = mtod(nam, struct sockaddr_in *);
		memset(sin, 0, sizeof(*sin));
		sin->sin_len = nam->m_len = sizeof(struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, nam, &proc0);
		if (error)
			goto bad;

		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete. Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			sosleep_nsec(so, &so->so_timeo, PSOCK, "nfscon",
			    SEC_TO_NSEC(2));
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto bad;
		}
	}

	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo_nsecs = SEC_TO_NSEC(5);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo_nsecs = SEC_TO_NSEC(5);
	else
		so->so_snd.sb_timeo_nsecs = INFSLP;
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_STREAM) {
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, mopt);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, mopt);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof(u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof(u_int32_t)) * 2;
	} else {
		panic("%s: nm_sotype %d", __func__, nmp->nm_sotype);
	}
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;
	sounlock(so);

	m_freem(mopt);
	m_freem(nam);

	/* Initialize other non-zero congestion variables. */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	/* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	sounlock(so);

	m_freem(mopt);
	m_freem(nam);

	nfs_disconnect(nmp);
	return (error);
}
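
/*
 * Illustrative arithmetic (not from the original code): for a stream
 * mount, sndreserve above is (nm_wsize + NFS_MAXPKTHDR + 4) * 2, i.e.
 * socket buffer space for two maximum-sized requests plus their 4-byte
 * record marks; the doubling is presumably so a second request can be
 * queued while the first is still draining.
 */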

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		tsleep_nsec(&nowake, PSOCK, "nfsrecon", SEC_TO_NSEC(1));
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so, 0);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 * small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive().
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t)&len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >=
					    rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO, "short receive (%zu/%u) from "
				    "nfs server %s\n", len - auio.uio_resid,
				    len, rep->r_nmp->nm_mountp->
				    mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msg., but must grab them
			 * and then throw them away so we know what is going
			 * on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp, &control,
				    &rcvflg, 0);
				m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freemp(mp);
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error)
		m_freemp(mp);
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}
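
/*
 * Record mark refresher (explanatory note): on stream transports each
 * RPC record is preceded by a 4-byte big-endian word whose high bit
 * flags the final fragment and whose low 31 bits give the fragment
 * length, which is why the code above strips the flag with
 * "len = ntohl(len) & ~0x80000000".
 */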

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl, t1;
	caddr_t cp2;
	int error;

	/*
	 * Loop around until we get our own reply.
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket.
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {
			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply.
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply.
		 * Iff no match, just drop the datagram.
		 */
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == 0) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}
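
/*
 * Why the increase above is one rpc/rtt (explanatory note): with cwnd in
 * units of 1/NFS_CWNDSCALE RPC, each reply adds NFS_CWNDSCALE^2 / cwnd,
 * so a full window of cwnd / NFS_CWNDSCALE replies grows cwnd by about
 * NFS_CWNDSCALE, i.e. one RPC per round trip; the "(nm_cwnd >> 1)" term
 * only rounds the integer division.
 */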

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	caddr_t cp2;
	int t1, i, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_len = 0;
	m_calchdrlen(rep->r_mreq);

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT), rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifier type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(info.nmi_mrep);
				error = 0;
				tsleep_nsec(&nowake, PSOCK, "nfsretry",
				    SEC_TO_NSEC(trylater_delay));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header.
 * The siz arg. is used to decide if adding a cluster is worthwhile.
 */
int
nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
    int err, struct mbuf **mrq, struct mbuf **mbp)
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= MHLEN - max_hdr) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}
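
/*
 * Layout note for the header built above (explanatory): the six words
 * are xid, message type (reply), accepted/denied status, verifier
 * flavor, verifier length and accept status, following the RPC reply
 * format of RFC 5531; the auth-error variant is one word shorter
 * because a denied reply carries no verifier.
 */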

/*
 * nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out.
 */
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, error;

	NET_LOCK();
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}

		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(so, &so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))) {
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = pru_send(so, m, NULL, NULL);
			else
				error = pru_send(so, m, nmp->nm_nam, NULL);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	NET_UNLOCK();
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}
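
/*
 * Backoff illustration (explanatory note): after the first timeout the
 * estimated RTO is scaled by nfs_backoff[0] == 2, after the second by 4,
 * and so on up to 256, so retransmit intervals grow exponentially until
 * either the server answers (nm_timeouts is reset in nfs_reply()) or
 * r_rexmit reaches nm_retry and the request is soft-terminated.
 */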

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && (SIGPENDING(p) & ~p->p_p->ps_sigacts->ps_sigignore &
	    NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	struct proc *p;
	int slpflag = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(int *flagp)
{
	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

int
nfs_rcvlock(struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_rcvunlock(int *flagp)
{
	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup(flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with
 * m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realign happens at the chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > m_trailingspace(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;

	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure these
 * parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freemp(pm);
		*pm = n;
	}
}
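
/*
 * Example of the problem nfs_realign() avoids (explanatory note): if a
 * reply lands in an mbuf whose m_data is at an odd offset, the XDR
 * dissect macros would read u_int32_t fields through a misaligned
 * pointer, which faults on strict-alignment platforms; copying into a
 * freshly allocated, aligned chain first makes the 4-byte reads safe.
 */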

/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;
	caddr_t cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		memset(&nd->nd_cr, 0, sizeof(struct ucred));
		refcnt_init(&nd->nd_cr.cr_refcnt);
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		for (i = 0; i < len; i++) {
			if (i < NGROUPS_MAX)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		}
		nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ?
		    NGROUPS_MAX : len;
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	KERNEL_LOCK();

	if ((slp->ns_flag & SLP_VALID) == 0)
		goto out;

	/* Defer soreceive() to an nfsd. */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}

	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL, &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp, NULL,
			    &flags, 0);
			if (mp) {
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);

out:
	KERNEL_UNLOCK();
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), &recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 <
				    ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
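
/*
 * State summary for the reassembly above (explanatory note): ns_raw and
 * ns_rawend hold unparsed stream bytes and ns_cc counts them; ns_reclen
 * is the length of the fragment currently being extracted; ns_frag
 * accumulates the fragments of one record; completed records are queued
 * on ns_rec/ns_recend for nfsrv_dorec() to hand to an nfsd.
 */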

/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */