1 /* $OpenBSD: nfs_socket.c,v 1.137 2021/01/02 02:41:42 cheloha Exp $ */ 2 /* $NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $ */ 3 4 /* 5 * Copyright (c) 1989, 1991, 1993, 1995 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Rick Macklem at The University of Guelph. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 36 */ 37 38 /* 39 * Socket operations for use by nfs 40 */ 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/proc.h> 45 #include <sys/mount.h> 46 #include <sys/kernel.h> 47 #include <sys/mbuf.h> 48 #include <sys/vnode.h> 49 #include <sys/domain.h> 50 #include <sys/protosw.h> 51 #include <sys/signalvar.h> 52 #include <sys/socket.h> 53 #include <sys/socketvar.h> 54 #include <sys/syslog.h> 55 #include <sys/tprintf.h> 56 #include <sys/namei.h> 57 #include <sys/pool.h> 58 #include <sys/queue.h> 59 60 #include <netinet/in.h> 61 #include <netinet/tcp.h> 62 63 #include <nfs/rpcv2.h> 64 #include <nfs/nfsproto.h> 65 #include <nfs/nfs.h> 66 #include <nfs/xdr_subs.h> 67 #include <nfs/nfsm_subs.h> 68 #include <nfs/nfsmount.h> 69 #include <nfs/nfs_var.h> 70 71 /* External data, mostly RPC constants in XDR form. */ 72 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, 73 rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr; 74 extern u_int32_t nfs_prog; 75 extern struct nfsstats nfsstats; 76 extern int nfsv3_procid[NFS_NPROCS]; 77 extern int nfs_ticks; 78 79 extern struct pool nfsrv_descript_pl; 80 81 /* 82 * There is a congestion window for outstanding rpcs maintained per mount 83 * point. The cwnd size is adjusted in roughly the way that: 84 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of 85 * SIGCOMM '88". ACM, August 1988. 86 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout 87 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd 88 * of rpcs is in progress. 89 * (The sent count and cwnd are scaled for integer arith.) 90 * Variants of "slow start" were tried and were found to be too much of a 91 * performance hit (ave. rtt 3 times larger), 92 * I suspect due to the large rtt that nfs rpcs have. 
 */
#define	NFS_CWNDSCALE	256			/* fixed-point scale for cwnd/sent */
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)	/* cap: 32 outstanding rpcs */
/* Retransmit backoff multipliers, indexed by nm_timeouts - 1 in nfs_timer(). */
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };

/* RTT estimator: per-procedure timer class used to pick srtt/sdrtt slots. */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void nfs_init_rtt(struct nfsmount *);
void nfs_update_rtt(struct nfsreq *);
int nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void nfs_realign(struct mbuf **, int);
void nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);

int nfs_rcvlock(struct nfsreq *);
int nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **);
int nfs_reconnect(struct nfsreq *);
int nfs_reply(struct nfsreq *);
void nfs_msg(struct nfsreq *, char *);
void nfs_rcvunlock(int *);

int nfsrv_getstream(struct nfssvc_sock *, int);

/* nfs_realign() statistics: mbuf chains inspected / chains actually copied. */
unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point.
*/ 145 void 146 nfs_init_rtt(struct nfsmount *nmp) 147 { 148 int i; 149 150 for (i = 0; i < NFS_MAX_TIMER; i++) 151 nmp->nm_srtt[i] = NFS_INITRTT; 152 for (i = 0; i < NFS_MAX_TIMER; i++) 153 nmp->nm_sdrtt[i] = 0; 154 } 155 156 /* 157 * Update a mount point's RTT estimator state using data from the 158 * passed-in request. 159 * 160 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation. 161 * 162 * NB: Since the timer resolution of NFS_HZ is so course, it can often 163 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is 164 * between N + dt and N + 2 - dt ticks, add 1 before calculating the 165 * update values. 166 */ 167 void 168 nfs_update_rtt(struct nfsreq *rep) 169 { 170 int t1 = rep->r_rtt + 1; 171 int index = nfs_ptimers[rep->r_procnum] - 1; 172 int *srtt = &rep->r_nmp->nm_srtt[index]; 173 int *sdrtt = &rep->r_nmp->nm_sdrtt[index]; 174 175 t1 -= *srtt >> 3; 176 *srtt += t1; 177 if (t1 < 0) 178 t1 = -t1; 179 t1 -= *sdrtt >> 2; 180 *sdrtt += t1; 181 } 182 183 /* 184 * Estimate RTO for an NFS RPC sent via an unreliable datagram. 185 * 186 * Use the mean and mean deviation of RTT for the appropriate type 187 * of RPC for the frequent RPCs and a default for the others. 188 * The justification for doing "other" this way is that these RPCs 189 * happen so infrequently that timer est. would probably be stale. 190 * Also, since many of these RPCs are non-idempotent, a conservative 191 * timeout is desired. 
 *
 *	getattr, lookup - A+2D
 *	read, write - A+4D
 *	other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		/* A + 2D, with rounding, in scaled ticks. */
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		/* A + 4D: bulk transfers get a more conservative timeout. */
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		/* Infrequent RPCs: use the mount's static timeout, unclamped. */
		rto = nmp->nm_timeo;
		return (rto);
	}

	/* Clamp the estimate into the legal RTO range. */
	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}



/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int s, error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *nam = NULL, *mopt = NULL;

	/* Only datagram and stream transports are supported. */
	if (!(nmp->nm_sotype == SOCK_DGRAM || nmp->nm_sotype == SOCK_STREAM))
		return (EINVAL);

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error) {
		nfs_disconnect(nmp);
		return (error);
	}

	/* Allocate mbufs possibly waiting before grabbing the socket lock. */
	if (nmp->nm_sotype == SOCK_STREAM || saddr->sa_family == AF_INET)
		MGET(mopt, M_WAIT, MT_SOOPTS);
	if (saddr->sa_family == AF_INET)
		MGET(nam, M_WAIT, MT_SONAME);

	so = nmp->nm_so;
	s = solock(so);
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port number.
	 * We always allocate a reserved port, as this prevents filehandle
	 * disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		int *ip;

		/* Temporarily request a low (reserved) port for the bind. */
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		/* Bind to INADDR_ANY/port 0 so the kernel picks the port. */
		sin = mtod(nam, struct sockaddr_in *);
		memset(sin, 0, sizeof(*sin));
		sin->sin_len = nam->m_len = sizeof(struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, nam, &proc0);
		if (error)
			goto bad;

		/* Restore the default port range for any later operations. */
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete. Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			sosleep_nsec(so, &so->so_timeo, PSOCK, "nfscon",
			    SEC_TO_NSEC(2));
			/* Bail out if a signal terminates the request. */
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){
				so->so_state &= ~SS_ISCONNECTING;
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto bad;
		}
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo_nsecs = SEC_TO_NSEC(5);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo_nsecs = SEC_TO_NSEC(5);
	else
		so->so_snd.sb_timeo_nsecs = INFSLP;
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_STREAM) {
		/* Keepalive so a dead server is eventually noticed. */
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, mopt);
		}
		/* NODELAY: rpc requests should not sit in the Nagle queue. */
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, mopt);
		}
		/* Extra u_int32_t accounts for the RPC record mark. */
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	} else {
		panic("%s: nm_sotype %d", __func__, nmp->nm_sotype);
	}
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;
	sounlock(so, s);

	m_freem(mopt);
	m_freem(nam);

	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	sounlock(so, s);

	m_freem(mopt);
	m_freem(nam);

	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	/* Retry forever unless interrupted; pace attempts at 1 second. */
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		tsleep_nsec(&nowake, PSOCK, "nfsrecon", SEC_TO_NSEC(1));
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		/* Clear nm_so first so other paths see the socket as gone. */
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so, 0);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
453 */ 454 int 455 nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top, 456 struct nfsreq *rep) 457 { 458 struct mbuf *sendnam; 459 int error, soflags, flags; 460 461 if (rep) { 462 if (rep->r_flags & R_SOFTTERM) { 463 m_freem(top); 464 return (EINTR); 465 } 466 if ((so = rep->r_nmp->nm_so) == NULL) { 467 rep->r_flags |= R_MUSTRESEND; 468 m_freem(top); 469 return (0); 470 } 471 rep->r_flags &= ~R_MUSTRESEND; 472 soflags = rep->r_nmp->nm_soflags; 473 } else 474 soflags = so->so_proto->pr_flags; 475 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) 476 sendnam = NULL; 477 else 478 sendnam = nam; 479 flags = 0; 480 481 error = sosend(so, sendnam, NULL, top, NULL, flags); 482 if (error) { 483 if (rep) { 484 /* 485 * Deal with errors for the client side. 486 */ 487 if (rep->r_flags & R_SOFTTERM) 488 error = EINTR; 489 else 490 rep->r_flags |= R_MUSTRESEND; 491 } 492 493 /* 494 * Handle any recoverable (soft) socket errors here. (???) 495 */ 496 if (error != EINTR && error != ERESTART && 497 error != EWOULDBLOCK && error != EPIPE) 498 error = 0; 499 } 500 return (error); 501 } 502 503 #ifdef NFSCLIENT 504 /* 505 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all 506 * done by soreceive(), but for SOCK_STREAM we must deal with the Record 507 * Mark and consolidate the data into a new mbuf list. 508 * nb: Sometimes TCP passes the data up to soreceive() in long lists of 509 * small mbufs. 510 * For SOCK_STREAM we must be very careful to read an entire record once 511 * we have read any of it, even if the system call has been interrupted. 
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			/* Socket gone; rebuild it before receiving. */
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		/* Resend a copy of the request while the send lock is held. */
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			/* First read the 4-byte RPC record mark. */
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >=
					    rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			/* Strip the last-fragment bit from the record mark. */
			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			/* Now read the whole record into *mp. */
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO, "short receive (%zu/%u) from "
				    "nfs server %s\n", len - auio.uio_resid,
				    len, rep->r_nmp->nm_mountp->
				    mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msg., but must grab them
			 * and then throw them away so we know what is going
			 * on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp, &control,
				    &rcvflg, 0);
				m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		/* Unrecoverable receive error: force a reconnect and retry. */
		if (error && error != EINTR && error != ERESTART) {
			m_freemp(mp);
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		/* SOCK_DGRAM: a single soreceive() gets a whole datagram. */
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error)
		m_freemp(mp);
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
731 */ 732 int 733 nfs_reply(struct nfsreq *myrep) 734 { 735 struct nfsreq *rep; 736 struct nfsmount *nmp = myrep->r_nmp; 737 struct nfsm_info info; 738 struct mbuf *nam; 739 u_int32_t rxid, *tl, t1; 740 caddr_t cp2; 741 int error; 742 743 /* 744 * Loop around until we get our own reply 745 */ 746 for (;;) { 747 /* 748 * Lock against other receivers so that I don't get stuck in 749 * sbwait() after someone else has received my reply for me. 750 * Also necessary for connection based protocols to avoid 751 * race conditions during a reconnect. 752 */ 753 error = nfs_rcvlock(myrep); 754 if (error) 755 return (error == EALREADY ? 0 : error); 756 757 /* 758 * Get the next Rpc reply off the socket 759 */ 760 error = nfs_receive(myrep, &nam, &info.nmi_mrep); 761 nfs_rcvunlock(&nmp->nm_flag); 762 if (error) { 763 764 /* 765 * Ignore routing errors on connectionless protocols?? 766 */ 767 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { 768 if (nmp->nm_so) 769 nmp->nm_so->so_error = 0; 770 continue; 771 } 772 return (error); 773 } 774 m_freem(nam); 775 776 /* 777 * Get the xid and check that it is an rpc reply 778 */ 779 info.nmi_md = info.nmi_mrep; 780 info.nmi_dpos = mtod(info.nmi_md, caddr_t); 781 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); 782 rxid = *tl++; 783 if (*tl != rpc_reply) { 784 nfsstats.rpcinvalid++; 785 m_freem(info.nmi_mrep); 786 nfsmout: 787 continue; 788 } 789 790 /* 791 * Loop through the request list to match up the reply 792 * Iff no match, just drop the datagram 793 */ 794 TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) { 795 if (rep->r_mrep == NULL && rxid == rep->r_xid) { 796 /* Found it.. */ 797 rep->r_mrep = info.nmi_mrep; 798 rep->r_md = info.nmi_md; 799 rep->r_dpos = info.nmi_dpos; 800 801 /* 802 * Update congestion window. 803 * Do the additive increase of 804 * one rpc/rtt. 
805 */ 806 if (nmp->nm_cwnd <= nmp->nm_sent) { 807 nmp->nm_cwnd += 808 (NFS_CWNDSCALE * NFS_CWNDSCALE + 809 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; 810 if (nmp->nm_cwnd > NFS_MAXCWND) 811 nmp->nm_cwnd = NFS_MAXCWND; 812 } 813 rep->r_flags &= ~R_SENT; 814 nmp->nm_sent -= NFS_CWNDSCALE; 815 816 if (rep->r_flags & R_TIMING) 817 nfs_update_rtt(rep); 818 819 nmp->nm_timeouts = 0; 820 break; 821 } 822 } 823 /* 824 * If not matched to a request, drop it. 825 * If it's mine, get out. 826 */ 827 if (rep == 0) { 828 nfsstats.rpcunexpected++; 829 m_freem(info.nmi_mrep); 830 } else if (rep == myrep) { 831 if (rep->r_mrep == NULL) 832 panic("nfsreply nil"); 833 return (0); 834 } 835 } 836 } 837 838 /* 839 * nfs_request - goes something like this 840 * - fill in request struct 841 * - links it into list 842 * - calls nfs_send() for first transmit 843 * - calls nfs_receive() to get reply 844 * - break down rpc header and return with nfs reply pointed to 845 * by mrep or error 846 * nb: always frees up mreq mbuf list 847 */ 848 int 849 nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop) 850 { 851 struct mbuf *m; 852 u_int32_t *tl; 853 struct nfsmount *nmp; 854 caddr_t cp2; 855 int t1, i, error = 0; 856 int trylater_delay; 857 struct nfsreq *rep; 858 struct nfsm_info info; 859 860 rep = pool_get(&nfsreqpl, PR_WAITOK); 861 rep->r_nmp = VFSTONFS(vp->v_mount); 862 rep->r_vp = vp; 863 rep->r_procp = infop->nmi_procp; 864 rep->r_procnum = procnum; 865 866 /* empty mbuf for AUTH_UNIX header */ 867 rep->r_mreq = m_gethdr(M_WAIT, MT_DATA); 868 rep->r_mreq->m_next = infop->nmi_mreq; 869 rep->r_mreq->m_len = 0; 870 m_calchdrlen(rep->r_mreq); 871 872 trylater_delay = NFS_MINTIMEO; 873 874 nmp = rep->r_nmp; 875 876 /* Get the RPC header with authorization. */ 877 nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX); 878 m = rep->r_mreq; 879 880 /* 881 * For stream protocols, insert a Sun RPC Record Mark. 
882 */ 883 if (nmp->nm_sotype == SOCK_STREAM) { 884 M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); 885 *mtod(m, u_int32_t *) = htonl(0x80000000 | 886 (m->m_pkthdr.len - NFSX_UNSIGNED)); 887 } 888 889 tryagain: 890 rep->r_rtt = rep->r_rexmit = 0; 891 if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER) 892 rep->r_flags = R_TIMING; 893 else 894 rep->r_flags = 0; 895 rep->r_mrep = NULL; 896 897 /* 898 * Do the client side RPC. 899 */ 900 nfsstats.rpcrequests++; 901 /* 902 * Chain request into list of outstanding requests. Be sure 903 * to put it LAST so timer finds oldest requests first. 904 */ 905 if (TAILQ_EMPTY(&nmp->nm_reqsq)) 906 timeout_add(&nmp->nm_rtimeout, nfs_ticks); 907 TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain); 908 909 /* 910 * If backing off another request or avoiding congestion, don't 911 * send this one now but let timer do it. If not timing a request, 912 * do it now. 913 */ 914 if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || 915 (nmp->nm_flag & NFSMNT_DUMBTIMR) || 916 nmp->nm_sent < nmp->nm_cwnd)) { 917 if (nmp->nm_soflags & PR_CONNREQUIRED) 918 error = nfs_sndlock(&nmp->nm_flag, rep); 919 if (!error) { 920 error = nfs_send(nmp->nm_so, nmp->nm_nam, 921 m_copym(m, 0, M_COPYALL, M_WAIT), rep); 922 if (nmp->nm_soflags & PR_CONNREQUIRED) 923 nfs_sndunlock(&nmp->nm_flag); 924 } 925 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) { 926 nmp->nm_sent += NFS_CWNDSCALE; 927 rep->r_flags |= R_SENT; 928 } 929 } else { 930 rep->r_rtt = -1; 931 } 932 933 /* 934 * Wait for the reply from our send or the timer's. 935 */ 936 if (!error || error == EPIPE) 937 error = nfs_reply(rep); 938 939 /* 940 * RPC done, unlink the request. 941 */ 942 TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain); 943 if (TAILQ_EMPTY(&nmp->nm_reqsq)) 944 timeout_del(&nmp->nm_rtimeout); 945 946 /* 947 * Decrement the outstanding request count. 
948 */ 949 if (rep->r_flags & R_SENT) { 950 rep->r_flags &= ~R_SENT; /* paranoia */ 951 nmp->nm_sent -= NFS_CWNDSCALE; 952 } 953 954 /* 955 * If there was a successful reply and a tprintf msg. 956 * tprintf a response. 957 */ 958 if (!error && (rep->r_flags & R_TPRINTFMSG)) 959 nfs_msg(rep, "is alive again"); 960 info.nmi_mrep = rep->r_mrep; 961 info.nmi_md = rep->r_md; 962 info.nmi_dpos = rep->r_dpos; 963 if (error) { 964 infop->nmi_mrep = NULL; 965 goto nfsmout1; 966 } 967 968 /* 969 * break down the rpc header and check if ok 970 */ 971 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); 972 if (*tl++ == rpc_msgdenied) { 973 if (*tl == rpc_mismatch) 974 error = EOPNOTSUPP; 975 else 976 error = EACCES; /* Should be EAUTH. */ 977 infop->nmi_mrep = NULL; 978 goto nfsmout1; 979 } 980 981 /* 982 * Since we only support RPCAUTH_UNIX atm we step over the 983 * reply verifer type, and in the (error) case that there really 984 * is any data in it, we advance over it. 985 */ 986 tl++; /* Step over verifer type */ 987 i = fxdr_unsigned(int32_t, *tl); 988 if (i > 0) 989 nfsm_adv(nfsm_rndup(i)); /* Should not happen */ 990 991 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); 992 /* 0 == ok */ 993 if (*tl == 0) { 994 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); 995 if (*tl != 0) { 996 error = fxdr_unsigned(int, *tl); 997 if ((nmp->nm_flag & NFSMNT_NFSV3) && 998 error == NFSERR_TRYLATER) { 999 m_freem(info.nmi_mrep); 1000 error = 0; 1001 tsleep_nsec(&nowake, PSOCK, "nfsretry", 1002 SEC_TO_NSEC(trylater_delay)); 1003 trylater_delay *= NFS_TIMEOUTMUL; 1004 if (trylater_delay > NFS_MAXTIMEO) 1005 trylater_delay = NFS_MAXTIMEO; 1006 1007 goto tryagain; 1008 } 1009 1010 /* 1011 * If the File Handle was stale, invalidate the 1012 * lookup cache, just in case. 
1013 */ 1014 if (error == ESTALE) 1015 cache_purge(rep->r_vp); 1016 } 1017 goto nfsmout; 1018 } 1019 1020 error = EPROTONOSUPPORT; 1021 1022 nfsmout: 1023 infop->nmi_mrep = info.nmi_mrep; 1024 infop->nmi_md = info.nmi_md; 1025 infop->nmi_dpos = info.nmi_dpos; 1026 nfsmout1: 1027 m_freem(rep->r_mreq); 1028 pool_put(&nfsreqpl, rep); 1029 return (error); 1030 } 1031 #endif /* NFSCLIENT */ 1032 1033 /* 1034 * Generate the rpc reply header 1035 * siz arg. is used to decide if adding a cluster is worthwhile 1036 */ 1037 int 1038 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, 1039 int err, struct mbuf **mrq, struct mbuf **mbp) 1040 { 1041 u_int32_t *tl; 1042 struct mbuf *mreq; 1043 struct mbuf *mb; 1044 1045 MGETHDR(mreq, M_WAIT, MT_DATA); 1046 mb = mreq; 1047 /* 1048 * If this is a big reply, use a cluster else 1049 * try and leave leading space for the lower level headers. 1050 */ 1051 siz += RPC_REPLYSIZ; 1052 if (siz >= MHLEN - max_hdr) { 1053 MCLGET(mreq, M_WAIT); 1054 } else 1055 mreq->m_data += max_hdr; 1056 tl = mtod(mreq, u_int32_t *); 1057 mreq->m_len = 6 * NFSX_UNSIGNED; 1058 *tl++ = txdr_unsigned(nd->nd_retxid); 1059 *tl++ = rpc_reply; 1060 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) { 1061 *tl++ = rpc_msgdenied; 1062 if (err & NFSERR_AUTHERR) { 1063 *tl++ = rpc_autherr; 1064 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR); 1065 mreq->m_len -= NFSX_UNSIGNED; 1066 } else { 1067 *tl++ = rpc_mismatch; 1068 *tl++ = txdr_unsigned(RPC_VER2); 1069 *tl = txdr_unsigned(RPC_VER2); 1070 } 1071 } else { 1072 *tl++ = rpc_msgaccepted; 1073 1074 /* AUTH_UNIX requires RPCAUTH_NULL. 
*/ 1075 *tl++ = 0; 1076 *tl++ = 0; 1077 1078 switch (err) { 1079 case EPROGUNAVAIL: 1080 *tl = txdr_unsigned(RPC_PROGUNAVAIL); 1081 break; 1082 case EPROGMISMATCH: 1083 *tl = txdr_unsigned(RPC_PROGMISMATCH); 1084 tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED); 1085 *tl++ = txdr_unsigned(NFS_VER2); 1086 *tl = txdr_unsigned(NFS_VER3); 1087 break; 1088 case EPROCUNAVAIL: 1089 *tl = txdr_unsigned(RPC_PROCUNAVAIL); 1090 break; 1091 case EBADRPC: 1092 *tl = txdr_unsigned(RPC_GARBAGE); 1093 break; 1094 default: 1095 *tl = 0; 1096 if (err != NFSERR_RETVOID) { 1097 tl = nfsm_build(&mb, NFSX_UNSIGNED); 1098 if (err) 1099 *tl = txdr_unsigned(nfsrv_errmap(nd, err)); 1100 else 1101 *tl = 0; 1102 } 1103 break; 1104 }; 1105 } 1106 1107 *mrq = mreq; 1108 if (mbp != NULL) 1109 *mbp = mb; 1110 if (err != 0 && err != NFSERR_RETVOID) 1111 nfsstats.srvrpc_errs++; 1112 return (0); 1113 } 1114 1115 /* 1116 * nfs timer routine 1117 * Scan the nfsreq list and retranmit any requests that have timed out. 1118 */ 1119 void 1120 nfs_timer(void *arg) 1121 { 1122 struct nfsmount *nmp = arg; 1123 struct nfsreq *rep; 1124 struct mbuf *m; 1125 struct socket *so; 1126 int timeo, error; 1127 1128 NET_LOCK(); 1129 TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) { 1130 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) 1131 continue; 1132 if (nfs_sigintr(nmp, rep, rep->r_procp)) { 1133 rep->r_flags |= R_SOFTTERM; 1134 continue; 1135 } 1136 if (rep->r_rtt >= 0) { 1137 rep->r_rtt++; 1138 if (nmp->nm_flag & NFSMNT_DUMBTIMR) 1139 timeo = nmp->nm_timeo; 1140 else 1141 timeo = nfs_estimate_rto(nmp, rep->r_procnum); 1142 if (nmp->nm_timeouts > 0) 1143 timeo *= nfs_backoff[nmp->nm_timeouts - 1]; 1144 if (rep->r_rtt <= timeo) 1145 continue; 1146 if (nmp->nm_timeouts < nitems(nfs_backoff)) 1147 nmp->nm_timeouts++; 1148 } 1149 1150 /* Check for server not responding. 
*/ 1151 if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) { 1152 nfs_msg(rep, "not responding"); 1153 rep->r_flags |= R_TPRINTFMSG; 1154 } 1155 if (rep->r_rexmit >= nmp->nm_retry) { /* too many */ 1156 nfsstats.rpctimeouts++; 1157 rep->r_flags |= R_SOFTTERM; 1158 continue; 1159 } 1160 if (nmp->nm_sotype != SOCK_DGRAM) { 1161 if (++rep->r_rexmit > NFS_MAXREXMIT) 1162 rep->r_rexmit = NFS_MAXREXMIT; 1163 continue; 1164 } 1165 1166 if ((so = nmp->nm_so) == NULL) 1167 continue; 1168 1169 /* 1170 * If there is enough space and the window allows.. 1171 * Resend it 1172 * Set r_rtt to -1 in case we fail to send it now. 1173 */ 1174 rep->r_rtt = -1; 1175 if (sbspace(so, &so->so_snd) >= rep->r_mreq->m_pkthdr.len && 1176 ((nmp->nm_flag & NFSMNT_DUMBTIMR) || 1177 (rep->r_flags & R_SENT) || 1178 nmp->nm_sent < nmp->nm_cwnd) && 1179 (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){ 1180 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) 1181 error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, 1182 m, NULL, NULL, curproc); 1183 else 1184 error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, 1185 m, nmp->nm_nam, NULL, curproc); 1186 if (error) { 1187 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) 1188 so->so_error = 0; 1189 } else { 1190 /* 1191 * Iff first send, start timing 1192 * else turn timing off, backoff timer 1193 * and divide congestion window by 2. 1194 */ 1195 if (rep->r_flags & R_SENT) { 1196 rep->r_flags &= ~R_TIMING; 1197 if (++rep->r_rexmit > NFS_MAXREXMIT) 1198 rep->r_rexmit = NFS_MAXREXMIT; 1199 nmp->nm_cwnd >>= 1; 1200 if (nmp->nm_cwnd < NFS_CWNDSCALE) 1201 nmp->nm_cwnd = NFS_CWNDSCALE; 1202 nfsstats.rpcretries++; 1203 } else { 1204 rep->r_flags |= R_SENT; 1205 nmp->nm_sent += NFS_CWNDSCALE; 1206 } 1207 rep->r_rtt = 0; 1208 } 1209 } 1210 } 1211 NET_UNLOCK(); 1212 timeout_add(&nmp->nm_rtimeout, nfs_ticks); 1213 } 1214 1215 /* 1216 * Test for a termination condition pending on the process. 1217 * This is used for NFSMNT_INT mounts. 
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{

	/* A soft-terminated request always reports EINTR. */
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	/* Non-interruptible mounts never abort on signals. */
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	/* Only signals in NFSINT_SIGMASK that are not ignored count. */
	if (p && (SIGPENDING(p) & ~p->p_p->ps_sigacts->ps_sigignore &
	    NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 *
 * Returns 0 with NFSMNT_SNDLOCK held, or EINTR if interrupted while
 * waiting (NFSMNT_INT mounts only).
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	struct proc *p;
	int slpflag = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		/*
		 * After the first catchable sleep, fall back to plain
		 * 2-second polling so a missed wakeup cannot hang us.
		 */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(int *flagp)
{

	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	/* Wake waiters that set NFSMNT_WANTSND in nfs_sndlock(). */
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

/*
 * Acquire the receive-side lock for a mount.
 * Returns 0 with NFSMNT_RCVLOCK held, EINTR on signal, or EALREADY if
 * the awaited reply arrived while sleeping (no lock taken then).
 */
int
nfs_rcvlock(struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we where sleeping.
			 */
			return (EALREADY);
		}
		/* Same catchable-once-then-poll pattern as nfs_sndlock(). */
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_rcvunlock(int *flagp)
{

	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup(flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with m_copyback().
 *
 * m: source mbuf whose successors provide the padding bytes.
 * n: head of the destination chain; padding goes into its last mbuf.
 * off: running copy offset, advanced by the number of bytes consumed.
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realign happens at the chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		/* Take just enough bytes to round n->m_len up to alignment. */
		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > m_trailingspace(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;

	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure these
 * parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	/* Walk the chain until the first misaligned mbuf (if any). */
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
/*
 * NOTE(review): as written this mask only clears one bit; a round-up to
 * sizeof(void *) would be (((n) + sizeof(void *) - 1) & ~(sizeof(void *) - 1)).
 * It only biases the cluster-vs-mbuf heuristic below, m_copyback() still
 * grows the chain as needed — confirm against upstream before changing.
 */
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		/* Free the misaligned tail and splice in the rebuilt chain. */
		m_freemp(pm);
		*pm = n;
	}
}


/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;	/* used internally by the nfsm_* dissect macros */
	caddr_t cp2;	/* likewise macro scratch */
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	/* With a header present: xid + "call" word precede the common part. */
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	/*
	 * Protocol-level failures below return 0 with nd_repstat set so the
	 * caller sends an RPC error reply; only malformed requests (EBADRPC)
	 * are dropped outright.
	 */
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	/* Range-check the procedure number (v2 has fewer procedures). */
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	/* Map v2 procedure numbers onto the v3 handler table. */
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		/* machine-name length; skip the name itself below */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		memset(&nd->nd_cr, 0, sizeof (struct ucred));
		nd->nd_cr.cr_ref = 1;
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		/* Copy at most NGROUPS_MAX gids; skip over any excess. */
		for (i = 0; i < len; i++) {
			if (i < NGROUPS_MAX)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		}
		nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ? NGROUPS_MAX : len;
		/* Skip the verifier body (opaque, length-checked only). */
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		/* Any other flavor is rejected at the RPC auth level. */
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	/* Reached via the nfsm_* macros when dissection runs off the data. */
	return (error);
}

/*
 * Print a message about an unresponsive NFS server to the controlling
 * terminal of the requesting process (or the console when there is none).
 */
void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	/* Defer soreceive() to an nfsd. */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}

	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * Do soreceive().
		 * uio_resid is a huge cap so one call drains the socket.
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL,
		    &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		/* Append to the raw byte stream; received length is the
		 * difference between the cap and the leftover resid. */
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			/* EPERM = oversized record: drop the connection. */
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		/* Datagram socket: each soreceive() yields one record. */
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    NULL, &flags, 0);
			if (mp) {
				/* Prepend the sender address to the record. */
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	/* SLP_GETSTREAM serializes parsers on this socket. */
	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			/* Need a full 4-byte RPC record mark first. */
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), &recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				/* Mark straddles mbufs: gather it bytewise. */
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			/* High bit flags the last fragment of a record. */
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				/* EPERM tells the caller to disconnect. */
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			/* Exact fit: the whole raw chain is the fragment. */
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			/* Split the raw chain at the fragment boundary. */
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					/* Boundary inside this mbuf: copy
					 * the head, advance the original. */
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen-len;
						m->m_len -= slp->ns_reclen-len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					/* Boundary at an mbuf edge: detach. */
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			/* Fragment not fully arrived yet. */
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			/* Record complete: move it onto the request queue. */
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}

/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	/* Nothing to do on an invalid socket or an empty record queue. */
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	/* Dequeue the first record. */
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	/* Datagram records carry the sender address as the lead mbuf. */
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	/* Align the data before the XDR parser walks it. */
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		/* nfs_getreq() already freed the record on error. */
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}


/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			/* Hand the socket reference to the woken nfsd. */
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	/* No idle nfsd: flag the work for the next one that frees up. */
	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */