/*	$OpenBSD: nfs_socket.c,v 1.146 2024/03/22 07:15:04 claudio Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfs_var.h>

/* External data, mostly RPC constants in XDR form. */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
    rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

extern struct pool nfsrv_descript_pl;

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point.  The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, "Congestion Avoidance and Control", in "Proceedings of
 * SIGCOMM '88", ACM, August 1988,
 * describes for TCP.  The cwnd size is chopped in half on a retransmit
 * timeout and incremented by 1/cwnd when each rpc reply is received and
 * a full cwnd of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arithmetic.)
 * Variants of "slow start" were tried and found to be too much of a
 * performance hit (average RTT about 3 times larger), I suspect due to
 * the large RTT that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };

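/*
 * Illustrative sketch (excluded from compilation): how the scaled
 * congestion window arithmetic described above behaves.  Storing cwnd
 * and the sent count in NFS_CWNDSCALE fixed-point units lets the
 * "1/cwnd per reply" additive increase be done in integer arithmetic:
 * NFS_CWNDSCALE*NFS_CWNDSCALE/cwnd is the scaled form of 1/cwnd, and
 * the "+ (cwnd >> 1)" term rounds the division.  The function name is
 * hypothetical; the real updates live in nfs_reply() and nfs_timer().
 */
#if 0
static int
nfs_cwnd_model(int cwnd, int sent, int reply_received)
{
	if (reply_received) {
		/* Additive increase, only while a full window is in flight. */
		if (cwnd <= sent) {
			cwnd += (NFS_CWNDSCALE * NFS_CWNDSCALE +
			    (cwnd >> 1)) / cwnd;
			if (cwnd > NFS_MAXCWND)
				cwnd = NFS_MAXCWND;
		}
	} else {
		/* Retransmit timeout: chop the window in half. */
		cwnd >>= 1;
		if (cwnd < NFS_CWNDSCALE)
			cwnd = NFS_CWNDSCALE;
	}
	return (cwnd);
}
#endif
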
/* RTT estimator */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void	nfs_init_rtt(struct nfsmount *);
void	nfs_update_rtt(struct nfsreq *);
int	nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void	nfs_realign(struct mbuf **, int);
void	nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);

int	nfs_rcvlock(struct nfsreq *);
int	nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **);
int	nfs_reconnect(struct nfsreq *);
int	nfs_reply(struct nfsreq *);
void	nfs_msg(struct nfsreq *, char *);
void	nfs_rcvunlock(int *);

int	nfsrv_getstream(struct nfssvc_sock *, int);

unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0.  Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}

/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that the timer estimate would probably be
 * stale.  Also, since many of these RPCs are non-idempotent, a
 * conservative timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}

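/*
 * Illustrative sketch (excluded from compilation): a worked example of
 * the estimator above, with hypothetical values.  nfs_update_rtt() keeps
 * nm_srtt at roughly 8 times the smoothed RTT (in ticks, including the
 * +1 fudge) and nm_sdrtt at roughly 4 times the smoothed deviation, so
 * the shifts in nfs_estimate_rto() rescale them back into tick units.
 */
#if 0
static void
nfs_rto_example(void)
{
	int srtt = 48;	/* ~6 ticks mean RTT, stored scaled by 8 */
	int sdrtt = 8;	/* ~2 ticks deviation, stored scaled by 4 */
	int rto;

	/* READ/WRITE timer: ((48 + 7) >> 3) + (8 + 1) = 6 + 9 = 15 ticks. */
	rto = ((srtt + 7) >> 3) + (sdrtt + 1);

	/* Clamp exactly as nfs_estimate_rto() does. */
	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;
	(void)rto;
}
#endif
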
/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr on error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *nam = NULL, *mopt = NULL;

	if (!(nmp->nm_sotype == SOCK_DGRAM || nmp->nm_sotype == SOCK_STREAM))
		return (EINVAL);

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error) {
		nfs_disconnect(nmp);
		return (error);
	}

	/* Allocate mbufs possibly waiting before grabbing the socket lock. */
	if (nmp->nm_sotype == SOCK_STREAM || saddr->sa_family == AF_INET)
		MGET(mopt, M_WAIT, MT_SOOPTS);
	if (saddr->sa_family == AF_INET)
		MGET(nam, M_WAIT, MT_SONAME);

	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port
	 * number.  We always allocate a reserved port, as this prevents
	 * filehandle disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		int *ip;

		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		sin = mtod(nam, struct sockaddr_in *);
		memset(sin, 0, sizeof(*sin));
		sin->sin_len = nam->m_len = sizeof(struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		solock(so);
		error = sobind(so, nam, &proc0);
		sounlock(so);
		if (error)
			goto bad;

		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than
	 * NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		solock(so);
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad_locked;

		/*
		 * Wait for the connection to complete.  Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long
		 * time.
		 */
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			sosleep_nsec(so, &so->so_timeo, PSOCK, "nfscon",
			    SEC_TO_NSEC(2));
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){
				so->so_state &= ~SS_ISCONNECTING;
				goto bad_locked;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto bad_locked;
		}
		sounlock(so);
	}

	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_timeo_nsecs = SEC_TO_NSEC(5);
	mtx_leave(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo_nsecs = SEC_TO_NSEC(5);
	else
		so->so_snd.sb_timeo_nsecs = INFSLP;
	mtx_leave(&so->so_snd.sb_mtx);
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_STREAM) {
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, mopt);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, mopt);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	} else {
		panic("%s: nm_sotype %d", __func__, nmp->nm_sotype);
	}
	solock(so);
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad_locked;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;
	sounlock(so);

	m_freem(mopt);
	m_freem(nam);

	/* Initialize other non-zero congestion variables. */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	/* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad_locked:
	sounlock(so);
bad:
	m_freem(mopt);
	m_freem(nam);

	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		tsleep_nsec(&nowake, PSOCK, "nfsrecon", SEC_TO_NSEC(1));
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	return (0);
}

/*
 * NFS disconnect.  Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so, 0);
	}
}

/*
 * This is the nfs send routine.  For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
/*
 * Receive a Sun RPC Request/Reply.  For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive().
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */

		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed.  NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * Looks like the server died after
					 * it received the request; make
					 * sure that we will retransmit and
					 * that we don't get stuck here
					 * forever.
					 */
					if (rep->r_rexmit >=
					    rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO, "short receive (%zu/%u) from "
				    "nfs server %s\n", len - auio.uio_resid,
				    len, rep->r_nmp->nm_mountp->
				    mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msgs, but must grab
			 * them and then throw them away so we know what is
			 * going on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp, &control,
				    &rcvflg, 0);
				m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freemp(mp);
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error)
		m_freemp(mp);
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl, t1;
	caddr_t cp2;
	int error;

	/*
	 * Loop around until we get our own reply.
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket.
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {
			/*
			 * Ignore routing errors on connectionless
			 * protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply.
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply.
		 * If no match, just drop the datagram.
		 */
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == NULL) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}

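/*
 * Illustrative sketch (excluded from compilation): the Sun RPC record
 * mark framing used over TCP, which nfs_request() below prepends and
 * nfsrv_getstream() later in this file parses.  The mark is a 4-byte
 * big-endian word whose high bit flags the last fragment and whose low
 * 31 bits carry the fragment length.  Helper names are hypothetical.
 */
#if 0
static u_int32_t
rpc_recmark_encode(u_int32_t fraglen, int lastfrag)
{
	return (htonl((lastfrag ? 0x80000000 : 0) | fraglen));
}

static void
rpc_recmark_decode(u_int32_t wire, u_int32_t *fraglen, int *lastfrag)
{
	u_int32_t mark = ntohl(wire);

	*lastfrag = (mark & 0x80000000) != 0;
	*fraglen = mark & ~0x80000000;
}
#endif
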
/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	caddr_t cp2;
	int t1, i, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_len = 0;
	m_calchdrlen(rep->r_mreq);

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests.  Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it.  If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT), rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg,
	 * tprintf a response.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Break down the rpc header and check if ok.
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifier type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(info.nmi_mrep);
				error = 0;
				tsleep_nsec(&nowake, PSOCK, "nfsretry",
				    SEC_TO_NSEC(trylater_delay));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header.
 * The siz arg. is used to decide if adding a cluster is worthwhile.
 */
int
nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
    int err, struct mbuf **mrq, struct mbuf **mbp)
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster; otherwise try to leave
	 * leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= MHLEN - max_hdr) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}

/*
 * nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out.
 */
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, error;

	NET_LOCK();
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}

		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows,
		 * resend it.
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(so, &so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = pru_send(so, m, NULL, NULL);
			else
				error = pru_send(so, m, nmp->nm_nam, NULL);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * If this is the first send, start timing;
				 * otherwise turn timing off, back off the
				 * timer, and divide the congestion window
				 * by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	NET_UNLOCK();
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && (SIGPENDING(p) & ~p->p_p->ps_sigacts->ps_sigignore &
	    NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	struct proc *p;
	int slpflag = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(int *flagp)
{
	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

int
nfs_rcvlock(struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been
			 * received while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_rcvunlock(int *flagp)
{
	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup(flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with
 * m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in an mbuf
	 * is always an aligned quantity, so realignment happens at the
	 * chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain.  Loop until the
	 * destination chain is aligned, or the end of the source is
	 * reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > m_trailingspace(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;
	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses.  This function makes sure
 * these parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix
			 * up the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freemp(pm);
		*pm = n;
	}
}

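/*
 * Illustrative sketch (excluded from compilation): the invariant
 * nfs_realign() establishes.  On a typical 64-bit machine,
 * ALIGNED_POINTER(x, void *) requires x to be a multiple of the pointer
 * size, so a 3-byte mbuf in the middle of a chain would leave every
 * subsequent dissection pointer misaligned.  The function name below is
 * hypothetical; it just restates the check nfs_realign() performs on
 * each mbuf.
 */
#if 0
static int
nfs_chain_is_aligned(struct mbuf *m)
{
	for (; m != NULL; m = m->m_next)
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *))
			return (0);
	return (1);
}
#endif
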
/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;
	caddr_t cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		memset(&nd->nd_cr, 0, sizeof (struct ucred));
		refcnt_init(&nd->nd_cr.cr_refcnt);
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		for (i = 0; i < len; i++) {
			if (i < NGROUPS_MAX)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		}
		nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ?
		    NGROUPS_MAX : len;
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

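/*
 * Illustrative sketch (excluded from compilation): the AUTH_UNIX
 * credential layout that nfs_getreq() above walks with fxdr_unsigned()
 * and nfsm_adv().  All fields are 32-bit XDR words; strings and arrays
 * are padded to 4-byte multiples, and up to RPCAUTH_UNIXGIDS gids
 * follow the count.  This struct is only a picture of the fixed-width
 * prefix, not something the code uses.
 */
#if 0
struct rpc_authunix_prefix {
	u_int32_t aup_stamp;	/* arbitrary stamp, skipped by nfs_getreq() */
	u_int32_t aup_namelen;	/* machine name length; padded name follows */
	/* ... machine name bytes, rounded up with nfsm_rndup() ... */
	u_int32_t aup_uid;	/* -> nd_cr.cr_uid */
	u_int32_t aup_gid;	/* -> nd_cr.cr_gid */
	u_int32_t aup_ngroups;	/* gid count; the gids follow */
};
#endif
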
void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	KERNEL_LOCK();

	if ((slp->ns_flag & SLP_VALID) == 0)
		goto out;

	/* Defer soreceive() to an nfsd. */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}

	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL,
		    &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    NULL, &flags, 0);
			if (mp) {
				struct sockaddr_in *sin;

				if (nam == NULL) {
					nfsstats.srv_errs++;
					m_freem(mp);
					continue;
				}
				if (in_nam2sin(nam, &sin) != 0 ||
				    ntohs(sin->sin_port) >= IPPORT_RESERVED) {
					nfsstats.srv_errs++;
					m_freem(nam);
					m_freem(mp);
					continue;
				}
				m = nam;
				m->m_next = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);

out:
	KERNEL_UNLOCK();
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket.  The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), &recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}

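/*
 * A picture of the three stream-reassembly queues manipulated above (a
 * reading aid, not code): ns_raw accumulates raw TCP bytes (ns_cc of
 * them), complete fragments move to ns_frag, and once the last-fragment
 * bit from the record mark is seen the whole record is linked onto
 * ns_rec/ns_recend (via m_nextpkt) for an nfsd to pick up through
 * nfsrv_dorec() below.
 *
 *	so_rcv -> ns_raw ... ns_rawend	(byte stream, ns_cc bytes)
 *	       -> ns_frag		(fragments of the current record)
 *	       -> ns_rec ... ns_recend	(complete records, m_nextpkt links)
 */
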
/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */