/*	$OpenBSD: nfs_socket.c,v 1.151 2024/07/12 17:20:18 mvs Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfs_var.h>
#include <nfs/nfsm_subs.h>

/* External data, mostly RPC constants in XDR form. */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
	rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern const int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

extern struct pool nfsrv_descript_pl;

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point.  The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
 * SIGCOMM '88".  ACM, August 1988.
 * describes for TCP.  The cwnd size is chopped in half on a retransmit
 * timeout and incremented by 1/cwnd when each rpc reply is received and
 * a full cwnd of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
static const int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };
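/*
 * A worked example of the scaling: with NFS_CWNDSCALE 256, a window of
 * one outstanding RPC is stored as nm_cwnd == 256 and NFS_MAXCWND
 * corresponds to 32 RPCs.  The additive increase in nfs_reply() adds
 * roughly NFS_CWNDSCALE * NFS_CWNDSCALE / nm_cwnd per reply, i.e. about
 * one scaled RPC slot per round trip of a full window.
 */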
/* RTT estimator */
static const enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void	nfs_init_rtt(struct nfsmount *);
void	nfs_update_rtt(struct nfsreq *);
int	nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void	nfs_realign(struct mbuf **, int);
void	nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);

int	nfs_rcvlock(struct nfsreq *);
int	nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **);
int	nfs_reconnect(struct nfsreq *);
int	nfs_reply(struct nfsreq *);
void	nfs_msg(struct nfsreq *, char *);
void	nfs_rcvunlock(int *);

int	nfsrv_getstream(struct nfssvc_sock *, int);

unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0.  Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}
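/*
 * Note that the shifts above keep nm_srtt[] scaled up by a factor of 8
 * and nm_sdrtt[] by a factor of 4, in the same fixed-point style as the
 * TCP srtt/rttvar code; nfs_estimate_rto() below shifts them back down
 * (with rounding) when it computes a timeout.
 */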
/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that timer est. would probably be stale.
 * Also, since many of these RPCs are non-idempotent, a conservative
 * timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *nam = NULL, *mopt = NULL;

	if (!(nmp->nm_sotype == SOCK_DGRAM || nmp->nm_sotype == SOCK_STREAM))
		return (EINVAL);

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error) {
		nfs_disconnect(nmp);
		return (error);
	}

	/* Allocate mbufs possibly waiting before grabbing the socket lock. */
	if (nmp->nm_sotype == SOCK_STREAM || saddr->sa_family == AF_INET)
		MGET(mopt, M_WAIT, MT_SOOPTS);
	if (saddr->sa_family == AF_INET)
		MGET(nam, M_WAIT, MT_SONAME);

	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port
	 * number.  We always allocate a reserved port, as this prevents
	 * filehandle disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		int *ip;

		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		sin = mtod(nam, struct sockaddr_in *);
		memset(sin, 0, sizeof(*sin));
		sin->sin_len = nam->m_len = sizeof(struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		solock(so);
		error = sobind(so, nam, &proc0);
		sounlock(so);
		if (error)
			goto bad;

		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than
	 * NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		solock(so);
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad_locked;

		/*
		 * Wait for the connection to complete.  Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long
		 * time.
		 */
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			sosleep_nsec(so, &so->so_timeo, PSOCK, "nfscon",
			    SEC_TO_NSEC(2));
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){
				so->so_state &= ~SS_ISCONNECTING;
				goto bad_locked;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto bad_locked;
		}
		sounlock(so);
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_timeo_nsecs = SEC_TO_NSEC(5);
	mtx_leave(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo_nsecs = SEC_TO_NSEC(5);
	else
		so->so_snd.sb_timeo_nsecs = INFSLP;
	mtx_leave(&so->so_snd.sb_mtx);
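	/*
	 * Reserve socket buffer space for the largest RPC in each
	 * direction, plus RPC and packet headers; stream sockets also
	 * leave room for the 32-bit record mark on every message.
	 */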
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_STREAM) {
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, mopt);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			*mtod(mopt, int32_t *) = 1;
			mopt->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, mopt);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	} else {
		panic("%s: nm_sotype %d", __func__, nmp->nm_sotype);
	}
	solock(so);
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad_locked;
	mtx_enter(&so->so_rcv.sb_mtx);
	so->so_rcv.sb_flags |= SB_NOINTR;
	mtx_leave(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	so->so_snd.sb_flags |= SB_NOINTR;
	mtx_leave(&so->so_snd.sb_mtx);
	sounlock(so);

	m_freem(mopt);
	m_freem(nam);

	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad_locked:
	sounlock(so);
bad:
	m_freem(mopt);
	m_freem(nam);

	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		tsleep_nsec(&nowake, PSOCK, "nfsrecon", SEC_TO_NSEC(1));
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	return (0);
}
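/*
 * The R_MUSTRESEND requests marked above are retransmitted from the
 * tryagain loop in nfs_receive() once the new socket is up.
 */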
/*
 * NFS disconnect.  Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so, 0);
	}
}

/*
 * This is the nfs send routine.  For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
/*
 * Receive a Sun RPC Request/Reply.  For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed.  NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >=
					    rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
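			/*
			 * Read the record body itself.  MSG_WAITALL keeps
			 * soreceive() from returning short, so just retry
			 * around signals and would-blocks.
			 */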
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO, "short receive (%zu/%u) from "
				    "nfs server %s\n", len - auio.uio_resid,
				    len, rep->r_nmp->nm_mountp->
				    mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msg., but must grab them
			 * and then throw them away so we know what is going
			 * on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp, &control,
				    &rcvflg, 0);
				m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freemp(mp);
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error)
		m_freemp(mp);
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}
/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl;
	int error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {
			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		info.nmi_errorp = &error;
		tl = (uint32_t *)nfsm_dissect(&info, 2 * NFSX_UNSIGNED);
		if (tl == NULL)
			goto nfsmout;
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == NULL) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}
/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	int i, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_len = 0;
	m_calchdrlen(rep->r_mreq);

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests.  Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it.  If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT), rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	info.nmi_errorp = &error;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * break down the rpc header and check if ok
	 */
	tl = (uint32_t *)nfsm_dissect(&info, 3 * NFSX_UNSIGNED);
	if (tl == NULL)
		goto nfsmout;
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifier type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0) {
		/* Should not happen */
		if (nfsm_adv(&info, nfsm_rndup(i)) != 0)
			goto nfsmout;
	}

	tl = (uint32_t *)nfsm_dissect(&info, NFSX_UNSIGNED);
	if (tl == NULL)
		goto nfsmout;
	/* 0 == ok */
	if (*tl == 0) {
		tl = (uint32_t *)nfsm_dissect(&info, NFSX_UNSIGNED);
		if (tl == NULL)
			goto nfsmout;
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(info.nmi_mrep);
				error = 0;
				tsleep_nsec(&nowake, PSOCK, "nfsretry",
				    SEC_TO_NSEC(trylater_delay));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
    int err, struct mbuf **mrq, struct mbuf **mbp)
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= MHLEN - max_hdr) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}

/*
 * nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out.
 */
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, error;

	NET_LOCK();
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}
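		/*
		 * The request has now timed out.  For stream sockets only
		 * the retry counters are advanced (resends happen via
		 * reconnect); datagram sockets retransmit below, subject
		 * to the congestion window.
		 */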
		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(so, &so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = pru_send(so, m, NULL, NULL);
			else
				error = pru_send(so, m, nmp->nm_nam, NULL);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	NET_UNLOCK();
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && (SIGPENDING(p) & ~p->p_p->ps_sigacts->ps_sigignore &
	    NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	struct proc *p;
	int slpflag = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsndlck", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}
/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(int *flagp)
{
	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

int
nfs_rcvlock(struct nfsreq *rep)
{
	uint64_t slptimeo = INFSLP;
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		tsleep_nsec(flagp, slpflag | (PZERO - 1), "nfsrcvlk", slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = SEC_TO_NSEC(2);
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_rcvunlock(int *flagp)
{
	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup(flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with
 * m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realign happens at the chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain.  Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > m_trailingspace(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;

	} while (!ALIGNED_POINTER(n->m_len, void *));
}
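/*
 * nfs_realign() below is applied to every request and reply taken off
 * a socket (see nfs_receive() and nfsrv_dorec()) because the XDR
 * dissection code casts m_data to u_int32_t *.
 */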
/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses.  This function makes sure
 * these parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freemp(pm);
		*pm = n;
	}
}

/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	info.nmi_errorp = &error;
	if (has_header) {
		tl = (uint32_t *)nfsm_dissect(&info, 10 * NFSX_UNSIGNED);
		if (tl == NULL)
			goto nfsmout;
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else {
		tl = (uint32_t *)nfsm_dissect(&info, 8 * NFSX_UNSIGNED);
		if (tl == NULL)
			goto nfsmout;
	}
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (nfsm_adv(&info, nfsm_rndup(len)) != 0)
			goto nfsmout;
		tl = (uint32_t *)nfsm_dissect(&info, 3 * NFSX_UNSIGNED);
		if (tl == NULL)
			goto nfsmout;
		memset(&nd->nd_cr, 0, sizeof (struct ucred));
		refcnt_init(&nd->nd_cr.cr_refcnt);
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		tl = (uint32_t *)
		    nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED);
		if (tl == NULL)
			goto nfsmout;
		for (i = 0; i < len; i++) {
			if (i < NGROUPS_MAX)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		}
		nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ?
		    NGROUPS_MAX : len;
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0) {
			if (nfsm_adv(&info, nfsm_rndup(len)) != 0)
				goto nfsmout;
		}
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	KERNEL_LOCK();

	if ((slp->ns_flag & SLP_VALID) == 0)
		goto out;

	/* Defer soreceive() to an nfsd. */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}

	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, NULL, &auio, &mp, NULL, &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp, NULL,
			    &flags, 0);
			if (mp) {
				m = nam;
				m->m_next = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);

out:
	KERNEL_UNLOCK();
}
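/*
 * An RPC record on a stream socket arrives as one or more fragments,
 * each preceded by a 32-bit record mark: the low 31 bits give the
 * fragment length and the high bit flags the last fragment of the
 * record (RFC 1831).  nfsrv_getstream() peels these marks off ns_raw
 * and assembles complete records onto ns_rec.
 */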
/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket.  The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), &recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */