1 /* $OpenBSD: nfs_socket.c,v 1.125 2017/08/14 16:56:57 tedu Exp $ */ 2 /* $NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $ */ 3 4 /* 5 * Copyright (c) 1989, 1991, 1993, 1995 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Rick Macklem at The University of Guelph. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95 36 */ 37 38 /* 39 * Socket operations for use by nfs 40 */ 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/proc.h> 45 #include <sys/mount.h> 46 #include <sys/kernel.h> 47 #include <sys/mbuf.h> 48 #include <sys/vnode.h> 49 #include <sys/domain.h> 50 #include <sys/protosw.h> 51 #include <sys/signalvar.h> 52 #include <sys/socket.h> 53 #include <sys/socketvar.h> 54 #include <sys/syslog.h> 55 #include <sys/tprintf.h> 56 #include <sys/namei.h> 57 #include <sys/pool.h> 58 #include <sys/queue.h> 59 60 #include <netinet/in.h> 61 #include <netinet/tcp.h> 62 63 #include <nfs/rpcv2.h> 64 #include <nfs/nfsproto.h> 65 #include <nfs/nfs.h> 66 #include <nfs/xdr_subs.h> 67 #include <nfs/nfsm_subs.h> 68 #include <nfs/nfsmount.h> 69 #include <nfs/nfs_var.h> 70 71 /* External data, mostly RPC constants in XDR form. */ 72 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, 73 rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr; 74 extern u_int32_t nfs_prog; 75 extern struct nfsstats nfsstats; 76 extern int nfsv3_procid[NFS_NPROCS]; 77 extern int nfs_ticks; 78 79 extern struct pool nfsrv_descript_pl; 80 81 /* 82 * There is a congestion window for outstanding rpcs maintained per mount 83 * point. The cwnd size is adjusted in roughly the way that: 84 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of 85 * SIGCOMM '88". ACM, August 1988. 86 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout 87 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd 88 * of rpcs is in progress. 89 * (The sent count and cwnd are scaled for integer arith.) 90 * Variants of "slow start" were tried and were found to be too much of a 91 * performance hit (ave. rtt 3 times larger), 92 * I suspect due to the large rtt that nfs rpcs have. 93 */ 94 #define NFS_CWNDSCALE 256 95 #define NFS_MAXCWND (NFS_CWNDSCALE * 32) 96 int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 }; 97 98 /* RTT estimator */ 99 enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = { 100 NFS_DEFAULT_TIMER, /* NULL */ 101 NFS_GETATTR_TIMER, /* GETATTR */ 102 NFS_DEFAULT_TIMER, /* SETATTR */ 103 NFS_LOOKUP_TIMER, /* LOOKUP */ 104 NFS_GETATTR_TIMER, /* ACCESS */ 105 NFS_READ_TIMER, /* READLINK */ 106 NFS_READ_TIMER, /* READ */ 107 NFS_WRITE_TIMER, /* WRITE */ 108 NFS_DEFAULT_TIMER, /* CREATE */ 109 NFS_DEFAULT_TIMER, /* MKDIR */ 110 NFS_DEFAULT_TIMER, /* SYMLINK */ 111 NFS_DEFAULT_TIMER, /* MKNOD */ 112 NFS_DEFAULT_TIMER, /* REMOVE */ 113 NFS_DEFAULT_TIMER, /* RMDIR */ 114 NFS_DEFAULT_TIMER, /* RENAME */ 115 NFS_DEFAULT_TIMER, /* LINK */ 116 NFS_READ_TIMER, /* READDIR */ 117 NFS_READ_TIMER, /* READDIRPLUS */ 118 NFS_DEFAULT_TIMER, /* FSSTAT */ 119 NFS_DEFAULT_TIMER, /* FSINFO */ 120 NFS_DEFAULT_TIMER, /* PATHCONF */ 121 NFS_DEFAULT_TIMER, /* COMMIT */ 122 NFS_DEFAULT_TIMER, /* NOOP */ 123 }; 124 125 void nfs_init_rtt(struct nfsmount *); 126 void nfs_update_rtt(struct nfsreq *); 127 int nfs_estimate_rto(struct nfsmount *, u_int32_t procnum); 128 129 void nfs_realign(struct mbuf **, int); 130 void nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *); 131 132 int nfs_rcvlock(struct nfsreq *); 133 int nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **); 134 int nfs_reconnect(struct nfsreq *); 135 int nfs_reply(struct nfsreq *); 136 void nfs_msg(struct nfsreq *, char *); 137 void nfs_rcvunlock(int *); 138 139 int nfsrv_getstream(struct nfssvc_sock *, int); 140 141 unsigned int nfs_realign_test = 0; 142 unsigned int nfs_realign_count = 0; 143 144 /* Initialize the RTT estimator state for a new mount point. */ 145 void 146 nfs_init_rtt(struct nfsmount *nmp) 147 { 148 int i; 149 150 for (i = 0; i < NFS_MAX_TIMER; i++) 151 nmp->nm_srtt[i] = NFS_INITRTT; 152 for (i = 0; i < NFS_MAX_TIMER; i++) 153 nmp->nm_sdrtt[i] = 0; 154 } 155 156 /* 157 * Update a mount point's RTT estimator state using data from the 158 * passed-in request. 159 * 160 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation. 161 * 162 * NB: Since the timer resolution of NFS_HZ is so course, it can often 163 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is 164 * between N + dt and N + 2 - dt ticks, add 1 before calculating the 165 * update values. 166 */ 167 void 168 nfs_update_rtt(struct nfsreq *rep) 169 { 170 int t1 = rep->r_rtt + 1; 171 int index = nfs_ptimers[rep->r_procnum] - 1; 172 int *srtt = &rep->r_nmp->nm_srtt[index]; 173 int *sdrtt = &rep->r_nmp->nm_sdrtt[index]; 174 175 t1 -= *srtt >> 3; 176 *srtt += t1; 177 if (t1 < 0) 178 t1 = -t1; 179 t1 -= *sdrtt >> 2; 180 *sdrtt += t1; 181 } 182 183 /* 184 * Estimate RTO for an NFS RPC sent via an unreliable datagram. 185 * 186 * Use the mean and mean deviation of RTT for the appropriate type 187 * of RPC for the frequent RPCs and a default for the others. 188 * The justification for doing "other" this way is that these RPCs 189 * happen so infrequently that timer est. would probably be stale. 190 * Also, since many of these RPCs are non-idempotent, a conservative 191 * timeout is desired. 192 * 193 * getattr, lookup - A+2D 194 * read, write - A+4D 195 * other - nm_timeo 196 */ 197 int 198 nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum) 199 { 200 enum nfs_rto_timers timer = nfs_ptimers[procnum]; 201 int index = timer - 1; 202 int rto; 203 204 switch (timer) { 205 case NFS_GETATTR_TIMER: 206 case NFS_LOOKUP_TIMER: 207 rto = ((nmp->nm_srtt[index] + 3) >> 2) + 208 ((nmp->nm_sdrtt[index] + 1) >> 1); 209 break; 210 case NFS_READ_TIMER: 211 case NFS_WRITE_TIMER: 212 rto = ((nmp->nm_srtt[index] + 7) >> 3) + 213 (nmp->nm_sdrtt[index] + 1); 214 break; 215 default: 216 rto = nmp->nm_timeo; 217 return (rto); 218 } 219 220 if (rto < NFS_MINRTO) 221 rto = NFS_MINRTO; 222 else if (rto > NFS_MAXRTO) 223 rto = NFS_MAXRTO; 224 225 return (rto); 226 } 227 228 229 230 /* 231 * Initialize sockets and congestion for a new NFS connection. 232 * We do not free the sockaddr if error. 233 */ 234 int 235 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) 236 { 237 struct socket *so; 238 int s, error, rcvreserve, sndreserve; 239 struct sockaddr *saddr; 240 struct sockaddr_in *sin; 241 struct mbuf *m; 242 243 if (!(nmp->nm_sotype == SOCK_DGRAM || nmp->nm_sotype == SOCK_STREAM)) { 244 error = EINVAL; 245 goto bad; 246 } 247 248 nmp->nm_so = NULL; 249 saddr = mtod(nmp->nm_nam, struct sockaddr *); 250 error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype, 251 nmp->nm_soproto); 252 if (error) 253 goto bad; 254 so = nmp->nm_so; 255 nmp->nm_soflags = so->so_proto->pr_flags; 256 257 /* 258 * Some servers require that the client port be a reserved port number. 259 * We always allocate a reserved port, as this prevents filehandle 260 * disclosure through UDP port capture. 261 */ 262 if (saddr->sa_family == AF_INET) { 263 struct mbuf *mopt; 264 int *ip; 265 266 MGET(mopt, M_WAIT, MT_SOOPTS); 267 mopt->m_len = sizeof(int); 268 ip = mtod(mopt, int *); 269 *ip = IP_PORTRANGE_LOW; 270 s = solock(so); 271 error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt); 272 sounlock(s); 273 if (error) 274 goto bad; 275 276 MGET(m, M_WAIT, MT_SONAME); 277 sin = mtod(m, struct sockaddr_in *); 278 memset(sin, 0, sizeof(*sin)); 279 sin->sin_len = m->m_len = sizeof(struct sockaddr_in); 280 sin->sin_family = AF_INET; 281 sin->sin_addr.s_addr = INADDR_ANY; 282 sin->sin_port = htons(0); 283 s = solock(so); 284 error = sobind(so, m, &proc0); 285 sounlock(s); 286 m_freem(m); 287 if (error) 288 goto bad; 289 290 MGET(mopt, M_WAIT, MT_SOOPTS); 291 mopt->m_len = sizeof(int); 292 ip = mtod(mopt, int *); 293 *ip = IP_PORTRANGE_DEFAULT; 294 s = solock(so); 295 error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt); 296 sounlock(s); 297 if (error) 298 goto bad; 299 } 300 301 /* 302 * Protocols that do not require connections may be optionally left 303 * unconnected for servers that reply from a port other than NFS_PORT. 304 */ 305 if (nmp->nm_flag & NFSMNT_NOCONN) { 306 if (nmp->nm_soflags & PR_CONNREQUIRED) { 307 error = ENOTCONN; 308 goto bad; 309 } 310 } else { 311 s = solock(so); 312 error = soconnect(so, nmp->nm_nam); 313 if (error) { 314 sounlock(s); 315 goto bad; 316 } 317 318 /* 319 * Wait for the connection to complete. Cribbed from the 320 * connect system call but with the wait timing out so 321 * that interruptible mounts don't hang here for a long time. 322 */ 323 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 324 sosleep(so, &so->so_timeo, PSOCK, "nfscon", 2 * hz); 325 if ((so->so_state & SS_ISCONNECTING) && 326 so->so_error == 0 && rep && 327 (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){ 328 so->so_state &= ~SS_ISCONNECTING; 329 sounlock(s); 330 goto bad; 331 } 332 } 333 if (so->so_error) { 334 error = so->so_error; 335 so->so_error = 0; 336 sounlock(s); 337 goto bad; 338 } 339 sounlock(s); 340 } 341 /* 342 * Always set receive timeout to detect server crash and reconnect. 343 * Otherwise, we can get stuck in soreceive forever. 344 */ 345 s = solock(so); 346 so->so_rcv.sb_timeo = (5 * hz); 347 if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) 348 so->so_snd.sb_timeo = (5 * hz); 349 else 350 so->so_snd.sb_timeo = 0; 351 if (nmp->nm_sotype == SOCK_DGRAM) { 352 sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR; 353 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) + 354 NFS_MAXPKTHDR) * 2; 355 } else if (nmp->nm_sotype == SOCK_STREAM) { 356 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 357 MGET(m, M_WAIT, MT_SOOPTS); 358 *mtod(m, int32_t *) = 1; 359 m->m_len = sizeof(int32_t); 360 sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); 361 } 362 if (so->so_proto->pr_protocol == IPPROTO_TCP) { 363 MGET(m, M_WAIT, MT_SOOPTS); 364 *mtod(m, int32_t *) = 1; 365 m->m_len = sizeof(int32_t); 366 sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); 367 } 368 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + 369 sizeof (u_int32_t)) * 2; 370 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + 371 sizeof (u_int32_t)) * 2; 372 } 373 error = soreserve(so, sndreserve, rcvreserve); 374 sounlock(s); 375 if (error) 376 goto bad; 377 so->so_rcv.sb_flags |= SB_NOINTR; 378 so->so_snd.sb_flags |= SB_NOINTR; 379 380 /* Initialize other non-zero congestion variables */ 381 nfs_init_rtt(nmp); 382 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ 383 nmp->nm_sent = 0; 384 nmp->nm_timeouts = 0; 385 return (0); 386 387 bad: 388 nfs_disconnect(nmp); 389 return (error); 390 } 391 392 /* 393 * Reconnect routine: 394 * Called when a connection is broken on a reliable protocol. 395 * - clean up the old socket 396 * - nfs_connect() again 397 * - set R_MUSTRESEND for all outstanding requests on mount point 398 * If this fails the mount point is DEAD! 399 * nb: Must be called with the nfs_sndlock() set on the mount point. 400 */ 401 int 402 nfs_reconnect(struct nfsreq *rep) 403 { 404 struct nfsreq *rp; 405 struct nfsmount *nmp = rep->r_nmp; 406 int error; 407 408 nfs_disconnect(nmp); 409 while ((error = nfs_connect(nmp, rep)) != 0) { 410 if (error == EINTR || error == ERESTART) 411 return (EINTR); 412 (void)tsleep((caddr_t)&lbolt, PSOCK, "nfsrecon", 0); 413 } 414 415 /* 416 * Loop through outstanding request list and fix up all requests 417 * on old socket. 418 */ 419 TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) { 420 rp->r_flags |= R_MUSTRESEND; 421 rp->r_rexmit = 0; 422 } 423 return (0); 424 } 425 426 /* 427 * NFS disconnect. Clean up and unlink. 428 */ 429 void 430 nfs_disconnect(struct nfsmount *nmp) 431 { 432 struct socket *so; 433 434 if (nmp->nm_so) { 435 so = nmp->nm_so; 436 nmp->nm_so = NULL; 437 soshutdown(so, SHUT_RDWR); 438 soclose(so); 439 } 440 } 441 442 /* 443 * This is the nfs send routine. For connection based socket types, it 444 * must be called with an nfs_sndlock() on the socket. 445 * "rep == NULL" indicates that it has been called from a server. 446 * For the client side: 447 * - return EINTR if the RPC is terminated, 0 otherwise 448 * - set R_MUSTRESEND if the send fails for any reason 449 * - do any cleanup required by recoverable socket errors (???) 450 * For the server side: 451 * - return EINTR or ERESTART if interrupted by a signal 452 * - return EPIPE if a connection is lost for connection based sockets (TCP...) 453 * - do any cleanup required by recoverable socket errors (???) 454 */ 455 int 456 nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top, 457 struct nfsreq *rep) 458 { 459 struct mbuf *sendnam; 460 int error, soflags, flags; 461 462 if (rep) { 463 if (rep->r_flags & R_SOFTTERM) { 464 m_freem(top); 465 return (EINTR); 466 } 467 if ((so = rep->r_nmp->nm_so) == NULL) { 468 rep->r_flags |= R_MUSTRESEND; 469 m_freem(top); 470 return (0); 471 } 472 rep->r_flags &= ~R_MUSTRESEND; 473 soflags = rep->r_nmp->nm_soflags; 474 } else 475 soflags = so->so_proto->pr_flags; 476 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) 477 sendnam = NULL; 478 else 479 sendnam = nam; 480 flags = 0; 481 482 error = sosend(so, sendnam, NULL, top, NULL, flags); 483 if (error) { 484 if (rep) { 485 /* 486 * Deal with errors for the client side. 487 */ 488 if (rep->r_flags & R_SOFTTERM) 489 error = EINTR; 490 else 491 rep->r_flags |= R_MUSTRESEND; 492 } 493 494 /* 495 * Handle any recoverable (soft) socket errors here. (???) 496 */ 497 if (error != EINTR && error != ERESTART && 498 error != EWOULDBLOCK && error != EPIPE) 499 error = 0; 500 } 501 return (error); 502 } 503 504 #ifdef NFSCLIENT 505 /* 506 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all 507 * done by soreceive(), but for SOCK_STREAM we must deal with the Record 508 * Mark and consolidate the data into a new mbuf list. 509 * nb: Sometimes TCP passes the data up to soreceive() in long lists of 510 * small mbufs. 511 * For SOCK_STREAM we must be very careful to read an entire record once 512 * we have read any of it, even if the system call has been interrupted. 513 */ 514 int 515 nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp) 516 { 517 struct socket *so; 518 struct uio auio; 519 struct iovec aio; 520 struct mbuf *m; 521 struct mbuf *control; 522 u_int32_t len; 523 struct mbuf **getnam; 524 int error, sotype, rcvflg; 525 struct proc *p = curproc; /* XXX */ 526 527 /* 528 * Set up arguments for soreceive() 529 */ 530 *mp = NULL; 531 *aname = NULL; 532 sotype = rep->r_nmp->nm_sotype; 533 534 /* 535 * For reliable protocols, lock against other senders/receivers 536 * in case a reconnect is necessary. 537 * For SOCK_STREAM, first get the Record Mark to find out how much 538 * more there is to get. 539 * We must lock the socket against other receivers 540 * until we have an entire rpc request/reply. 541 */ 542 if (sotype != SOCK_DGRAM) { 543 error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); 544 if (error) 545 return (error); 546 tryagain: 547 /* 548 * Check for fatal errors and resending request. 549 */ 550 /* 551 * Ugh: If a reconnect attempt just happened, nm_so 552 * would have changed. NULL indicates a failed 553 * attempt that has essentially shut down this 554 * mount point. 555 */ 556 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { 557 nfs_sndunlock(&rep->r_nmp->nm_flag); 558 return (EINTR); 559 } 560 so = rep->r_nmp->nm_so; 561 if (!so) { 562 error = nfs_reconnect(rep); 563 if (error) { 564 nfs_sndunlock(&rep->r_nmp->nm_flag); 565 return (error); 566 } 567 goto tryagain; 568 } 569 while (rep->r_flags & R_MUSTRESEND) { 570 m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); 571 nfsstats.rpcretries++; 572 rep->r_rtt = 0; 573 rep->r_flags &= ~R_TIMING; 574 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep); 575 if (error) { 576 if (error == EINTR || error == ERESTART || 577 (error = nfs_reconnect(rep)) != 0) { 578 nfs_sndunlock(&rep->r_nmp->nm_flag); 579 return (error); 580 } 581 goto tryagain; 582 } 583 } 584 nfs_sndunlock(&rep->r_nmp->nm_flag); 585 if (sotype == SOCK_STREAM) { 586 aio.iov_base = (caddr_t) &len; 587 aio.iov_len = sizeof(u_int32_t); 588 auio.uio_iov = &aio; 589 auio.uio_iovcnt = 1; 590 auio.uio_segflg = UIO_SYSSPACE; 591 auio.uio_rw = UIO_READ; 592 auio.uio_offset = 0; 593 auio.uio_resid = sizeof(u_int32_t); 594 auio.uio_procp = p; 595 do { 596 rcvflg = MSG_WAITALL; 597 error = soreceive(so, NULL, &auio, NULL, NULL, 598 &rcvflg, 0); 599 if (error == EWOULDBLOCK && rep) { 600 if (rep->r_flags & R_SOFTTERM) 601 return (EINTR); 602 /* 603 * looks like the server died after it 604 * received the request, make sure 605 * that we will retransmit and we 606 * don't get stuck here forever. 607 */ 608 if (rep->r_rexmit >= 609 rep->r_nmp->nm_retry) { 610 nfsstats.rpctimeouts++; 611 error = EPIPE; 612 } 613 } 614 } while (error == EWOULDBLOCK); 615 if (!error && auio.uio_resid > 0) { 616 log(LOG_INFO, 617 "short receive (%zu/%zu) from nfs server %s\n", 618 sizeof(u_int32_t) - auio.uio_resid, 619 sizeof(u_int32_t), 620 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 621 error = EPIPE; 622 } 623 if (error) 624 goto errout; 625 626 len = ntohl(len) & ~0x80000000; 627 /* 628 * This is SERIOUS! We are out of sync with the sender 629 * and forcing a disconnect/reconnect is all I can do. 630 */ 631 if (len > NFS_MAXPACKET) { 632 log(LOG_ERR, "%s (%u) from nfs server %s\n", 633 "impossible packet length", 634 len, 635 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 636 error = EFBIG; 637 goto errout; 638 } 639 auio.uio_resid = len; 640 do { 641 rcvflg = MSG_WAITALL; 642 error = soreceive(so, NULL, &auio, mp, NULL, 643 &rcvflg, 0); 644 } while (error == EWOULDBLOCK || error == EINTR || 645 error == ERESTART); 646 if (!error && auio.uio_resid > 0) { 647 log(LOG_INFO, "short receive (%zu/%u) from " 648 "nfs server %s\n", len - auio.uio_resid, 649 len, rep->r_nmp->nm_mountp-> 650 mnt_stat.f_mntfromname); 651 error = EPIPE; 652 } 653 } else { 654 /* 655 * NB: Since uio_resid is big, MSG_WAITALL is ignored 656 * and soreceive() will return when it has either a 657 * control msg or a data msg. 658 * We have no use for control msg., but must grab them 659 * and then throw them away so we know what is going 660 * on. 661 */ 662 auio.uio_resid = len = 100000000; /* Anything Big */ 663 auio.uio_procp = p; 664 do { 665 rcvflg = 0; 666 error = soreceive(so, NULL, &auio, mp, &control, 667 &rcvflg, 0); 668 m_freem(control); 669 if (error == EWOULDBLOCK && rep) { 670 if (rep->r_flags & R_SOFTTERM) 671 return (EINTR); 672 } 673 } while (error == EWOULDBLOCK || 674 (!error && *mp == NULL && control)); 675 if ((rcvflg & MSG_EOR) == 0) 676 printf("Egad!!\n"); 677 if (!error && *mp == NULL) 678 error = EPIPE; 679 len -= auio.uio_resid; 680 } 681 errout: 682 if (error && error != EINTR && error != ERESTART) { 683 m_freemp(mp); 684 if (error != EPIPE) 685 log(LOG_INFO, 686 "receive error %d from nfs server %s\n", 687 error, 688 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); 689 error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); 690 if (!error) { 691 error = nfs_reconnect(rep); 692 if (!error) 693 goto tryagain; 694 nfs_sndunlock(&rep->r_nmp->nm_flag); 695 } 696 } 697 } else { 698 if ((so = rep->r_nmp->nm_so) == NULL) 699 return (EACCES); 700 if (so->so_state & SS_ISCONNECTED) 701 getnam = NULL; 702 else 703 getnam = aname; 704 auio.uio_resid = len = 1000000; 705 auio.uio_procp = p; 706 do { 707 rcvflg = 0; 708 error = soreceive(so, getnam, &auio, mp, NULL, 709 &rcvflg, 0); 710 if (error == EWOULDBLOCK && 711 (rep->r_flags & R_SOFTTERM)) 712 return (EINTR); 713 } while (error == EWOULDBLOCK); 714 len -= auio.uio_resid; 715 } 716 if (error) 717 m_freemp(mp); 718 /* 719 * Search for any mbufs that are not a multiple of 4 bytes long 720 * or with m_data not longword aligned. 721 * These could cause pointer alignment problems, so copy them to 722 * well aligned mbufs. 723 */ 724 nfs_realign(mp, 5 * NFSX_UNSIGNED); 725 return (error); 726 } 727 728 /* 729 * Implement receipt of reply on a socket. 730 * We must search through the list of received datagrams matching them 731 * with outstanding requests using the xid, until ours is found. 732 */ 733 int 734 nfs_reply(struct nfsreq *myrep) 735 { 736 struct nfsreq *rep; 737 struct nfsmount *nmp = myrep->r_nmp; 738 struct nfsm_info info; 739 struct mbuf *nam; 740 u_int32_t rxid, *tl, t1; 741 caddr_t cp2; 742 int error; 743 744 /* 745 * Loop around until we get our own reply 746 */ 747 for (;;) { 748 /* 749 * Lock against other receivers so that I don't get stuck in 750 * sbwait() after someone else has received my reply for me. 751 * Also necessary for connection based protocols to avoid 752 * race conditions during a reconnect. 753 */ 754 error = nfs_rcvlock(myrep); 755 if (error) 756 return (error == EALREADY ? 0 : error); 757 758 /* 759 * Get the next Rpc reply off the socket 760 */ 761 error = nfs_receive(myrep, &nam, &info.nmi_mrep); 762 nfs_rcvunlock(&nmp->nm_flag); 763 if (error) { 764 765 /* 766 * Ignore routing errors on connectionless protocols?? 767 */ 768 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { 769 if (nmp->nm_so) 770 nmp->nm_so->so_error = 0; 771 continue; 772 } 773 return (error); 774 } 775 m_freem(nam); 776 777 /* 778 * Get the xid and check that it is an rpc reply 779 */ 780 info.nmi_md = info.nmi_mrep; 781 info.nmi_dpos = mtod(info.nmi_md, caddr_t); 782 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED); 783 rxid = *tl++; 784 if (*tl != rpc_reply) { 785 nfsstats.rpcinvalid++; 786 m_freem(info.nmi_mrep); 787 nfsmout: 788 continue; 789 } 790 791 /* 792 * Loop through the request list to match up the reply 793 * Iff no match, just drop the datagram 794 */ 795 TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) { 796 if (rep->r_mrep == NULL && rxid == rep->r_xid) { 797 /* Found it.. */ 798 rep->r_mrep = info.nmi_mrep; 799 rep->r_md = info.nmi_md; 800 rep->r_dpos = info.nmi_dpos; 801 802 /* 803 * Update congestion window. 804 * Do the additive increase of 805 * one rpc/rtt. 806 */ 807 if (nmp->nm_cwnd <= nmp->nm_sent) { 808 nmp->nm_cwnd += 809 (NFS_CWNDSCALE * NFS_CWNDSCALE + 810 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; 811 if (nmp->nm_cwnd > NFS_MAXCWND) 812 nmp->nm_cwnd = NFS_MAXCWND; 813 } 814 rep->r_flags &= ~R_SENT; 815 nmp->nm_sent -= NFS_CWNDSCALE; 816 817 if (rep->r_flags & R_TIMING) 818 nfs_update_rtt(rep); 819 820 nmp->nm_timeouts = 0; 821 break; 822 } 823 } 824 /* 825 * If not matched to a request, drop it. 826 * If it's mine, get out. 827 */ 828 if (rep == 0) { 829 nfsstats.rpcunexpected++; 830 m_freem(info.nmi_mrep); 831 } else if (rep == myrep) { 832 if (rep->r_mrep == NULL) 833 panic("nfsreply nil"); 834 return (0); 835 } 836 } 837 } 838 839 /* 840 * nfs_request - goes something like this 841 * - fill in request struct 842 * - links it into list 843 * - calls nfs_send() for first transmit 844 * - calls nfs_receive() to get reply 845 * - break down rpc header and return with nfs reply pointed to 846 * by mrep or error 847 * nb: always frees up mreq mbuf list 848 */ 849 int 850 nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop) 851 { 852 struct mbuf *m; 853 u_int32_t *tl; 854 struct nfsmount *nmp; 855 struct timeval tv; 856 caddr_t cp2; 857 int t1, i, error = 0; 858 int trylater_delay; 859 struct nfsreq *rep; 860 int mrest_len; 861 struct nfsm_info info; 862 863 rep = pool_get(&nfsreqpl, PR_WAITOK); 864 rep->r_nmp = VFSTONFS(vp->v_mount); 865 rep->r_vp = vp; 866 rep->r_procp = infop->nmi_procp; 867 rep->r_procnum = procnum; 868 869 mrest_len = 0; 870 m = infop->nmi_mreq; 871 while (m) { 872 mrest_len += m->m_len; 873 m = m->m_next; 874 } 875 876 /* empty mbuf for AUTH_UNIX header */ 877 rep->r_mreq = m_gethdr(M_WAIT, MT_DATA); 878 rep->r_mreq->m_next = infop->nmi_mreq; 879 rep->r_mreq->m_pkthdr.len = mrest_len; 880 881 trylater_delay = NFS_MINTIMEO; 882 883 nmp = rep->r_nmp; 884 885 /* Get the RPC header with authorization. */ 886 nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX); 887 m = rep->r_mreq; 888 889 /* 890 * For stream protocols, insert a Sun RPC Record Mark. 891 */ 892 if (nmp->nm_sotype == SOCK_STREAM) { 893 M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); 894 *mtod(m, u_int32_t *) = htonl(0x80000000 | 895 (m->m_pkthdr.len - NFSX_UNSIGNED)); 896 } 897 898 tryagain: 899 rep->r_rtt = rep->r_rexmit = 0; 900 if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER) 901 rep->r_flags = R_TIMING; 902 else 903 rep->r_flags = 0; 904 rep->r_mrep = NULL; 905 906 /* 907 * Do the client side RPC. 908 */ 909 nfsstats.rpcrequests++; 910 /* 911 * Chain request into list of outstanding requests. Be sure 912 * to put it LAST so timer finds oldest requests first. 913 */ 914 if (TAILQ_EMPTY(&nmp->nm_reqsq)) 915 timeout_add(&nmp->nm_rtimeout, nfs_ticks); 916 TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain); 917 918 /* 919 * If backing off another request or avoiding congestion, don't 920 * send this one now but let timer do it. If not timing a request, 921 * do it now. 922 */ 923 if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || 924 (nmp->nm_flag & NFSMNT_DUMBTIMR) || 925 nmp->nm_sent < nmp->nm_cwnd)) { 926 if (nmp->nm_soflags & PR_CONNREQUIRED) 927 error = nfs_sndlock(&nmp->nm_flag, rep); 928 if (!error) { 929 error = nfs_send(nmp->nm_so, nmp->nm_nam, 930 m_copym(m, 0, M_COPYALL, M_WAIT), rep); 931 if (nmp->nm_soflags & PR_CONNREQUIRED) 932 nfs_sndunlock(&nmp->nm_flag); 933 } 934 if (!error && (rep->r_flags & R_MUSTRESEND) == 0) { 935 nmp->nm_sent += NFS_CWNDSCALE; 936 rep->r_flags |= R_SENT; 937 } 938 } else { 939 rep->r_rtt = -1; 940 } 941 942 /* 943 * Wait for the reply from our send or the timer's. 944 */ 945 if (!error || error == EPIPE) 946 error = nfs_reply(rep); 947 948 /* 949 * RPC done, unlink the request. 950 */ 951 TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain); 952 if (TAILQ_EMPTY(&nmp->nm_reqsq)) 953 timeout_del(&nmp->nm_rtimeout); 954 955 /* 956 * Decrement the outstanding request count. 957 */ 958 if (rep->r_flags & R_SENT) { 959 rep->r_flags &= ~R_SENT; /* paranoia */ 960 nmp->nm_sent -= NFS_CWNDSCALE; 961 } 962 963 /* 964 * If there was a successful reply and a tprintf msg. 965 * tprintf a response. 966 */ 967 if (!error && (rep->r_flags & R_TPRINTFMSG)) 968 nfs_msg(rep, "is alive again"); 969 info.nmi_mrep = rep->r_mrep; 970 info.nmi_md = rep->r_md; 971 info.nmi_dpos = rep->r_dpos; 972 if (error) { 973 infop->nmi_mrep = NULL; 974 goto nfsmout1; 975 } 976 977 /* 978 * break down the rpc header and check if ok 979 */ 980 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); 981 if (*tl++ == rpc_msgdenied) { 982 if (*tl == rpc_mismatch) 983 error = EOPNOTSUPP; 984 else 985 error = EACCES; /* Should be EAUTH. */ 986 infop->nmi_mrep = NULL; 987 goto nfsmout1; 988 } 989 990 /* 991 * Since we only support RPCAUTH_UNIX atm we step over the 992 * reply verifer type, and in the (error) case that there really 993 * is any data in it, we advance over it. 994 */ 995 tl++; /* Step over verifer type */ 996 i = fxdr_unsigned(int32_t, *tl); 997 if (i > 0) 998 nfsm_adv(nfsm_rndup(i)); /* Should not happen */ 999 1000 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); 1001 /* 0 == ok */ 1002 if (*tl == 0) { 1003 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED); 1004 if (*tl != 0) { 1005 error = fxdr_unsigned(int, *tl); 1006 if ((nmp->nm_flag & NFSMNT_NFSV3) && 1007 error == NFSERR_TRYLATER) { 1008 m_freem(info.nmi_mrep); 1009 error = 0; 1010 tv.tv_sec = trylater_delay; 1011 tv.tv_usec = 0; 1012 tsleep(&tv, PSOCK, "nfsretry", tvtohz(&tv)); 1013 trylater_delay *= NFS_TIMEOUTMUL; 1014 if (trylater_delay > NFS_MAXTIMEO) 1015 trylater_delay = NFS_MAXTIMEO; 1016 1017 goto tryagain; 1018 } 1019 1020 /* 1021 * If the File Handle was stale, invalidate the 1022 * lookup cache, just in case. 1023 */ 1024 if (error == ESTALE) 1025 cache_purge(rep->r_vp); 1026 } 1027 goto nfsmout; 1028 } 1029 1030 error = EPROTONOSUPPORT; 1031 1032 nfsmout: 1033 infop->nmi_mrep = info.nmi_mrep; 1034 infop->nmi_md = info.nmi_md; 1035 infop->nmi_dpos = info.nmi_dpos; 1036 nfsmout1: 1037 m_freem(rep->r_mreq); 1038 pool_put(&nfsreqpl, rep); 1039 return (error); 1040 } 1041 #endif /* NFSCLIENT */ 1042 1043 /* 1044 * Generate the rpc reply header 1045 * siz arg. is used to decide if adding a cluster is worthwhile 1046 */ 1047 int 1048 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, 1049 int err, struct mbuf **mrq, struct mbuf **mbp) 1050 { 1051 u_int32_t *tl; 1052 struct mbuf *mreq; 1053 struct mbuf *mb; 1054 1055 MGETHDR(mreq, M_WAIT, MT_DATA); 1056 mb = mreq; 1057 /* 1058 * If this is a big reply, use a cluster else 1059 * try and leave leading space for the lower level headers. 1060 */ 1061 siz += RPC_REPLYSIZ; 1062 if (siz >= MHLEN - max_hdr) { 1063 MCLGET(mreq, M_WAIT); 1064 } else 1065 mreq->m_data += max_hdr; 1066 tl = mtod(mreq, u_int32_t *); 1067 mreq->m_len = 6 * NFSX_UNSIGNED; 1068 *tl++ = txdr_unsigned(nd->nd_retxid); 1069 *tl++ = rpc_reply; 1070 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) { 1071 *tl++ = rpc_msgdenied; 1072 if (err & NFSERR_AUTHERR) { 1073 *tl++ = rpc_autherr; 1074 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR); 1075 mreq->m_len -= NFSX_UNSIGNED; 1076 } else { 1077 *tl++ = rpc_mismatch; 1078 *tl++ = txdr_unsigned(RPC_VER2); 1079 *tl = txdr_unsigned(RPC_VER2); 1080 } 1081 } else { 1082 *tl++ = rpc_msgaccepted; 1083 1084 /* AUTH_UNIX requires RPCAUTH_NULL. */ 1085 *tl++ = 0; 1086 *tl++ = 0; 1087 1088 switch (err) { 1089 case EPROGUNAVAIL: 1090 *tl = txdr_unsigned(RPC_PROGUNAVAIL); 1091 break; 1092 case EPROGMISMATCH: 1093 *tl = txdr_unsigned(RPC_PROGMISMATCH); 1094 tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED); 1095 *tl++ = txdr_unsigned(NFS_VER2); 1096 *tl = txdr_unsigned(NFS_VER3); 1097 break; 1098 case EPROCUNAVAIL: 1099 *tl = txdr_unsigned(RPC_PROCUNAVAIL); 1100 break; 1101 case EBADRPC: 1102 *tl = txdr_unsigned(RPC_GARBAGE); 1103 break; 1104 default: 1105 *tl = 0; 1106 if (err != NFSERR_RETVOID) { 1107 tl = nfsm_build(&mb, NFSX_UNSIGNED); 1108 if (err) 1109 *tl = txdr_unsigned(nfsrv_errmap(nd, err)); 1110 else 1111 *tl = 0; 1112 } 1113 break; 1114 }; 1115 } 1116 1117 *mrq = mreq; 1118 if (mbp != NULL) 1119 *mbp = mb; 1120 if (err != 0 && err != NFSERR_RETVOID) 1121 nfsstats.srvrpc_errs++; 1122 return (0); 1123 } 1124 1125 /* 1126 * nfs timer routine 1127 * Scan the nfsreq list and retranmit any requests that have timed out. 1128 */ 1129 void 1130 nfs_timer(void *arg) 1131 { 1132 struct nfsmount *nmp = arg; 1133 struct nfsreq *rep; 1134 struct mbuf *m; 1135 struct socket *so; 1136 int timeo, error; 1137 1138 NET_LOCK(); 1139 TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) { 1140 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) 1141 continue; 1142 if (nfs_sigintr(nmp, rep, rep->r_procp)) { 1143 rep->r_flags |= R_SOFTTERM; 1144 continue; 1145 } 1146 if (rep->r_rtt >= 0) { 1147 rep->r_rtt++; 1148 if (nmp->nm_flag & NFSMNT_DUMBTIMR) 1149 timeo = nmp->nm_timeo; 1150 else 1151 timeo = nfs_estimate_rto(nmp, rep->r_procnum); 1152 if (nmp->nm_timeouts > 0) 1153 timeo *= nfs_backoff[nmp->nm_timeouts - 1]; 1154 if (rep->r_rtt <= timeo) 1155 continue; 1156 if (nmp->nm_timeouts < nitems(nfs_backoff)) 1157 nmp->nm_timeouts++; 1158 } 1159 1160 /* Check for server not responding. */ 1161 if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) { 1162 nfs_msg(rep, "not responding"); 1163 rep->r_flags |= R_TPRINTFMSG; 1164 } 1165 if (rep->r_rexmit >= nmp->nm_retry) { /* too many */ 1166 nfsstats.rpctimeouts++; 1167 rep->r_flags |= R_SOFTTERM; 1168 continue; 1169 } 1170 if (nmp->nm_sotype != SOCK_DGRAM) { 1171 if (++rep->r_rexmit > NFS_MAXREXMIT) 1172 rep->r_rexmit = NFS_MAXREXMIT; 1173 continue; 1174 } 1175 1176 if ((so = nmp->nm_so) == NULL) 1177 continue; 1178 1179 /* 1180 * If there is enough space and the window allows.. 1181 * Resend it 1182 * Set r_rtt to -1 in case we fail to send it now. 1183 */ 1184 rep->r_rtt = -1; 1185 if (sbspace(so, &so->so_snd) >= rep->r_mreq->m_pkthdr.len && 1186 ((nmp->nm_flag & NFSMNT_DUMBTIMR) || 1187 (rep->r_flags & R_SENT) || 1188 nmp->nm_sent < nmp->nm_cwnd) && 1189 (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){ 1190 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) 1191 error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, 1192 m, NULL, NULL, curproc); 1193 else 1194 error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, 1195 m, nmp->nm_nam, NULL, curproc); 1196 if (error) { 1197 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) 1198 so->so_error = 0; 1199 } else { 1200 /* 1201 * Iff first send, start timing 1202 * else turn timing off, backoff timer 1203 * and divide congestion window by 2. 1204 */ 1205 if (rep->r_flags & R_SENT) { 1206 rep->r_flags &= ~R_TIMING; 1207 if (++rep->r_rexmit > NFS_MAXREXMIT) 1208 rep->r_rexmit = NFS_MAXREXMIT; 1209 nmp->nm_cwnd >>= 1; 1210 if (nmp->nm_cwnd < NFS_CWNDSCALE) 1211 nmp->nm_cwnd = NFS_CWNDSCALE; 1212 nfsstats.rpcretries++; 1213 } else { 1214 rep->r_flags |= R_SENT; 1215 nmp->nm_sent += NFS_CWNDSCALE; 1216 } 1217 rep->r_rtt = 0; 1218 } 1219 } 1220 } 1221 NET_UNLOCK(); 1222 timeout_add(&nmp->nm_rtimeout, nfs_ticks); 1223 } 1224 1225 /* 1226 * Test for a termination condition pending on the process. 1227 * This is used for NFSMNT_INT mounts. 1228 */ 1229 int 1230 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p) 1231 { 1232 1233 if (rep && (rep->r_flags & R_SOFTTERM)) 1234 return (EINTR); 1235 if (!(nmp->nm_flag & NFSMNT_INT)) 1236 return (0); 1237 if (p && p->p_siglist && 1238 (((p->p_siglist & ~p->p_sigmask) & 1239 ~p->p_p->ps_sigacts->ps_sigignore) & NFSINT_SIGMASK)) 1240 return (EINTR); 1241 return (0); 1242 } 1243 1244 /* 1245 * Lock a socket against others. 1246 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply 1247 * and also to avoid race conditions between the processes with nfs requests 1248 * in progress when a reconnect is necessary. 1249 */ 1250 int 1251 nfs_sndlock(int *flagp, struct nfsreq *rep) 1252 { 1253 struct proc *p; 1254 int slpflag = 0, slptimeo = 0; 1255 1256 if (rep) { 1257 p = rep->r_procp; 1258 if (rep->r_nmp->nm_flag & NFSMNT_INT) 1259 slpflag = PCATCH; 1260 } else 1261 p = NULL; 1262 while (*flagp & NFSMNT_SNDLOCK) { 1263 if (rep && nfs_sigintr(rep->r_nmp, rep, p)) 1264 return (EINTR); 1265 *flagp |= NFSMNT_WANTSND; 1266 (void)tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck", 1267 slptimeo); 1268 if (slpflag == PCATCH) { 1269 slpflag = 0; 1270 slptimeo = 2 * hz; 1271 } 1272 } 1273 *flagp |= NFSMNT_SNDLOCK; 1274 return (0); 1275 } 1276 1277 /* 1278 * Unlock the stream socket for others. 1279 */ 1280 void 1281 nfs_sndunlock(int *flagp) 1282 { 1283 1284 if ((*flagp & NFSMNT_SNDLOCK) == 0) 1285 panic("nfs sndunlock"); 1286 *flagp &= ~NFSMNT_SNDLOCK; 1287 if (*flagp & NFSMNT_WANTSND) { 1288 *flagp &= ~NFSMNT_WANTSND; 1289 wakeup((caddr_t)flagp); 1290 } 1291 } 1292 1293 int 1294 nfs_rcvlock(struct nfsreq *rep) 1295 { 1296 int *flagp = &rep->r_nmp->nm_flag; 1297 int slpflag, slptimeo = 0; 1298 1299 if (*flagp & NFSMNT_INT) 1300 slpflag = PCATCH; 1301 else 1302 slpflag = 0; 1303 1304 while (*flagp & NFSMNT_RCVLOCK) { 1305 if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp)) 1306 return (EINTR); 1307 *flagp |= NFSMNT_WANTRCV; 1308 (void)tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", 1309 slptimeo); 1310 if (rep->r_mrep != NULL) { 1311 /* 1312 * Don't take the lock if our reply has been received 1313 * while we where sleeping. 1314 */ 1315 return (EALREADY); 1316 } 1317 if (slpflag == PCATCH) { 1318 slpflag = 0; 1319 slptimeo = 2 * hz; 1320 } 1321 } 1322 *flagp |= NFSMNT_RCVLOCK; 1323 return (0); 1324 } 1325 1326 /* 1327 * Unlock the stream socket for others. 1328 */ 1329 void 1330 nfs_rcvunlock(int *flagp) 1331 { 1332 1333 if ((*flagp & NFSMNT_RCVLOCK) == 0) 1334 panic("nfs rcvunlock"); 1335 *flagp &= ~NFSMNT_RCVLOCK; 1336 if (*flagp & NFSMNT_WANTRCV) { 1337 *flagp &= ~NFSMNT_WANTRCV; 1338 wakeup(flagp); 1339 } 1340 } 1341 1342 /* 1343 * Auxiliary routine to align the length of mbuf copies made with m_copyback(). 1344 */ 1345 void 1346 nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off) 1347 { 1348 size_t padding; 1349 1350 /* 1351 * The maximum number of bytes that m_copyback() places in a mbuf is 1352 * always an aligned quantity, so realign happens at the chain's tail. 1353 */ 1354 while (n->m_next != NULL) 1355 n = n->m_next; 1356 1357 /* 1358 * Pad from the next elements in the source chain. Loop until the 1359 * destination chain is aligned, or the end of the source is reached. 1360 */ 1361 do { 1362 m = m->m_next; 1363 if (m == NULL) 1364 return; 1365 1366 padding = min(ALIGN(n->m_len) - n->m_len, m->m_len); 1367 if (padding > M_TRAILINGSPACE(n)) 1368 panic("nfs_realign_fixup: no memory to pad to"); 1369 1370 bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding); 1371 1372 n->m_len += padding; 1373 m_adj(m, padding); 1374 *off += padding; 1375 1376 } while (!ALIGNED_POINTER(n->m_len, void *)); 1377 } 1378 1379 /* 1380 * The NFS RPC parsing code uses the data address and the length of mbuf 1381 * structures to calculate on-memory addresses. This function makes sure these 1382 * parameters are correctly aligned. 1383 */ 1384 void 1385 nfs_realign(struct mbuf **pm, int hsiz) 1386 { 1387 struct mbuf *m; 1388 struct mbuf *n = NULL; 1389 unsigned int off = 0; 1390 1391 ++nfs_realign_test; 1392 while ((m = *pm) != NULL) { 1393 if (!ALIGNED_POINTER(m->m_data, void *) || 1394 !ALIGNED_POINTER(m->m_len, void *)) { 1395 MGET(n, M_WAIT, MT_DATA); 1396 #define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *))) 1397 if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) { 1398 MCLGET(n, M_WAIT); 1399 } 1400 n->m_len = 0; 1401 break; 1402 } 1403 pm = &m->m_next; 1404 } 1405 /* 1406 * If n is non-NULL, loop on m copying data, then replace the 1407 * portion of the chain that had to be realigned. 1408 */ 1409 if (n != NULL) { 1410 ++nfs_realign_count; 1411 while (m) { 1412 m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT); 1413 1414 /* 1415 * If an unaligned amount of memory was copied, fix up 1416 * the last mbuf created by m_copyback(). 1417 */ 1418 if (!ALIGNED_POINTER(m->m_len, void *)) 1419 nfs_realign_fixup(m, n, &off); 1420 1421 off += m->m_len; 1422 m = m->m_next; 1423 } 1424 m_freemp(pm); 1425 *pm = n; 1426 } 1427 } 1428 1429 1430 /* 1431 * Parse an RPC request 1432 * - verify it 1433 * - fill in the cred struct. 1434 */ 1435 int 1436 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header) 1437 { 1438 int len, i; 1439 u_int32_t *tl; 1440 int32_t t1; 1441 caddr_t cp2; 1442 u_int32_t nfsvers, auth_type; 1443 int error = 0; 1444 struct nfsm_info info; 1445 1446 info.nmi_mrep = nd->nd_mrep; 1447 info.nmi_md = nd->nd_md; 1448 info.nmi_dpos = nd->nd_dpos; 1449 if (has_header) { 1450 nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED); 1451 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++); 1452 if (*tl++ != rpc_call) { 1453 m_freem(info.nmi_mrep); 1454 return (EBADRPC); 1455 } 1456 } else 1457 nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED); 1458 nd->nd_repstat = 0; 1459 nd->nd_flag = 0; 1460 if (*tl++ != rpc_vers) { 1461 nd->nd_repstat = ERPCMISMATCH; 1462 nd->nd_procnum = NFSPROC_NOOP; 1463 return (0); 1464 } 1465 if (*tl != nfs_prog) { 1466 nd->nd_repstat = EPROGUNAVAIL; 1467 nd->nd_procnum = NFSPROC_NOOP; 1468 return (0); 1469 } 1470 tl++; 1471 nfsvers = fxdr_unsigned(u_int32_t, *tl++); 1472 if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) { 1473 nd->nd_repstat = EPROGMISMATCH; 1474 nd->nd_procnum = NFSPROC_NOOP; 1475 return (0); 1476 } 1477 if (nfsvers == NFS_VER3) 1478 nd->nd_flag = ND_NFSV3; 1479 nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++); 1480 if (nd->nd_procnum == NFSPROC_NULL) 1481 return (0); 1482 if (nd->nd_procnum >= NFS_NPROCS || 1483 (nd->nd_procnum > NFSPROC_COMMIT) || 1484 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) { 1485 nd->nd_repstat = EPROCUNAVAIL; 1486 nd->nd_procnum = NFSPROC_NOOP; 1487 return (0); 1488 } 1489 if ((nd->nd_flag & ND_NFSV3) == 0) 1490 nd->nd_procnum = nfsv3_procid[nd->nd_procnum]; 1491 auth_type = *tl++; 1492 len = fxdr_unsigned(int, *tl++); 1493 if (len < 0 || len > RPCAUTH_MAXSIZ) { 1494 m_freem(info.nmi_mrep); 1495 return (EBADRPC); 1496 } 1497 1498 /* Handle auth_unix */ 1499 if (auth_type == rpc_auth_unix) { 1500 len = fxdr_unsigned(int, *++tl); 1501 if (len < 0 || len > NFS_MAXNAMLEN) { 1502 m_freem(info.nmi_mrep); 1503 return (EBADRPC); 1504 } 1505 nfsm_adv(nfsm_rndup(len)); 1506 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); 1507 memset(&nd->nd_cr, 0, sizeof (struct ucred)); 1508 nd->nd_cr.cr_ref = 1; 1509 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++); 1510 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++); 1511 len = fxdr_unsigned(int, *tl); 1512 if (len < 0 || len > RPCAUTH_UNIXGIDS) { 1513 m_freem(info.nmi_mrep); 1514 return (EBADRPC); 1515 } 1516 nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED); 1517 for (i = 0; i < len; i++) { 1518 if (i < NGROUPS_MAX) 1519 nd->nd_cr.cr_groups[i] = 1520 fxdr_unsigned(gid_t, *tl++); 1521 else 1522 tl++; 1523 } 1524 nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ? NGROUPS_MAX : len; 1525 len = fxdr_unsigned(int, *++tl); 1526 if (len < 0 || len > RPCAUTH_MAXSIZ) { 1527 m_freem(info.nmi_mrep); 1528 return (EBADRPC); 1529 } 1530 if (len > 0) 1531 nfsm_adv(nfsm_rndup(len)); 1532 } else { 1533 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED); 1534 nd->nd_procnum = NFSPROC_NOOP; 1535 return (0); 1536 } 1537 1538 nd->nd_md = info.nmi_md; 1539 nd->nd_dpos = info.nmi_dpos; 1540 return (0); 1541 nfsmout: 1542 return (error); 1543 } 1544 1545 void 1546 nfs_msg(struct nfsreq *rep, char *msg) 1547 { 1548 tpr_t tpr; 1549 1550 if (rep->r_procp) 1551 tpr = tprintf_open(rep->r_procp); 1552 else 1553 tpr = NULL; 1554 1555 tprintf(tpr, "nfs server %s: %s\n", 1556 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg); 1557 tprintf_close(tpr); 1558 } 1559 1560 #ifdef NFSSERVER 1561 /* 1562 * Socket upcall routine for the nfsd sockets. 1563 * The caddr_t arg is a pointer to the "struct nfssvc_sock". 1564 * Essentially do as much as possible non-blocking, else punt and it will 1565 * be called with M_WAIT from an nfsd. 1566 */ 1567 void 1568 nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag) 1569 { 1570 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg; 1571 struct mbuf *m; 1572 struct mbuf *mp, *nam; 1573 struct uio auio; 1574 int flags, error; 1575 1576 if ((slp->ns_flag & SLP_VALID) == 0) 1577 return; 1578 1579 /* Defer soreceive() to an nfsd. */ 1580 if (waitflag == M_DONTWAIT) { 1581 slp->ns_flag |= SLP_NEEDQ; 1582 goto dorecs; 1583 } 1584 1585 auio.uio_procp = NULL; 1586 if (so->so_type == SOCK_STREAM) { 1587 /* 1588 * Do soreceive(). 1589 */ 1590 auio.uio_resid = 1000000000; 1591 flags = MSG_DONTWAIT; 1592 error = soreceive(so, &nam, &auio, &mp, NULL, 1593 &flags, 0); 1594 if (error || mp == NULL) { 1595 if (error == EWOULDBLOCK) 1596 slp->ns_flag |= SLP_NEEDQ; 1597 else 1598 slp->ns_flag |= SLP_DISCONN; 1599 goto dorecs; 1600 } 1601 m = mp; 1602 if (slp->ns_rawend) { 1603 slp->ns_rawend->m_next = m; 1604 slp->ns_cc += 1000000000 - auio.uio_resid; 1605 } else { 1606 slp->ns_raw = m; 1607 slp->ns_cc = 1000000000 - auio.uio_resid; 1608 } 1609 while (m->m_next) 1610 m = m->m_next; 1611 slp->ns_rawend = m; 1612 1613 /* 1614 * Now try and parse record(s) out of the raw stream data. 1615 */ 1616 error = nfsrv_getstream(slp, waitflag); 1617 if (error) { 1618 if (error == EPERM) 1619 slp->ns_flag |= SLP_DISCONN; 1620 else 1621 slp->ns_flag |= SLP_NEEDQ; 1622 } 1623 } else { 1624 do { 1625 auio.uio_resid = 1000000000; 1626 flags = MSG_DONTWAIT; 1627 error = soreceive(so, &nam, &auio, &mp, 1628 NULL, &flags, 0); 1629 if (mp) { 1630 if (nam) { 1631 m = nam; 1632 m->m_next = mp; 1633 } else 1634 m = mp; 1635 if (slp->ns_recend) 1636 slp->ns_recend->m_nextpkt = m; 1637 else 1638 slp->ns_rec = m; 1639 slp->ns_recend = m; 1640 m->m_nextpkt = NULL; 1641 } 1642 if (error) { 1643 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) 1644 && error != EWOULDBLOCK) { 1645 slp->ns_flag |= SLP_DISCONN; 1646 goto dorecs; 1647 } 1648 } 1649 } while (mp); 1650 } 1651 1652 /* 1653 * Now try and process the request records, non-blocking. 1654 */ 1655 dorecs: 1656 if (waitflag == M_DONTWAIT && 1657 (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) 1658 nfsrv_wakenfsd(slp); 1659 } 1660 1661 /* 1662 * Try and extract an RPC request from the mbuf data list received on a 1663 * stream socket. The "waitflag" argument indicates whether or not it 1664 * can sleep. 1665 */ 1666 int 1667 nfsrv_getstream(struct nfssvc_sock *slp, int waitflag) 1668 { 1669 struct mbuf *m, **mpp; 1670 char *cp1, *cp2; 1671 int len; 1672 struct mbuf *om, *m2, *recm; 1673 u_int32_t recmark; 1674 1675 if (slp->ns_flag & SLP_GETSTREAM) 1676 return (0); 1677 slp->ns_flag |= SLP_GETSTREAM; 1678 for (;;) { 1679 if (slp->ns_reclen == 0) { 1680 if (slp->ns_cc < NFSX_UNSIGNED) { 1681 slp->ns_flag &= ~SLP_GETSTREAM; 1682 return (0); 1683 } 1684 m = slp->ns_raw; 1685 if (m->m_len >= NFSX_UNSIGNED) { 1686 bcopy(mtod(m, caddr_t), &recmark, 1687 NFSX_UNSIGNED); 1688 m->m_data += NFSX_UNSIGNED; 1689 m->m_len -= NFSX_UNSIGNED; 1690 } else { 1691 cp1 = (caddr_t)&recmark; 1692 cp2 = mtod(m, caddr_t); 1693 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) { 1694 while (m->m_len == 0) { 1695 m = m->m_next; 1696 cp2 = mtod(m, caddr_t); 1697 } 1698 *cp1++ = *cp2++; 1699 m->m_data++; 1700 m->m_len--; 1701 } 1702 } 1703 slp->ns_cc -= NFSX_UNSIGNED; 1704 recmark = ntohl(recmark); 1705 slp->ns_reclen = recmark & ~0x80000000; 1706 if (recmark & 0x80000000) 1707 slp->ns_flag |= SLP_LASTFRAG; 1708 else 1709 slp->ns_flag &= ~SLP_LASTFRAG; 1710 if (slp->ns_reclen > NFS_MAXPACKET) { 1711 slp->ns_flag &= ~SLP_GETSTREAM; 1712 return (EPERM); 1713 } 1714 } 1715 1716 /* 1717 * Now get the record part. 1718 */ 1719 recm = NULL; 1720 if (slp->ns_cc == slp->ns_reclen) { 1721 recm = slp->ns_raw; 1722 slp->ns_raw = slp->ns_rawend = NULL; 1723 slp->ns_cc = slp->ns_reclen = 0; 1724 } else if (slp->ns_cc > slp->ns_reclen) { 1725 len = 0; 1726 m = slp->ns_raw; 1727 om = NULL; 1728 while (len < slp->ns_reclen) { 1729 if ((len + m->m_len) > slp->ns_reclen) { 1730 m2 = m_copym(m, 0, slp->ns_reclen - len, 1731 waitflag); 1732 if (m2) { 1733 if (om) { 1734 om->m_next = m2; 1735 recm = slp->ns_raw; 1736 } else 1737 recm = m2; 1738 m->m_data += slp->ns_reclen-len; 1739 m->m_len -= slp->ns_reclen-len; 1740 len = slp->ns_reclen; 1741 } else { 1742 slp->ns_flag &= ~SLP_GETSTREAM; 1743 return (EWOULDBLOCK); 1744 } 1745 } else if ((len + m->m_len) == slp->ns_reclen) { 1746 om = m; 1747 len += m->m_len; 1748 m = m->m_next; 1749 recm = slp->ns_raw; 1750 om->m_next = NULL; 1751 } else { 1752 om = m; 1753 len += m->m_len; 1754 m = m->m_next; 1755 } 1756 } 1757 slp->ns_raw = m; 1758 slp->ns_cc -= len; 1759 slp->ns_reclen = 0; 1760 } else { 1761 slp->ns_flag &= ~SLP_GETSTREAM; 1762 return (0); 1763 } 1764 1765 /* 1766 * Accumulate the fragments into a record. 1767 */ 1768 mpp = &slp->ns_frag; 1769 while (*mpp) 1770 mpp = &((*mpp)->m_next); 1771 *mpp = recm; 1772 if (slp->ns_flag & SLP_LASTFRAG) { 1773 if (slp->ns_recend) 1774 slp->ns_recend->m_nextpkt = slp->ns_frag; 1775 else 1776 slp->ns_rec = slp->ns_frag; 1777 slp->ns_recend = slp->ns_frag; 1778 slp->ns_frag = NULL; 1779 } 1780 } 1781 } 1782 1783 /* 1784 * Parse an RPC header. 1785 */ 1786 int 1787 nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd, 1788 struct nfsrv_descript **ndp) 1789 { 1790 struct mbuf *m, *nam; 1791 struct nfsrv_descript *nd; 1792 int error; 1793 1794 *ndp = NULL; 1795 if ((slp->ns_flag & SLP_VALID) == 0 || 1796 (m = slp->ns_rec) == NULL) 1797 return (ENOBUFS); 1798 slp->ns_rec = m->m_nextpkt; 1799 if (slp->ns_rec) 1800 m->m_nextpkt = NULL; 1801 else 1802 slp->ns_recend = NULL; 1803 if (m->m_type == MT_SONAME) { 1804 nam = m; 1805 m = m->m_next; 1806 nam->m_next = NULL; 1807 } else 1808 nam = NULL; 1809 nd = pool_get(&nfsrv_descript_pl, PR_WAITOK); 1810 nfs_realign(&m, 10 * NFSX_UNSIGNED); 1811 nd->nd_md = nd->nd_mrep = m; 1812 nd->nd_nam2 = nam; 1813 nd->nd_dpos = mtod(m, caddr_t); 1814 error = nfs_getreq(nd, nfsd, 1); 1815 if (error) { 1816 m_freem(nam); 1817 pool_put(&nfsrv_descript_pl, nd); 1818 return (error); 1819 } 1820 *ndp = nd; 1821 nfsd->nfsd_nd = nd; 1822 return (0); 1823 } 1824 1825 1826 /* 1827 * Search for a sleeping nfsd and wake it up. 1828 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the 1829 * running nfsds will go look for the work in the nfssvc_sock list. 1830 */ 1831 void 1832 nfsrv_wakenfsd(struct nfssvc_sock *slp) 1833 { 1834 struct nfsd *nfsd; 1835 1836 if ((slp->ns_flag & SLP_VALID) == 0) 1837 return; 1838 1839 TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) { 1840 if (nfsd->nfsd_flag & NFSD_WAITING) { 1841 nfsd->nfsd_flag &= ~NFSD_WAITING; 1842 if (nfsd->nfsd_slp) 1843 panic("nfsd wakeup"); 1844 slp->ns_sref++; 1845 nfsd->nfsd_slp = slp; 1846 wakeup_one(nfsd); 1847 return; 1848 } 1849 } 1850 1851 slp->ns_flag |= SLP_DOREC; 1852 nfsd_head_flag |= NFSD_CHECKSLP; 1853 } 1854 #endif /* NFSSERVER */ 1855