/*	$OpenBSD: nfs_socket.c,v 1.119 2017/06/27 12:02:43 mpi Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfs_var.h>

/* External data, mostly RPC constants in XDR form. */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
    rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

extern struct pool nfsrv_descript_pl;

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arithmetic.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (average rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
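/*
 * Worked example of the scaled arithmetic (illustrative only; see
 * nfs_reply() and nfs_timer() below): with NFS_CWNDSCALE == 256, a
 * window of four outstanding rpcs is nm_cwnd == 1024.  Each reply
 * then grows the window by (256 * 256 + 512) / 1024 == 64, i.e. one
 * full rpc (256) per round trip of four replies.  A retransmit
 * timeout halves nm_cwnd, but never below NFS_CWNDSCALE (one rpc).
 */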
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };

/* RTT estimator */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void	nfs_init_rtt(struct nfsmount *);
void	nfs_update_rtt(struct nfsreq *);
int	nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void	nfs_realign(struct mbuf **, int);
void	nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);

int	nfs_rcvlock(struct nfsreq *);
int	nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **);
int	nfs_reconnect(struct nfsreq *);
int	nfs_reply(struct nfsreq *);
void	nfs_msg(struct nfsreq *, char *);
void	nfs_rcvunlock(int *);

int	nfsrv_getstream(struct nfssvc_sock *, int);

unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}
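
/*
 * Numeric sketch of the fixed-point update above (illustrative only):
 * the stored values are kept scaled, *srtt by 8 and *sdrtt by 4, so
 * the shifts implement the 0.125 and 0.25 gains without floating
 * point.  E.g. with *srtt == 40 and a sample of r_rtt + 1 == 9,
 * t1 = 9 - (40 >> 3) = 4 and *srtt becomes 44; the error magnitude 4
 * then feeds the deviation estimate the same way with a >> 2 gain.
 */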

/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that the timer estimate would probably be
 * stale.  Also, since many of these RPCs are non-idempotent, a
 * conservative timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}
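
/*
 * Example with the scaled estimator state above (illustrative only):
 * nm_srtt == 32 and nm_sdrtt == 12 give a getattr/lookup rto of
 * ((32 + 3) >> 2) + ((12 + 1) >> 1) == 8 + 6 == 14 ticks, and a
 * read/write rto of ((32 + 7) >> 3) + (12 + 1) == 4 + 13 == 17 ticks,
 * before clamping to [NFS_MINRTO, NFS_MAXRTO].
 */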

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if an error occurs.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int s, error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *m;

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port number.
	 * We always allocate a reserved port, as this prevents filehandle
	 * disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		struct mbuf *mopt;
		int *ip;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		MGET(m, M_WAIT, MT_SONAME);
		sin = mtod(m, struct sockaddr_in *);
		memset(sin, 0, sizeof(*sin));
		sin->sin_len = m->m_len = sizeof(struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, m, &proc0);
		m_freem(m);
		if (error)
			goto bad;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete. Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		s = solock(so);
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			sosleep(so, &so->so_timeo, PSOCK, "nfscon", 2 * hz);
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0) {
				so->so_state &= ~SS_ISCONNECTING;
				sounlock(s);
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			sounlock(s);
			goto bad;
		}
		sounlock(s);
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo = (5 * hz);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo = (5 * hz);
	else
		so->so_snd.sb_timeo = 0;
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	}
	s = solock(so);
	error = soreserve(so, sndreserve, rcvreserve);
	sounlock(s);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;

	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		(void) tsleep((caddr_t)&lbolt, PSOCK, "nfsrecon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	if (so->so_type == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
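
/*
 * Layout sketch of the Sun RPC record mark handled below (per the
 * RFC 1831 record marking standard; illustrative only): on a stream
 * socket each fragment is preceded by a 4-byte big-endian word whose
 * high bit says "last fragment of this record" and whose low 31 bits
 * give the fragment length.  So a 200-byte call sent as one fragment
 * is framed as the word 0x800000c8 followed by the 200 bytes; the
 * receiver strips the top bit with "& ~0x80000000" to recover the
 * length.
 */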

/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >=
					    rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%u) from nfs server %s\n",
				    len - auio.uio_resid, len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msg., but must grab them
			 * and then throw them away so we know what is going
			 * on.
			 */
			auio.uio_resid = len = 100000000;   /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp, &control,
				    &rcvflg, 0);
				m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freemp(mp);
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error)
		m_freemp(mp);
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl, t1;
	caddr_t cp2;
	int error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {
			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == NULL) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	struct timeval tv;
	caddr_t cp2;
	int t1, i, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	int mrest_len;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	mrest_len = 0;
	m = infop->nmi_mreq;
	while (m) {
		mrest_len += m->m_len;
		m = m->m_next;
	}

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_pkthdr.len = mrest_len;

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT), rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If a tprintf message was printed for this request ("not
	 * responding"), report that the server is alive again.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX at the moment we step over the
	 * reply verifier type, and in the (error) case that there really is
	 * any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
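			/*
			 * Sketch of the retry pacing below (illustrative
			 * only): trylater_delay starts at NFS_MINTIMEO,
			 * is multiplied by NFS_TIMEOUTMUL after each
			 * NFSERR_TRYLATER reply, and is capped at
			 * NFS_MAXTIMEO before the request is re-queued
			 * via "goto tryagain".
			 */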
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(info.nmi_mrep);
				error = 0;
				tv.tv_sec = trylater_delay;
				tv.tv_usec = 0;
				tsleep(&tv, PSOCK, "nfsretry", tvtohz(&tv));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
    int err, struct mbuf **mrq, struct mbuf **mbp)
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster; otherwise try to leave
	 * leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= MHLEN - max_hdr) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}

/*
 * nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out.
 */
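/*
 * Retransmit timing sketch (illustrative only): the rto from
 * nfs_estimate_rto() is further scaled by nfs_backoff[nm_timeouts - 1]
 * once timeouts start occurring, so an estimated rto of 10 ticks
 * becomes 20, 40, ... up to 2560 ticks as consecutive timeouts
 * accumulate, until a reply resets nm_timeouts in nfs_reply().
 */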
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, s, error;

	NET_LOCK(s);
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}

		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(so, &so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))) {
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = (*so->so_proto->pr_usrreq)(so,
				    PRU_SEND, m, NULL, NULL, curproc);
			else
				error = (*so->so_proto->pr_usrreq)(so,
				    PRU_SEND, m, nmp->nm_nam, NULL, curproc);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	NET_UNLOCK(s);
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && p->p_siglist &&
	    (((p->p_siglist & ~p->p_sigmask) &
	    ~p->p_p->ps_sigacts->ps_sigignore) & NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	struct proc *p;
	int slpflag = 0, slptimeo = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1),
		    "nfsndlck", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the send lock for others.
 */
void
nfs_sndunlock(int *flagp)
{
	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

int
nfs_rcvlock(struct nfsreq *rep)
{
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag, slptimeo = 0;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1),
		    "nfsrcvlk", slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the receive lock for others.
 */
void
nfs_rcvunlock(int *flagp)
{
	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup((caddr_t)flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realignment happens at the chain's
	 * tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > M_TRAILINGSPACE(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;
	} while (!ALIGNED_POINTER(n->m_len, void *));
}
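
/*
 * Example of the padding above (illustrative only): on a platform with
 * 8-byte pointer alignment, a tail mbuf holding 6 bytes gets
 * ALIGN(6) - 6 == 2 bytes pulled in from the next source mbuf, leaving
 * both chains describing the same data but with the destination's
 * m_len a multiple of the alignment.
 */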

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure these
 * parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freemp(pm);
		*pm = n;
	}
}
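
/*
 * On-the-wire XDR layout of the AUTH_UNIX credential parsed below
 * (per the ONC RPC specification; all fields are 4-byte XDR words
 * except the padded opaque machine name):
 *
 *	stamp, machinename<length + data>, uid, gid,
 *	gid count, gids[], verifier flavor, verifier length
 *
 * nfs_getreq() steps over the stamp and machine name, keeps uid/gid
 * and up to NGROUPS_MAX supplementary gids, and skips the verifier.
 */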

/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;
	caddr_t cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		memset(&nd->nd_cr, 0, sizeof (struct ucred));
		nd->nd_cr.cr_ref = 1;
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		for (i = 0; i < len; i++)
			if (i < NGROUPS_MAX)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ?
		    NGROUPS_MAX : len;
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	/* Defer soreceive() to an nfsd. */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}

	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL, &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    NULL, &flags, 0);
			if (mp) {
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}
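
/*
 * Record fragment example (illustrative only): a request sent in two
 * fragments arrives in the raw stream as
 *
 *	0x00000100 <256 bytes> 0x80000040 <64 bytes>
 *
 * i.e. only the final fragment's record mark has the high bit set.
 * nfsrv_getstream() below strips each mark, accumulates the fragments
 * in ns_frag, and moves the completed 320-byte record onto the ns_rec
 * queue once SLP_LASTFRAG is seen.
 */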

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}

/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */