/*	$OpenBSD: nfs_socket.c,v 1.114 2017/03/03 09:41:20 mpi Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfs_var.h>

/* External data, mostly RPC constants in XDR form. */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
	rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

extern struct pool nfsrv_descript_pl;

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (average rtt roughly 3 times larger), most likely due
 * to the large rtt that nfs rpcs have.
 */
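/*
 * Illustrative arithmetic (derived from the code below): with
 * NFS_CWNDSCALE == 256, a window of 8 outstanding rpcs is stored as
 * 2048.  A retransmit timeout halves that to 1024 (4 rpcs); each reply
 * received with a full window adds (256*256 + 1024)/2048 == 32, so
 * roughly 8 replies -- one full window, i.e. one round trip -- grow
 * the window by one rpc.
 */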
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256 };

/* RTT estimator */
enum nfs_rto_timers nfs_ptimers[NFS_NPROCS] = {
	NFS_DEFAULT_TIMER,	/* NULL */
	NFS_GETATTR_TIMER,	/* GETATTR */
	NFS_DEFAULT_TIMER,	/* SETATTR */
	NFS_LOOKUP_TIMER,	/* LOOKUP */
	NFS_GETATTR_TIMER,	/* ACCESS */
	NFS_READ_TIMER,		/* READLINK */
	NFS_READ_TIMER,		/* READ */
	NFS_WRITE_TIMER,	/* WRITE */
	NFS_DEFAULT_TIMER,	/* CREATE */
	NFS_DEFAULT_TIMER,	/* MKDIR */
	NFS_DEFAULT_TIMER,	/* SYMLINK */
	NFS_DEFAULT_TIMER,	/* MKNOD */
	NFS_DEFAULT_TIMER,	/* REMOVE */
	NFS_DEFAULT_TIMER,	/* RMDIR */
	NFS_DEFAULT_TIMER,	/* RENAME */
	NFS_DEFAULT_TIMER,	/* LINK */
	NFS_READ_TIMER,		/* READDIR */
	NFS_READ_TIMER,		/* READDIRPLUS */
	NFS_DEFAULT_TIMER,	/* FSSTAT */
	NFS_DEFAULT_TIMER,	/* FSINFO */
	NFS_DEFAULT_TIMER,	/* PATHCONF */
	NFS_DEFAULT_TIMER,	/* COMMIT */
	NFS_DEFAULT_TIMER,	/* NOOP */
};

void	nfs_init_rtt(struct nfsmount *);
void	nfs_update_rtt(struct nfsreq *);
int	nfs_estimate_rto(struct nfsmount *, u_int32_t procnum);

void	nfs_realign(struct mbuf **, int);
void	nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);

int	nfs_rcvlock(struct nfsreq *);
int	nfs_receive(struct nfsreq *, struct mbuf **, struct mbuf **);
int	nfs_reconnect(struct nfsreq *);
int	nfs_reply(struct nfsreq *);
void	nfs_msg(struct nfsreq *, char *);
void	nfs_rcvunlock(int *);

int	nfsrv_getstream(struct nfssvc_sock *, int);

unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

/* Initialize the RTT estimator state for a new mount point. */
void
nfs_init_rtt(struct nfsmount *nmp)
{
	int i;

	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_srtt[i] = NFS_INITRTT;
	for (i = 0; i < NFS_MAX_TIMER; i++)
		nmp->nm_sdrtt[i] = 0;
}

/*
 * Update a mount point's RTT estimator state using data from the
 * passed-in request.
 *
 * Use a gain of 0.125 on the mean and a gain of 0.25 on the deviation.
 *
 * NB: Since the timer resolution of NFS_HZ is so coarse, it can often
 * result in r_rtt == 0. Since r_rtt == N means that the actual RTT is
 * between N + dt and N + 2 - dt ticks, add 1 before calculating the
 * update values.
 */
void
nfs_update_rtt(struct nfsreq *rep)
{
	int t1 = rep->r_rtt + 1;
	int index = nfs_ptimers[rep->r_procnum] - 1;
	int *srtt = &rep->r_nmp->nm_srtt[index];
	int *sdrtt = &rep->r_nmp->nm_sdrtt[index];

	t1 -= *srtt >> 3;
	*srtt += t1;
	if (t1 < 0)
		t1 = -t1;
	t1 -= *sdrtt >> 2;
	*sdrtt += t1;
}
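/*
 * Worked example of the fixed-point arithmetic above: srtt is kept
 * scaled by 8 and sdrtt by 4.  With *srtt == 48 (a smoothed RTT of 6
 * ticks) and a new sample r_rtt == 9, t1 = (9 + 1) - 48/8 = 4, so
 * *srtt becomes 52 and the smoothed RTT moves 1/8 of the way from 6
 * toward the 10-tick sample.
 */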
/*
 * Estimate RTO for an NFS RPC sent via an unreliable datagram.
 *
 * Use the mean and mean deviation of RTT for the appropriate type
 * of RPC for the frequent RPCs and a default for the others.
 * The justification for doing "other" this way is that these RPCs
 * happen so infrequently that the timer estimate would probably be
 * stale.  Also, since many of these RPCs are non-idempotent, a
 * conservative timeout is desired.
 *
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
int
nfs_estimate_rto(struct nfsmount *nmp, u_int32_t procnum)
{
	enum nfs_rto_timers timer = nfs_ptimers[procnum];
	int index = timer - 1;
	int rto;

	switch (timer) {
	case NFS_GETATTR_TIMER:
	case NFS_LOOKUP_TIMER:
		rto = ((nmp->nm_srtt[index] + 3) >> 2) +
		    ((nmp->nm_sdrtt[index] + 1) >> 1);
		break;
	case NFS_READ_TIMER:
	case NFS_WRITE_TIMER:
		rto = ((nmp->nm_srtt[index] + 7) >> 3) +
		    (nmp->nm_sdrtt[index] + 1);
		break;
	default:
		rto = nmp->nm_timeo;
		return (rto);
	}

	if (rto < NFS_MINRTO)
		rto = NFS_MINRTO;
	else if (rto > NFS_MAXRTO)
		rto = NFS_MAXRTO;

	return (rto);
}

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
{
	struct socket *so;
	int s, error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *m;

	nmp->nm_so = NULL;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port
	 * number. We always allocate a reserved port, as this prevents
	 * filehandle disclosure through UDP port capture.
	 */
	if (saddr->sa_family == AF_INET) {
		struct mbuf *mopt;
		int *ip;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		MGET(m, M_WAIT, MT_SONAME);
		sin = mtod(m, struct sockaddr_in *);
		memset(sin, 0, sizeof(*sin));
		sin->sin_len = m->m_len = sizeof(struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, m, &proc0);
		m_freem(m);
		if (error)
			goto bad;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than
	 * NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete. Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long
		 * time.
		 */
		s = splsoftnet();
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			(void) tsleep((caddr_t)&so->so_timeo, PSOCK,
			    "nfscon", 2 * hz);
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){
				so->so_state &= ~SS_ISCONNECTING;
				splx(s);
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto bad;
		}
		splx(s);
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo = (5 * hz);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo = (5 * hz);
	else
		so->so_snd.sb_timeo = 0;
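	/*
	 * Reserve socket buffer space for the largest request/reply pair:
	 * datagram sockets get one write-sized packet of send space and
	 * two read-sized packets of receive space; seqpacket sockets
	 * double the send side as well; stream sockets additionally
	 * leave room for the 32-bit record mark on both sides.
	 */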
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	}
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;

	/* Initialize other non-zero congestion variables */
	nfs_init_rtt(nmp);
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(struct nfsreq *rep)
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int s, error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		(void) tsleep((caddr_t)&lbolt, PSOCK, "nfsrecon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	s = splsoftnet();
	TAILQ_FOREACH(rp, &nmp->nm_reqsq, r_chain) {
		rp->r_flags |= R_MUSTRESEND;
		rp->r_rexmit = 0;
	}
	splx(s);
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(struct nfsmount *nmp)
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = NULL;
		soshutdown(so, SHUT_RDWR);
		soclose(so);
	}
}
/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(struct socket *so, struct mbuf *nam, struct mbuf *top,
    struct nfsreq *rep)
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = NULL;
	else
		sendnam = nam;
	if (so->so_type == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	error = sosend(so, sendnam, NULL, top, NULL, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(struct nfsreq *rep, struct mbuf **aname, struct mbuf **mp)
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = NULL;
	*aname = NULL;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 *
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
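		/*
		 * Stream sockets carry the RPC message inside record
		 * fragments: each fragment starts with a 32-bit mark
		 * whose high bit is set on the last fragment and whose
		 * low 31 bits give the fragment length.  Read the mark
		 * first, then read exactly that many bytes of reply.
		 */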
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, NULL, NULL,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >=
					    rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%zu) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%u) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, NULL, &auio, mp, NULL,
				    &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%zu/%u) from nfs server %s\n",
				    len - auio.uio_resid, len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control messages, but must grab
			 * them and then throw them away so we know what is
			 * going on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, NULL, &auio, mp,
				    &control, &rcvflg, 0);
				m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
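		/*
		 * A hard receive error leaves the record stream in an
		 * unknown state: drop whatever partial data we have,
		 * reconnect, and let the resend logic reissue the
		 * request from the top.
		 */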
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freem(*mp);
			*mp = NULL;
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = NULL;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp, NULL,
			    &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error) {
		m_freem(*mp);
		*mp = NULL;
	}
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(struct nfsreq *myrep)
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	struct nfsm_info info;
	struct mbuf *nam;
	u_int32_t rxid, *tl, t1;
	caddr_t cp2;
	int s, error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next RPC reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &info.nmi_mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {
			/*
			 * Ignore routing errors on connectionless
			 * protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		info.nmi_md = info.nmi_mrep;
		info.nmi_dpos = mtod(info.nmi_md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(info.nmi_mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		s = splsoftnet();
		TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = info.nmi_mrep;
				rep->r_md = info.nmi_md;
				rep->r_dpos = info.nmi_dpos;

				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;

				if (rep->r_flags & R_TIMING)
					nfs_update_rtt(rep);

				nmp->nm_timeouts = 0;
				break;
			}
		}
		splx(s);
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == NULL) {
			nfsstats.rpcunexpected++;
			m_freem(info.nmi_mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(struct vnode *vp, int procnum, struct nfsm_info *infop)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsmount *nmp;
	struct timeval tv;
	caddr_t cp2;
	int t1, i, s, error = 0;
	int trylater_delay;
	struct nfsreq *rep;
	int mrest_len;
	struct nfsm_info info;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = infop->nmi_procp;
	rep->r_procnum = procnum;

	mrest_len = 0;
	m = infop->nmi_mreq;
	while (m) {
		mrest_len += m->m_len;
		m = m->m_next;
	}

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = infop->nmi_mreq;
	rep->r_mreq->m_pkthdr.len = mrest_len;

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, infop->nmi_cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	rep->r_rtt = rep->r_rexmit = 0;
	if (nfs_ptimers[rep->r_procnum] != NFS_DEFAULT_TIMER)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	s = splsoftnet();
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_add(&nmp->nm_rtimeout, nfs_ticks);
	TAILQ_INSERT_TAIL(&nmp->nm_reqsq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		splx(s);
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT), rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		splx(s);
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	s = splsoftnet();
	TAILQ_REMOVE(&nmp->nm_reqsq, rep, r_chain);
	if (TAILQ_EMPTY(&nmp->nm_reqsq))
		timeout_del(&nmp->nm_rtimeout);
	splx(s);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg was printed
	 * earlier, tprintf that the server is alive again.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	info.nmi_mrep = rep->r_mrep;
	info.nmi_md = rep->r_md;
	info.nmi_dpos = rep->r_dpos;
	if (error) {
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
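	/*
	 * The three words are the reply status (accepted or denied),
	 * the verifier flavor and the verifier length; on MSG_DENIED
	 * the word after the status carries the rejection reason
	 * instead.
	 */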
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;	/* Should be EAUTH. */
		infop->nmi_mrep = NULL;
		goto nfsmout1;
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifier type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
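			/*
			 * The server can't service the request right
			 * now (NFSv3 "try again later"); back off
			 * exponentially from NFS_MINTIMEO up to
			 * NFS_MAXTIMEO and reissue the RPC.
			 */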
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(info.nmi_mrep);
				error = 0;
				tv.tv_sec = trylater_delay;
				tv.tv_usec = 0;
				tsleep(&tv, PSOCK, "nfsretry", tvtohz(&tv));
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);
		}
		goto nfsmout;
	}

	error = EPROTONOSUPPORT;

nfsmout:
	infop->nmi_mrep = info.nmi_mrep;
	infop->nmi_md = info.nmi_md;
	infop->nmi_dpos = info.nmi_dpos;
nfsmout1:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
    int err, struct mbuf **mrq, struct mbuf **mbp)
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= MHLEN - max_hdr) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}

/*
 * nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out.
 */
void
nfs_timer(void *arg)
{
	struct nfsmount *nmp = arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	int timeo, s, error;

	NET_LOCK(s);
	TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = nfs_estimate_rto(nmp, rep->r_procnum);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}

		/* Check for server not responding. */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 && rep->r_rexmit > 4) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= nmp->nm_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
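		/*
		 * For connection oriented sockets the transport does
		 * its own retransmission; just bump the counter so the
		 * soft mount and tprintf checks above still trigger.
		 */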
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}

		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = (*so->so_proto->pr_usrreq)(so,
				    PRU_SEND, m, NULL, NULL, curproc);
			else
				error = (*so->so_proto->pr_usrreq)(so,
				    PRU_SEND, m, nmp->nm_nam, NULL, curproc);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}
	NET_UNLOCK(s);
	timeout_add(&nmp->nm_rtimeout, nfs_ticks);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct proc *p)
{
	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && p->p_siglist &&
	    (((p->p_siglist & ~p->p_sigmask) &
	    ~p->p_p->ps_sigacts->ps_sigignore) & NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(int *flagp, struct nfsreq *rep)
{
	struct proc *p;
	int slpflag = 0, slptimeo = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = NULL;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1),
		    "nfsndlck", slptimeo);
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the send side of the socket for others.
 */
void
nfs_sndunlock(int *flagp)
{
	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}
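/*
 * Lock the receive side of the socket so that only one process at a
 * time sits in soreceive() for this mount.  Give up without taking
 * the lock if our reply arrives while we are waiting for it.
 */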
int
nfs_rcvlock(struct nfsreq *rep)
{
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag, slptimeo = 0;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1),
		    "nfsrcvlk", slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the receive side of the socket for others.
 */
void
nfs_rcvunlock(int *flagp)
{
	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup((caddr_t)flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with
 * m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realign happens at the chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > M_TRAILINGSPACE(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;
	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure
 * these parameters are correctly aligned.
 */
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
#define ALIGN_POINTER(n) ((u_int)(((n) + sizeof(void *)) & ~sizeof(void *)))
			if (ALIGN_POINTER(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}

	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t), M_WAIT);

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freem(*pm);
		*pm = n;
	}
}

/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;
	caddr_t cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct nfsm_info info;

	info.nmi_mrep = nd->nd_mrep;
	info.nmi_md = nd->nd_md;
	info.nmi_dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(info.nmi_mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
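	/*
	 * An AUTH_UNIX credential body is: stamp, machine name (length
	 * plus padded string), uid, gid, and a counted array of
	 * supplementary gids, followed by the verifier flavor and
	 * length.
	 */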
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		memset(&nd->nd_cr, 0, sizeof (struct ucred));
		nd->nd_cr.cr_ref = 1;
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		for (i = 0; i < len; i++)
			if (i < NGROUPS_MAX)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		nd->nd_cr.cr_ngroups = (len > NGROUPS_MAX) ?
		    NGROUPS_MAX : len;
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(info.nmi_mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = info.nmi_md;
	nd->nd_dpos = info.nmi_dpos;
	return (0);
nfsmout:
	return (error);
}

void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(struct socket *so, caddr_t arg, int waitflag)
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	/* Defer soreceive() to an nfsd. */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ;
		goto dorecs;
	}

	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, NULL, &flags, 0);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    NULL, &flags, 0);
			if (mp) {
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = NULL;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
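/*
 * The raw stream is consumed as a sequence of record fragments: read a
 * 32-bit record mark (high bit set on the last fragment, low 31 bits
 * the fragment length), carve that many bytes off the raw chain
 * (ns_raw/ns_cc) onto ns_frag, and when the final fragment completes,
 * move the assembled record onto the ns_rec queue for an nfsd to pick
 * up.
 */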
int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		return (0);
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 <
				    ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0,
					    slp->ns_reclen - len, waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data +=
						    slp->ns_reclen - len;
						m->m_len -=
						    slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) ==
				    slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = NULL;
		}
	}
}
/*
 * Dequeue the next request record from a server socket and parse its
 * RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
    struct nfsrv_descript **ndp)
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == NULL)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = NULL;
	else
		slp->ns_recend = NULL;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = pool_get(&nfsrv_descript_pl, PR_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		pool_put(&nfsrv_descript_pl, nd);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}

/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */