/*	$OpenBSD: nfs_socket.c,v 1.78 2009/02/22 07:47:22 otto Exp $	*/
/*	$NetBSD: nfs_socket.c,v 1.27 1996/04/15 20:20:00 thorpej Exp $	*/

/*
 * Copyright (c) 1989, 1991, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_socket.c	8.5 (Berkeley) 3/30/95
 */

/*
 * Socket operations for use by nfs
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/vnode.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/tprintf.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/queue.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/xdr_subs.h>
#include <nfs/nfsm_subs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <nfs/nfsrtt.h>
#include <nfs/nfs_var.h>

/*
 * Estimate rto for an nfs rpc sent via an unreliable datagram.
 * Use the mean and mean deviation of rtt for the appropriate type of rpc
 * for the frequent rpcs and a default for the others.
 * The justification for doing "other" this way is that these rpcs
 * happen so infrequently that timer estimates would probably be stale.
 * Also, since many of these rpcs are
 * non-idempotent, a conservative timeout is desired.
 * getattr, lookup - A+2D
 * read, write - A+4D
 * other - nm_timeo
 */
#define	NFS_RTO(n, t) \
	((t) == 0 ? (n)->nm_timeo : \
	 ((t) < 3 ? \
	  (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
	  ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
#define	NFS_SRTT(r)	(r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
#define	NFS_SDRTT(r)	(r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
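/*
 * Worked example (illustrative, using the scaling established in
 * nfs_reply(): nm_srtt holds roughly 8 * the smoothed mean rtt and
 * nm_sdrtt roughly 4 * the mean deviation).  With mean A = 4 ticks and
 * deviation D = 2 ticks, nm_srtt = 32 and nm_sdrtt = 8, so for
 * getattr/lookup (t < 3): (((32 + 3) >> 2) + 8 + 1) >> 1 = 8 ~= A + 2D,
 * and for read/write (t >= 3): ((32 + 7) >> 3) + 8 + 1 = 13 ~= A + 4D.
 */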
/*
 * External data, mostly RPC constants in XDR form
 */
extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
	rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr;
extern u_int32_t nfs_prog;
extern struct nfsstats nfsstats;
extern int nfsv3_procid[NFS_NPROCS];
extern int nfs_ticks;

/*
 * Defines which timer to use for the procnum.
 * 0 - default
 * 1 - getattr
 * 2 - lookup
 * 3 - read
 * 4 - write
 */
static int proct[NFS_NPROCS] = {
	0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
	0, 0, 0,
};

/*
 * There is a congestion window for outstanding rpcs maintained per mount
 * point. The cwnd size is adjusted in roughly the way that:
 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
 * SIGCOMM '88". ACM, August 1988.
 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
 * of rpcs is in progress.
 * (The sent count and cwnd are scaled for integer arith.)
 * Variants of "slow start" were tried and were found to be too much of a
 * performance hit (ave. rtt 3 times larger),
 * I suspect due to the large rtt that nfs rpcs have.
 */
#define	NFS_CWNDSCALE	256
#define	NFS_MAXCWND	(NFS_CWNDSCALE * 32)
static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
int nfsrtton = 0;
struct nfsrtt nfsrtt;

void	nfs_realign(struct mbuf **, int);
void	nfs_realign_fixup(struct mbuf *, struct mbuf *, unsigned int *);
unsigned int nfs_realign_test = 0;
unsigned int nfs_realign_count = 0;

struct nfsreqhead nfs_reqq;

/*
 * Initialize sockets and congestion for a new NFS connection.
 * We do not free the sockaddr if error.
 */
int
nfs_connect(nmp, rep)
	struct nfsmount *nmp;
	struct nfsreq *rep;
{
	struct socket *so;
	int s, error, rcvreserve, sndreserve;
	struct sockaddr *saddr;
	struct sockaddr_in *sin;
	struct mbuf *m;

	nmp->nm_so = (struct socket *)0;
	saddr = mtod(nmp->nm_nam, struct sockaddr *);
	error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
	    nmp->nm_soproto);
	if (error)
		goto bad;
	so = nmp->nm_so;
	nmp->nm_soflags = so->so_proto->pr_flags;

	/*
	 * Some servers require that the client port be a reserved port number.
	 * We always allocate a reserved port, as this prevents filehandle
	 * disclosure through UDP port capture.
	 */
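	/*
	 * The technique below: temporarily set IP_PORTRANGE_LOW, bind to
	 * port 0 so the kernel picks a privileged port (below
	 * IPPORT_RESERVED, i.e. < 1024) for us, then restore
	 * IP_PORTRANGE_DEFAULT so later allocations are unaffected.
	 */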
	if (saddr->sa_family == AF_INET) {
		struct mbuf *mopt;
		int *ip;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_LOW;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;

		MGET(m, M_WAIT, MT_SONAME);
		sin = mtod(m, struct sockaddr_in *);
		sin->sin_len = m->m_len = sizeof (struct sockaddr_in);
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = INADDR_ANY;
		sin->sin_port = htons(0);
		error = sobind(so, m, &proc0);
		m_freem(m);
		if (error)
			goto bad;

		MGET(mopt, M_WAIT, MT_SOOPTS);
		mopt->m_len = sizeof(int);
		ip = mtod(mopt, int *);
		*ip = IP_PORTRANGE_DEFAULT;
		error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
		if (error)
			goto bad;
	}

	/*
	 * Protocols that do not require connections may be optionally left
	 * unconnected for servers that reply from a port other than NFS_PORT.
	 */
	if (nmp->nm_flag & NFSMNT_NOCONN) {
		if (nmp->nm_soflags & PR_CONNREQUIRED) {
			error = ENOTCONN;
			goto bad;
		}
	} else {
		error = soconnect(so, nmp->nm_nam);
		if (error)
			goto bad;

		/*
		 * Wait for the connection to complete. Cribbed from the
		 * connect system call but with the wait timing out so
		 * that interruptible mounts don't hang here for a long time.
		 */
		s = splsoftnet();
		while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
			(void) tsleep((caddr_t)&so->so_timeo, PSOCK,
			    "nfscon", 2 * hz);
			if ((so->so_state & SS_ISCONNECTING) &&
			    so->so_error == 0 && rep &&
			    (error = nfs_sigintr(nmp, rep, rep->r_procp)) != 0){
				so->so_state &= ~SS_ISCONNECTING;
				splx(s);
				goto bad;
			}
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto bad;
		}
		splx(s);
	}
	/*
	 * Always set receive timeout to detect server crash and reconnect.
	 * Otherwise, we can get stuck in soreceive forever.
	 */
	so->so_rcv.sb_timeo = (5 * hz);
	if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT))
		so->so_snd.sb_timeo = (5 * hz);
	else
		so->so_snd.sb_timeo = 0;
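	/*
	 * Socket buffer sizing, as done below: reserve room for a whole
	 * request (write size, or read/readdir size, plus NFS_MAXPKTHDR)
	 * and, presumably so a retransmission can be queued while a reply
	 * is still buffered, double it; the stream case also adds
	 * sizeof(u_int32_t) per message for the RPC record mark.
	 */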
	if (nmp->nm_sotype == SOCK_DGRAM) {
		sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else if (nmp->nm_sotype == SOCK_SEQPACKET) {
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2;
		rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
		    NFS_MAXPKTHDR) * 2;
	} else {
		if (nmp->nm_sotype != SOCK_STREAM)
			panic("nfscon sotype");
		if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
		}
		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
			MGET(m, M_WAIT, MT_SOOPTS);
			*mtod(m, int32_t *) = 1;
			m->m_len = sizeof(int32_t);
			sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m);
		}
		sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
		rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
		    sizeof (u_int32_t)) * 2;
	}
	error = soreserve(so, sndreserve, rcvreserve);
	if (error)
		goto bad;
	so->so_rcv.sb_flags |= SB_NOINTR;
	so->so_snd.sb_flags |= SB_NOINTR;

	/* Initialize other non-zero congestion variables */
	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
	    nmp->nm_srtt[3] = (NFS_TIMEO << 3);
	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
	    nmp->nm_sdrtt[3] = 0;
	nmp->nm_cwnd = NFS_MAXCWND / 2;	    /* Initial send window */
	nmp->nm_sent = 0;
	nmp->nm_timeouts = 0;
	return (0);

bad:
	nfs_disconnect(nmp);
	return (error);
}

/*
 * Reconnect routine:
 * Called when a connection is broken on a reliable protocol.
 * - clean up the old socket
 * - nfs_connect() again
 * - set R_MUSTRESEND for all outstanding requests on mount point
 * If this fails the mount point is DEAD!
 * nb: Must be called with the nfs_sndlock() set on the mount point.
 */
int
nfs_reconnect(rep)
	struct nfsreq *rep;
{
	struct nfsreq *rp;
	struct nfsmount *nmp = rep->r_nmp;
	int s, error;

	nfs_disconnect(nmp);
	while ((error = nfs_connect(nmp, rep)) != 0) {
		if (error == EINTR || error == ERESTART)
			return (EINTR);
		(void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0);
	}

	/*
	 * Loop through outstanding request list and fix up all requests
	 * on old socket.
	 */
	s = splsoftnet();
	TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
		if (rp->r_nmp == nmp) {
			rp->r_flags |= R_MUSTRESEND;
			rp->r_rexmit = 0;
		}
	}
	splx(s);
	return (0);
}

/*
 * NFS disconnect. Clean up and unlink.
 */
void
nfs_disconnect(nmp)
	struct nfsmount *nmp;
{
	struct socket *so;

	if (nmp->nm_so) {
		so = nmp->nm_so;
		nmp->nm_so = (struct socket *)0;
		soshutdown(so, SHUT_RDWR);
		soclose(so);
	}
}

/*
 * This is the nfs send routine. For connection based socket types, it
 * must be called with an nfs_sndlock() on the socket.
 * "rep == NULL" indicates that it has been called from a server.
 * For the client side:
 * - return EINTR if the RPC is terminated, 0 otherwise
 * - set R_MUSTRESEND if the send fails for any reason
 * - do any cleanup required by recoverable socket errors (???)
 * For the server side:
 * - return EINTR or ERESTART if interrupted by a signal
 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
 * - do any cleanup required by recoverable socket errors (???)
 */
int
nfs_send(so, nam, top, rep)
	struct socket *so;
	struct mbuf *nam;
	struct mbuf *top;
	struct nfsreq *rep;
{
	struct mbuf *sendnam;
	int error, soflags, flags;

	if (rep) {
		if (rep->r_flags & R_SOFTTERM) {
			m_freem(top);
			return (EINTR);
		}
		if ((so = rep->r_nmp->nm_so) == NULL) {
			rep->r_flags |= R_MUSTRESEND;
			m_freem(top);
			return (0);
		}
		rep->r_flags &= ~R_MUSTRESEND;
		soflags = rep->r_nmp->nm_soflags;
	} else
		soflags = so->so_proto->pr_flags;
	if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
		sendnam = (struct mbuf *)0;
	else
		sendnam = nam;
	if (so->so_type == SOCK_SEQPACKET)
		flags = MSG_EOR;
	else
		flags = 0;

	error = sosend(so, sendnam, (struct uio *)0, top,
	    (struct mbuf *)0, flags);
	if (error) {
		if (rep) {
			/*
			 * Deal with errors for the client side.
			 */
			if (rep->r_flags & R_SOFTTERM)
				error = EINTR;
			else
				rep->r_flags |= R_MUSTRESEND;
		}

		/*
		 * Handle any recoverable (soft) socket errors here. (???)
		 */
		if (error != EINTR && error != ERESTART &&
		    error != EWOULDBLOCK && error != EPIPE)
			error = 0;
	}
	return (error);
}

#ifdef NFSCLIENT
/*
 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
 * Mark and consolidate the data into a new mbuf list.
 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
 *     small mbufs.
 * For SOCK_STREAM we must be very careful to read an entire record once
 * we have read any of it, even if the system call has been interrupted.
 */
int
nfs_receive(rep, aname, mp)
	struct nfsreq *rep;
	struct mbuf **aname;
	struct mbuf **mp;
{
	struct socket *so;
	struct uio auio;
	struct iovec aio;
	struct mbuf *m;
	struct mbuf *control;
	u_int32_t len;
	struct mbuf **getnam;
	int error, sotype, rcvflg;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Set up arguments for soreceive()
	 */
	*mp = (struct mbuf *)0;
	*aname = (struct mbuf *)0;
	sotype = rep->r_nmp->nm_sotype;

	/*
	 * For reliable protocols, lock against other senders/receivers
	 * in case a reconnect is necessary.
	 * For SOCK_STREAM, first get the Record Mark to find out how much
	 * more there is to get.
	 * We must lock the socket against other receivers
	 * until we have an entire rpc request/reply.
	 */
	if (sotype != SOCK_DGRAM) {
		error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
		if (error)
			return (error);
tryagain:
		/*
		 * Check for fatal errors and resending request.
		 */
		/*
		 * Ugh: If a reconnect attempt just happened, nm_so
		 * would have changed. NULL indicates a failed
		 * attempt that has essentially shut down this
		 * mount point.
		 */
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
			nfs_sndunlock(&rep->r_nmp->nm_flag);
			return (EINTR);
		}
		so = rep->r_nmp->nm_so;
		if (!so) {
			error = nfs_reconnect(rep);
			if (error) {
				nfs_sndunlock(&rep->r_nmp->nm_flag);
				return (error);
			}
			goto tryagain;
		}
		while (rep->r_flags & R_MUSTRESEND) {
			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
			nfsstats.rpcretries++;
			rep->r_rtt = 0;
			rep->r_flags &= ~R_TIMING;
			error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
			if (error) {
				if (error == EINTR || error == ERESTART ||
				    (error = nfs_reconnect(rep)) != 0) {
					nfs_sndunlock(&rep->r_nmp->nm_flag);
					return (error);
				}
				goto tryagain;
			}
		}
		nfs_sndunlock(&rep->r_nmp->nm_flag);
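		/*
		 * For streams, an RPC record is preceded by a 4-byte record
		 * mark in network byte order: the high bit flags the last
		 * fragment of a record and the low 31 bits give the fragment
		 * length.  Read the mark with MSG_WAITALL, then read exactly
		 * that many bytes of RPC data.
		 */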
		if (sotype == SOCK_STREAM) {
			aio.iov_base = (caddr_t) &len;
			aio.iov_len = sizeof(u_int32_t);
			auio.uio_iov = &aio;
			auio.uio_iovcnt = 1;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_READ;
			auio.uio_offset = 0;
			auio.uio_resid = sizeof(u_int32_t);
			auio.uio_procp = p;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, (struct mbuf **)0, &auio,
				    (struct mbuf **)0, (struct mbuf **)0,
				    &rcvflg, 0);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
					/*
					 * looks like the server died after it
					 * received the request, make sure
					 * that we will retransmit and we
					 * don't get stuck here forever.
					 */
					if (rep->r_rexmit >= rep->r_nmp->nm_retry) {
						nfsstats.rpctimeouts++;
						error = EPIPE;
					}
				}
			} while (error == EWOULDBLOCK);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%d/%d) from nfs server %s\n",
				    sizeof(u_int32_t) - auio.uio_resid,
				    sizeof(u_int32_t),
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
			if (error)
				goto errout;

			len = ntohl(len) & ~0x80000000;
			/*
			 * This is SERIOUS! We are out of sync with the sender
			 * and forcing a disconnect/reconnect is all I can do.
			 */
			if (len > NFS_MAXPACKET) {
				log(LOG_ERR, "%s (%d) from nfs server %s\n",
				    "impossible packet length",
				    len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EFBIG;
				goto errout;
			}
			auio.uio_resid = len;
			do {
				rcvflg = MSG_WAITALL;
				error = soreceive(so, (struct mbuf **)0,
				    &auio, mp, (struct mbuf **)0, &rcvflg, 0);
			} while (error == EWOULDBLOCK || error == EINTR ||
			    error == ERESTART);
			if (!error && auio.uio_resid > 0) {
				log(LOG_INFO,
				    "short receive (%d/%d) from nfs server %s\n",
				    len - auio.uio_resid, len,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
				error = EPIPE;
			}
		} else {
			/*
			 * NB: Since uio_resid is big, MSG_WAITALL is ignored
			 * and soreceive() will return when it has either a
			 * control msg or a data msg.
			 * We have no use for control msg., but must grab them
			 * and then throw them away so we know what is going
			 * on.
			 */
			auio.uio_resid = len = 100000000; /* Anything Big */
			auio.uio_procp = p;
			do {
				rcvflg = 0;
				error = soreceive(so, (struct mbuf **)0,
				    &auio, mp, &control, &rcvflg, 0);
				if (control)
					m_freem(control);
				if (error == EWOULDBLOCK && rep) {
					if (rep->r_flags & R_SOFTTERM)
						return (EINTR);
				}
			} while (error == EWOULDBLOCK ||
			    (!error && *mp == NULL && control));
			if ((rcvflg & MSG_EOR) == 0)
				printf("Egad!!\n");
			if (!error && *mp == NULL)
				error = EPIPE;
			len -= auio.uio_resid;
		}
errout:
		if (error && error != EINTR && error != ERESTART) {
			m_freem(*mp);
			*mp = (struct mbuf *)0;
			if (error != EPIPE)
				log(LOG_INFO,
				    "receive error %d from nfs server %s\n",
				    error,
				    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
			error = nfs_sndlock(&rep->r_nmp->nm_flag, rep);
			if (!error) {
				error = nfs_reconnect(rep);
				if (!error)
					goto tryagain;
				nfs_sndunlock(&rep->r_nmp->nm_flag);
			}
		}
	} else {
		if ((so = rep->r_nmp->nm_so) == NULL)
			return (EACCES);
		if (so->so_state & SS_ISCONNECTED)
			getnam = (struct mbuf **)0;
		else
			getnam = aname;
		auio.uio_resid = len = 1000000;
		auio.uio_procp = p;
		do {
			rcvflg = 0;
			error = soreceive(so, getnam, &auio, mp,
			    (struct mbuf **)0, &rcvflg, 0);
			if (error == EWOULDBLOCK &&
			    (rep->r_flags & R_SOFTTERM))
				return (EINTR);
		} while (error == EWOULDBLOCK);
		len -= auio.uio_resid;
	}
	if (error) {
		m_freem(*mp);
		*mp = (struct mbuf *)0;
	}
	/*
	 * Search for any mbufs that are not a multiple of 4 bytes long
	 * or with m_data not longword aligned.
	 * These could cause pointer alignment problems, so copy them to
	 * well aligned mbufs.
	 */
	nfs_realign(mp, 5 * NFSX_UNSIGNED);
	return (error);
}

/*
 * Implement receipt of reply on a socket.
 * We must search through the list of received datagrams matching them
 * with outstanding requests using the xid, until ours is found.
 */
int
nfs_reply(myrep)
	struct nfsreq *myrep;
{
	struct nfsreq *rep;
	struct nfsmount *nmp = myrep->r_nmp;
	int32_t t1;
	struct mbuf *mrep, *nam, *md;
	u_int32_t rxid, *tl;
	caddr_t dpos, cp2;
	int s, error;

	/*
	 * Loop around until we get our own reply
	 */
	for (;;) {
		/*
		 * Lock against other receivers so that I don't get stuck in
		 * sbwait() after someone else has received my reply for me.
		 * Also necessary for connection based protocols to avoid
		 * race conditions during a reconnect.
		 */
		error = nfs_rcvlock(myrep);
		if (error)
			return (error == EALREADY ? 0 : error);

		/*
		 * Get the next Rpc reply off the socket
		 */
		error = nfs_receive(myrep, &nam, &mrep);
		nfs_rcvunlock(&nmp->nm_flag);
		if (error) {

			/*
			 * Ignore routing errors on connectionless protocols??
			 */
			if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
				if (nmp->nm_so)
					nmp->nm_so->so_error = 0;
				continue;
			}
			return (error);
		}
		if (nam)
			m_freem(nam);

		/*
		 * Get the xid and check that it is an rpc reply
		 */
		md = mrep;
		dpos = mtod(md, caddr_t);
		nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
		rxid = *tl++;
		if (*tl != rpc_reply) {
			nfsstats.rpcinvalid++;
			m_freem(mrep);
nfsmout:
			continue;
		}

		/*
		 * Loop through the request list to match up the reply
		 * Iff no match, just drop the datagram
		 */
		s = splsoftnet();
		TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
			if (rep->r_mrep == NULL && rxid == rep->r_xid) {
				/* Found it.. */
				rep->r_mrep = mrep;
				rep->r_md = md;
				rep->r_dpos = dpos;
				if (nfsrtton) {
					struct rttl *rt;

					rt = &nfsrtt.rttl[nfsrtt.pos];
					rt->proc = rep->r_procnum;
					rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
					rt->sent = nmp->nm_sent;
					rt->cwnd = nmp->nm_cwnd;
					rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
					rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
					rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
					getmicrotime(&rt->tstamp);
					if (rep->r_flags & R_TIMING)
						rt->rtt = rep->r_rtt;
					else
						rt->rtt = 1000000;
					nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
				}
				/*
				 * Update congestion window.
				 * Do the additive increase of
				 * one rpc/rtt.
				 */
				if (nmp->nm_cwnd <= nmp->nm_sent) {
					nmp->nm_cwnd +=
					    (NFS_CWNDSCALE * NFS_CWNDSCALE +
					    (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
					if (nmp->nm_cwnd > NFS_MAXCWND)
						nmp->nm_cwnd = NFS_MAXCWND;
				}
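				/*
				 * Illustrative arithmetic: with
				 * NFS_CWNDSCALE 256, a cwnd of 1024 (four
				 * rpcs) grows by (256*256 + 512)/1024 ~= 64
				 * per reply, so four replies (one full
				 * window) add ~256, i.e. one more rpc per
				 * rtt.
				 */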
				rep->r_flags &= ~R_SENT;
				nmp->nm_sent -= NFS_CWNDSCALE;
				/*
				 * Update rtt using a gain of 0.125 on the mean
				 * and a gain of 0.25 on the deviation.
				 */
				if (rep->r_flags & R_TIMING) {
					/*
					 * Since the timer resolution of
					 * NFS_HZ is so coarse, it can often
					 * result in r_rtt == 0. Since
					 * r_rtt == N means that the actual
					 * rtt is between N+dt and N+2-dt ticks,
					 * add 1.
					 */
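					/*
					 * The shifts below implement those
					 * gains: nm_srtt accumulates ~8x the
					 * mean and nm_sdrtt ~4x the
					 * deviation, the scaling NFS_RTO()
					 * expects.  E.g. a 5-tick sample
					 * against nm_srtt == 32 gives
					 * t1 = 6 - 4 = 2, so nm_srtt
					 * becomes 34.
					 */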
					t1 = rep->r_rtt + 1;
					t1 -= (NFS_SRTT(rep) >> 3);
					NFS_SRTT(rep) += t1;
					if (t1 < 0)
						t1 = -t1;
					t1 -= (NFS_SDRTT(rep) >> 2);
					NFS_SDRTT(rep) += t1;
				}
				nmp->nm_timeouts = 0;
				break;
			}
		}
		splx(s);
		/*
		 * If not matched to a request, drop it.
		 * If it's mine, get out.
		 */
		if (rep == 0) {
			nfsstats.rpcunexpected++;
			m_freem(mrep);
		} else if (rep == myrep) {
			if (rep->r_mrep == NULL)
				panic("nfsreply nil");
			return (0);
		}
	}
}

/*
 * nfs_request - goes something like this
 *	- fill in request struct
 *	- links it into list
 *	- calls nfs_send() for first transmit
 *	- calls nfs_receive() to get reply
 *	- break down rpc header and return with nfs reply pointed to
 *	  by mrep or error
 * nb: always frees up mreq mbuf list
 */
int
nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp)
	struct vnode *vp;
	struct mbuf *mrest;
	int procnum;
	struct proc *procp;
	struct ucred *cred;
	struct mbuf **mrp;
	struct mbuf **mdp;
	caddr_t *dposp;
{
	struct mbuf *m;
	struct nfsreq *rep;
	int mrest_len;

	rep = pool_get(&nfsreqpl, PR_WAITOK);
	rep->r_nmp = VFSTONFS(vp->v_mount);
	rep->r_vp = vp;
	rep->r_procp = procp;
	rep->r_procnum = procnum;

	mrest_len = 0;
	m = mrest;
	while (m) {
		mrest_len += m->m_len;
		m = m->m_next;
	}

	/* empty mbuf for AUTH_UNIX header */
	rep->r_mreq = m_gethdr(M_WAIT, MT_DATA);
	rep->r_mreq->m_next = mrest;
	rep->r_mreq->m_pkthdr.len = mrest_len;

	return (nfs_request1(rep, cred, mrp, mdp, dposp));
}

int
nfs_request1(struct nfsreq *rep, struct ucred *cred, struct mbuf **mrp,
    struct mbuf **mdp, caddr_t *dposp)
{
	struct mbuf *m, *mrep;
	u_int32_t *tl;
	struct nfsmount *nmp;
	struct mbuf *md;
	time_t waituntil;
	caddr_t dpos, cp2;
	int t1, i, s, error = 0;
	int trylater_delay;

	trylater_delay = NFS_MINTIMEO;

	nmp = rep->r_nmp;

	/* Get the RPC header with authorization. */
	nfsm_rpchead(rep, cred, RPCAUTH_UNIX);
	m = rep->r_mreq;

	/*
	 * For stream protocols, insert a Sun RPC Record Mark.
	 */
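	/*
	 * The mark is a single 32-bit word: the top bit says "last
	 * fragment" and the low 31 bits carry the fragment length, so the
	 * whole request always goes out as one final fragment.
	 */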
	if (nmp->nm_sotype == SOCK_STREAM) {
		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
		*mtod(m, u_int32_t *) = htonl(0x80000000 |
		    (m->m_pkthdr.len - NFSX_UNSIGNED));
	}

tryagain:
	if (nmp->nm_flag & NFSMNT_SOFT)
		rep->r_retry = nmp->nm_retry;
	else
		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
	rep->r_rtt = rep->r_rexmit = 0;
	if (proct[rep->r_procnum] > 0)
		rep->r_flags = R_TIMING;
	else
		rep->r_flags = 0;
	rep->r_mrep = NULL;

	/*
	 * Do the client side RPC.
	 */
	nfsstats.rpcrequests++;
	/*
	 * Chain request into list of outstanding requests. Be sure
	 * to put it LAST so timer finds oldest requests first.
	 */
	s = splsoftnet();
	TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);

	/*
	 * If backing off another request or avoiding congestion, don't
	 * send this one now but let timer do it. If not timing a request,
	 * do it now.
	 */
	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
	    (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
	    nmp->nm_sent < nmp->nm_cwnd)) {
		splx(s);
		if (nmp->nm_soflags & PR_CONNREQUIRED)
			error = nfs_sndlock(&nmp->nm_flag, rep);
		if (!error) {
			error = nfs_send(nmp->nm_so, nmp->nm_nam,
			    m_copym(m, 0, M_COPYALL, M_WAIT),
			    rep);
			if (nmp->nm_soflags & PR_CONNREQUIRED)
				nfs_sndunlock(&nmp->nm_flag);
		}
		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
			nmp->nm_sent += NFS_CWNDSCALE;
			rep->r_flags |= R_SENT;
		}
	} else {
		splx(s);
		rep->r_rtt = -1;
	}

	/*
	 * Wait for the reply from our send or the timer's.
	 */
	if (!error || error == EPIPE)
		error = nfs_reply(rep);

	/*
	 * RPC done, unlink the request.
	 */
	s = splsoftnet();
	TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
	splx(s);

	/*
	 * Decrement the outstanding request count.
	 */
	if (rep->r_flags & R_SENT) {
		rep->r_flags &= ~R_SENT;	/* paranoia */
		nmp->nm_sent -= NFS_CWNDSCALE;
	}

	/*
	 * If there was a successful reply and a tprintf msg.
	 * tprintf a response.
	 */
	if (!error && (rep->r_flags & R_TPRINTFMSG))
		nfs_msg(rep, "is alive again");
	mrep = rep->r_mrep;
	md = rep->r_md;
	dpos = rep->r_dpos;
	if (error) {
		m_freem(rep->r_mreq);
		pool_put(&nfsreqpl, rep);
		return (error);
	}

	/*
	 * break down the rpc header and check if ok
	 */
	nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
	if (*tl++ == rpc_msgdenied) {
		if (*tl == rpc_mismatch)
			error = EOPNOTSUPP;
		else
			error = EACCES;
		m_freem(mrep);
		m_freem(rep->r_mreq);
		pool_put(&nfsreqpl, rep);
		return (error);
	}

	/*
	 * Since we only support RPCAUTH_UNIX atm we step over the
	 * reply verifier type, and in the (error) case that there really
	 * is any data in it, we advance over it.
	 */
	tl++;			/* Step over verifier type */
	i = fxdr_unsigned(int32_t, *tl);
	if (i > 0)
		nfsm_adv(nfsm_rndup(i));	/* Should not happen */

	nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
	/* 0 == ok */
	if (*tl == 0) {
		nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
		if (*tl != 0) {
			error = fxdr_unsigned(int, *tl);
			if ((nmp->nm_flag & NFSMNT_NFSV3) &&
			    error == NFSERR_TRYLATER) {
				m_freem(mrep);
				error = 0;
				waituntil = time_second + trylater_delay;
				while (time_second < waituntil)
					(void) tsleep((caddr_t)&lbolt,
					    PSOCK, "nqnfstry", 0);
				trylater_delay *= NFS_TIMEOUTMUL;
				if (trylater_delay > NFS_MAXTIMEO)
					trylater_delay = NFS_MAXTIMEO;

				goto tryagain;
			}

			/*
			 * If the File Handle was stale, invalidate the
			 * lookup cache, just in case.
			 */
			if (error == ESTALE)
				cache_purge(rep->r_vp);

			if (nmp->nm_flag & NFSMNT_NFSV3 || error == ESTALE) {
				*mrp = mrep;
				*mdp = md;
				*dposp = dpos;
				error |= NFSERR_RETERR;
			} else
				m_freem(mrep);
			m_freem(rep->r_mreq);
			pool_put(&nfsreqpl, rep);
			return (error);
		}

		*mrp = mrep;
		*mdp = md;
		*dposp = dpos;
		m_freem(rep->r_mreq);
		pool_put(&nfsreqpl, rep);
		return (0);
	}
	m_freem(mrep);
	error = EPROTONOSUPPORT;
nfsmout:
	m_freem(rep->r_mreq);
	pool_put(&nfsreqpl, rep);
	return (error);
}
#endif /* NFSCLIENT */

/*
 * Generate the rpc reply header
 * siz arg. is used to decide if adding a cluster is worthwhile
 */
int
nfs_rephead(siz, nd, slp, err, mrq, mbp)
	int siz;
	struct nfsrv_descript *nd;
	struct nfssvc_sock *slp;
	int err;
	struct mbuf **mrq;
	struct mbuf **mbp;
{
	u_int32_t *tl;
	struct mbuf *mreq;
	struct mbuf *mb;

	MGETHDR(mreq, M_WAIT, MT_DATA);
	mb = mreq;
	/*
	 * If this is a big reply, use a cluster else
	 * try and leave leading space for the lower level headers.
	 */
	siz += RPC_REPLYSIZ;
	if (siz >= max_datalen) {
		MCLGET(mreq, M_WAIT);
	} else
		mreq->m_data += max_hdr;
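	/*
	 * The six words built here follow the RPC reply layout: xid,
	 * REPLY, then MSG_ACCEPTED or MSG_DENIED; an accepted reply
	 * carries a verifier (flavor AUTH_NULL and length 0, the two zero
	 * words below) followed by the accept status.
	 */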
	tl = mtod(mreq, u_int32_t *);
	mreq->m_len = 6 * NFSX_UNSIGNED;
	*tl++ = txdr_unsigned(nd->nd_retxid);
	*tl++ = rpc_reply;
	if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
		*tl++ = rpc_msgdenied;
		if (err & NFSERR_AUTHERR) {
			*tl++ = rpc_autherr;
			*tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
			mreq->m_len -= NFSX_UNSIGNED;
		} else {
			*tl++ = rpc_mismatch;
			*tl++ = txdr_unsigned(RPC_VER2);
			*tl = txdr_unsigned(RPC_VER2);
		}
	} else {
		*tl++ = rpc_msgaccepted;

		/* AUTH_UNIX requires RPCAUTH_NULL. */
		*tl++ = 0;
		*tl++ = 0;

		switch (err) {
		case EPROGUNAVAIL:
			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
			break;
		case EPROGMISMATCH:
			*tl = txdr_unsigned(RPC_PROGMISMATCH);
			tl = nfsm_build(&mb, 2 * NFSX_UNSIGNED);
			*tl++ = txdr_unsigned(NFS_VER2);
			*tl = txdr_unsigned(NFS_VER3);
			break;
		case EPROCUNAVAIL:
			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
			break;
		case EBADRPC:
			*tl = txdr_unsigned(RPC_GARBAGE);
			break;
		default:
			*tl = 0;
			if (err != NFSERR_RETVOID) {
				tl = nfsm_build(&mb, NFSX_UNSIGNED);
				if (err)
					*tl = txdr_unsigned(nfsrv_errmap(nd, err));
				else
					*tl = 0;
			}
			break;
		}
	}

	*mrq = mreq;
	if (mbp != NULL)
		*mbp = mb;
	if (err != 0 && err != NFSERR_RETVOID)
		nfsstats.srvrpc_errs++;
	return (0);
}

/*
 * Nfs timer routine
 * Scan the nfsreq list and retransmit any requests that have timed out
 * To avoid retransmission attempts on STREAM sockets (in the future) make
 * sure to set the r_retry field to 0 (implies nm_retry == 0).
 */
void
nfs_timer(arg)
	void *arg;
{
	struct timeout *to = (struct timeout *)arg;
	struct nfsreq *rep;
	struct mbuf *m;
	struct socket *so;
	struct nfsmount *nmp;
	int timeo;
	int s, error;
#ifdef NFSSERVER
	struct nfssvc_sock *slp;
	struct timeval tv;
#endif

	s = splsoftnet();
	TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
		nmp = rep->r_nmp;
		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
			continue;
		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (rep->r_rtt >= 0) {
			rep->r_rtt++;
			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
				timeo = nmp->nm_timeo;
			else
				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
			if (nmp->nm_timeouts > 0)
				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
			if (rep->r_rtt <= timeo)
				continue;
			if (nmp->nm_timeouts < nitems(nfs_backoff))
				nmp->nm_timeouts++;
		}
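		/*
		 * Example of the backoff above: with an rto of 10 ticks and
		 * three timeouts already recorded on the mount, the next
		 * limit is 10 * nfs_backoff[2] = 80 ticks; the table caps
		 * the multiplier at 256.
		 */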
		/*
		 * Check for server not responding
		 */
		if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
		    rep->r_rexmit > nmp->nm_deadthresh) {
			nfs_msg(rep, "not responding");
			rep->r_flags |= R_TPRINTFMSG;
		}
		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
			nfsstats.rpctimeouts++;
			rep->r_flags |= R_SOFTTERM;
			continue;
		}
		if (nmp->nm_sotype != SOCK_DGRAM) {
			if (++rep->r_rexmit > NFS_MAXREXMIT)
				rep->r_rexmit = NFS_MAXREXMIT;
			continue;
		}
		if ((so = nmp->nm_so) == NULL)
			continue;

		/*
		 * If there is enough space and the window allows..
		 *	Resend it
		 * Set r_rtt to -1 in case we fail to send it now.
		 */
		rep->r_rtt = -1;
		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
		    ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
		    (rep->r_flags & R_SENT) ||
		    nmp->nm_sent < nmp->nm_cwnd) &&
		    (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
				error = (*so->so_proto->pr_usrreq)(so, PRU_SEND,
				    m, (struct mbuf *)0, (struct mbuf *)0,
				    curproc);
			else
				error = (*so->so_proto->pr_usrreq)(so, PRU_SEND,
				    m, nmp->nm_nam, (struct mbuf *)0, curproc);
			if (error) {
				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
					so->so_error = 0;
			} else {
				/*
				 * Iff first send, start timing
				 * else turn timing off, backoff timer
				 * and divide congestion window by 2.
				 */
				if (rep->r_flags & R_SENT) {
					rep->r_flags &= ~R_TIMING;
					if (++rep->r_rexmit > NFS_MAXREXMIT)
						rep->r_rexmit = NFS_MAXREXMIT;
					nmp->nm_cwnd >>= 1;
					if (nmp->nm_cwnd < NFS_CWNDSCALE)
						nmp->nm_cwnd = NFS_CWNDSCALE;
					nfsstats.rpcretries++;
				} else {
					rep->r_flags |= R_SENT;
					nmp->nm_sent += NFS_CWNDSCALE;
				}
				rep->r_rtt = 0;
			}
		}
	}

#ifdef NFSSERVER
	/*
	 * Scan the write gathering queues for writes that need to be
	 * completed now.
	 */
	getmicrotime(&tv);
	TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
		if (LIST_FIRST(&slp->ns_tq) &&
		    timercmp(&LIST_FIRST(&slp->ns_tq)->nd_time, &tv, <=))
			nfsrv_wakenfsd(slp);
	}
#endif /* NFSSERVER */
	splx(s);
	timeout_add(to, nfs_ticks);
}

/*
 * Test for a termination condition pending on the process.
 * This is used for NFSMNT_INT mounts.
 */
int
nfs_sigintr(nmp, rep, p)
	struct nfsmount *nmp;
	struct nfsreq *rep;
	struct proc *p;
{

	if (rep && (rep->r_flags & R_SOFTTERM))
		return (EINTR);
	if (!(nmp->nm_flag & NFSMNT_INT))
		return (0);
	if (p && p->p_siglist &&
	    (((p->p_siglist & ~p->p_sigmask) & ~p->p_sigignore) &
	    NFSINT_SIGMASK))
		return (EINTR);
	return (0);
}

/*
 * Lock a socket against others.
 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
 * and also to avoid race conditions between the processes with nfs requests
 * in progress when a reconnect is necessary.
 */
int
nfs_sndlock(flagp, rep)
	int *flagp;
	struct nfsreq *rep;
{
	struct proc *p;
	int slpflag = 0, slptimeo = 0;

	if (rep) {
		p = rep->r_procp;
		if (rep->r_nmp->nm_flag & NFSMNT_INT)
			slpflag = PCATCH;
	} else
		p = (struct proc *)0;
	while (*flagp & NFSMNT_SNDLOCK) {
		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
			return (EINTR);
		*flagp |= NFSMNT_WANTSND;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck",
		    slptimeo);
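		/*
		 * After the first interruptible sleep, drop PCATCH and
		 * instead wake every two seconds so the nfs_sigintr()
		 * check at the top of the loop can poll for signals.
		 */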
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_SNDLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_sndunlock(flagp)
	int *flagp;
{

	if ((*flagp & NFSMNT_SNDLOCK) == 0)
		panic("nfs sndunlock");
	*flagp &= ~NFSMNT_SNDLOCK;
	if (*flagp & NFSMNT_WANTSND) {
		*flagp &= ~NFSMNT_WANTSND;
		wakeup((caddr_t)flagp);
	}
}

int
nfs_rcvlock(rep)
	struct nfsreq *rep;
{
	int *flagp = &rep->r_nmp->nm_flag;
	int slpflag, slptimeo = 0;

	if (*flagp & NFSMNT_INT)
		slpflag = PCATCH;
	else
		slpflag = 0;

	while (*flagp & NFSMNT_RCVLOCK) {
		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
			return (EINTR);
		*flagp |= NFSMNT_WANTRCV;
		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk",
		    slptimeo);
		if (rep->r_mrep != NULL) {
			/*
			 * Don't take the lock if our reply has been received
			 * while we were sleeping.
			 */
			return (EALREADY);
		}
		if (slpflag == PCATCH) {
			slpflag = 0;
			slptimeo = 2 * hz;
		}
	}
	*flagp |= NFSMNT_RCVLOCK;
	return (0);
}

/*
 * Unlock the stream socket for others.
 */
void
nfs_rcvunlock(flagp)
	int *flagp;
{

	if ((*flagp & NFSMNT_RCVLOCK) == 0)
		panic("nfs rcvunlock");
	*flagp &= ~NFSMNT_RCVLOCK;
	if (*flagp & NFSMNT_WANTRCV) {
		*flagp &= ~NFSMNT_WANTRCV;
		wakeup((caddr_t)flagp);
	}
}

/*
 * Auxiliary routine to align the length of mbuf copies made with m_copyback().
 */
void
nfs_realign_fixup(struct mbuf *m, struct mbuf *n, unsigned int *off)
{
	size_t padding;

	/*
	 * The maximum number of bytes that m_copyback() places in a mbuf is
	 * always an aligned quantity, so realign happens at the chain's tail.
	 */
	while (n->m_next != NULL)
		n = n->m_next;

	/*
	 * Pad from the next elements in the source chain. Loop until the
	 * destination chain is aligned, or the end of the source is reached.
	 */
	do {
		m = m->m_next;
		if (m == NULL)
			return;

		padding = min(ALIGN(n->m_len) - n->m_len, m->m_len);
		if (padding > M_TRAILINGSPACE(n))
			panic("nfs_realign_fixup: no memory to pad to");

		bcopy(mtod(m, void *), mtod(n, char *) + n->m_len, padding);

		n->m_len += padding;
		m_adj(m, padding);
		*off += padding;

	} while (!ALIGNED_POINTER(n->m_len, void *));
}

/*
 * The NFS RPC parsing code uses the data address and the length of mbuf
 * structures to calculate on-memory addresses. This function makes sure these
 * parameters are correctly aligned.
 */
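/*
 * The concern: the nfsm_dissect() parsing macros cast m_data to
 * u_int32_t * and walk it in word-sized steps, which would fault on
 * strict-alignment machines (e.g. sparc64) if m_data were unaligned, so
 * misaligned chains are copied into fresh, aligned mbufs.
 */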
void
nfs_realign(struct mbuf **pm, int hsiz)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	unsigned int off = 0;

	++nfs_realign_test;
	while ((m = *pm) != NULL) {
		if (!ALIGNED_POINTER(m->m_data, void *) ||
		    !ALIGNED_POINTER(m->m_len, void *)) {
			MGET(n, M_WAIT, MT_DATA);
			if (ALIGN(m->m_len) >= MINCLSIZE) {
				MCLGET(n, M_WAIT);
			}
			n->m_len = 0;
			break;
		}
		pm = &m->m_next;
	}
	/*
	 * If n is non-NULL, loop on m copying data, then replace the
	 * portion of the chain that had to be realigned.
	 */
	if (n != NULL) {
		++nfs_realign_count;
		while (m) {
			m_copyback(n, off, m->m_len, mtod(m, caddr_t));

			/*
			 * If an unaligned amount of memory was copied, fix up
			 * the last mbuf created by m_copyback().
			 */
			if (!ALIGNED_POINTER(m->m_len, void *))
				nfs_realign_fixup(m, n, &off);

			off += m->m_len;
			m = m->m_next;
		}
		m_freem(*pm);
		*pm = n;
	}
}


/*
 * Parse an RPC request
 * - verify it
 * - fill in the cred struct.
 */
int
nfs_getreq(nd, nfsd, has_header)
	struct nfsrv_descript *nd;
	struct nfsd *nfsd;
	int has_header;
{
	int len, i;
	u_int32_t *tl;
	int32_t t1;
	caddr_t dpos, cp2;
	u_int32_t nfsvers, auth_type;
	int error = 0;
	struct mbuf *mrep, *md;

	mrep = nd->nd_mrep;
	md = nd->nd_md;
	dpos = nd->nd_dpos;
	if (has_header) {
		nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
		nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
		if (*tl++ != rpc_call) {
			m_freem(mrep);
			return (EBADRPC);
		}
	} else
		nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
	nd->nd_repstat = 0;
	nd->nd_flag = 0;
	if (*tl++ != rpc_vers) {
		nd->nd_repstat = ERPCMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (*tl != nfs_prog) {
		nd->nd_repstat = EPROGUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	tl++;
	nfsvers = fxdr_unsigned(u_int32_t, *tl++);
	if (nfsvers != NFS_VER2 && nfsvers != NFS_VER3) {
		nd->nd_repstat = EPROGMISMATCH;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if (nfsvers == NFS_VER3)
		nd->nd_flag = ND_NFSV3;
	nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
	if (nd->nd_procnum == NFSPROC_NULL)
		return (0);
	if (nd->nd_procnum >= NFS_NPROCS ||
	    (nd->nd_procnum > NFSPROC_COMMIT) ||
	    (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
		nd->nd_repstat = EPROCUNAVAIL;
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}
	if ((nd->nd_flag & ND_NFSV3) == 0)
		nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
	auth_type = *tl++;
	len = fxdr_unsigned(int, *tl++);
	if (len < 0 || len > RPCAUTH_MAXSIZ) {
		m_freem(mrep);
		return (EBADRPC);
	}

	/* Handle auth_unix */
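	/*
	 * An AUTH_UNIX credential body is XDR-encoded as: stamp,
	 * machinename<>, uid, gid, then a counted array of supplementary
	 * gids; it is followed by the verifier (flavor and length), which
	 * is skipped below.
	 */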
	if (auth_type == rpc_auth_unix) {
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > NFS_MAXNAMLEN) {
			m_freem(mrep);
			return (EBADRPC);
		}
		nfsm_adv(nfsm_rndup(len));
		nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
		bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
		nd->nd_cr.cr_ref = 1;
		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
		len = fxdr_unsigned(int, *tl);
		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
			m_freem(mrep);
			return (EBADRPC);
		}
		nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
		for (i = 0; i < len; i++)
			if (i < NGROUPS)
				nd->nd_cr.cr_groups[i] =
				    fxdr_unsigned(gid_t, *tl++);
			else
				tl++;
		nd->nd_cr.cr_ngroups = (len > NGROUPS) ? NGROUPS : len;
		if (nd->nd_cr.cr_ngroups > 1)
			nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
		len = fxdr_unsigned(int, *++tl);
		if (len < 0 || len > RPCAUTH_MAXSIZ) {
			m_freem(mrep);
			return (EBADRPC);
		}
		if (len > 0)
			nfsm_adv(nfsm_rndup(len));
	} else {
		nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
		nd->nd_procnum = NFSPROC_NOOP;
		return (0);
	}

	nd->nd_md = md;
	nd->nd_dpos = dpos;
	return (0);
nfsmout:
	return (error);
}

void
nfs_msg(struct nfsreq *rep, char *msg)
{
	tpr_t tpr;

	if (rep->r_procp)
		tpr = tprintf_open(rep->r_procp);
	else
		tpr = NULL;

	tprintf(tpr, "nfs server %s: %s\n",
	    rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname, msg);
	tprintf_close(tpr);
}

#ifdef NFSSERVER
/*
 * Socket upcall routine for the nfsd sockets.
 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
 * Essentially do as much as possible non-blocking, else punt and it will
 * be called with M_WAIT from an nfsd.
 */
void
nfsrv_rcv(so, arg, waitflag)
	struct socket *so;
	caddr_t arg;
	int waitflag;
{
	struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
	struct mbuf *m;
	struct mbuf *mp, *nam;
	struct uio auio;
	int flags, error;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;
#ifdef notdef
	/*
	 * Define this to test for nfsds handling this under heavy load.
	 */
	if (waitflag == M_DONTWAIT) {
		slp->ns_flag |= SLP_NEEDQ; goto dorecs;
	}
#endif
	auio.uio_procp = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * If there are already records on the queue, defer soreceive()
		 * to an nfsd so that there is feedback to the TCP layer that
		 * the nfs servers are heavily loaded.
		 */
		if (slp->ns_rec && waitflag == M_DONTWAIT) {
			slp->ns_flag |= SLP_NEEDQ;
			goto dorecs;
		}

		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = soreceive(so, &nam, &auio, &mp, (struct mbuf **)0,
		    &flags, 0);
		if (error || mp == (struct mbuf *)0) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;

		/*
		 * Now try and parse record(s) out of the raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
	} else {
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = soreceive(so, &nam, &auio, &mp,
			    (struct mbuf **)0, &flags, 0);
			if (mp) {
				if (nam) {
					m = nam;
					m->m_next = mp;
				} else
					m = mp;
				if (slp->ns_recend)
					slp->ns_recend->m_nextpkt = m;
				else
					slp->ns_rec = m;
				slp->ns_recend = m;
				m->m_nextpkt = (struct mbuf *)0;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * Now try and process the request records, non-blocking.
	 */
dorecs:
	if (waitflag == M_DONTWAIT &&
	    (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
		nfsrv_wakenfsd(slp);
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket. The "waitflag" argument indicates whether or not it
 * can sleep.
 */
int
nfsrv_getstream(slp, waitflag)
	struct nfssvc_sock *slp;
	int waitflag;
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	if (slp->ns_flag & SLP_GETSTREAM)
		panic("nfs getstream");
	slp->ns_flag |= SLP_GETSTREAM;
	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (0);
			}
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark,
				    NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
			if (slp->ns_reclen > NFS_MAXPACKET) {
				slp->ns_flag &= ~SLP_GETSTREAM;
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = (struct mbuf *)0;
			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
					    waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						slp->ns_flag &= ~SLP_GETSTREAM;
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = (struct mbuf *)0;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			slp->ns_flag &= ~SLP_GETSTREAM;
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
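		/*
		 * Fragments of one record are chained through m_next on
		 * ns_frag; when the fragment flagged last arrives, the whole
		 * chain is queued as a record on ns_rec (linked via
		 * m_nextpkt) for an nfsd to pick up.
		 */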
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			if (slp->ns_recend)
				slp->ns_recend->m_nextpkt = slp->ns_frag;
			else
				slp->ns_rec = slp->ns_frag;
			slp->ns_recend = slp->ns_frag;
			slp->ns_frag = (struct mbuf *)0;
		}
	}
}

/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(slp, nfsd, ndp)
	struct nfssvc_sock *slp;
	struct nfsd *nfsd;
	struct nfsrv_descript **ndp;
{
	struct mbuf *m, *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 ||
	    (m = slp->ns_rec) == (struct mbuf *)0)
		return (ENOBUFS);
	slp->ns_rec = m->m_nextpkt;
	if (slp->ns_rec)
		m->m_nextpkt = (struct mbuf *)0;
	else
		slp->ns_recend = (struct mbuf *)0;
	if (m->m_type == MT_SONAME) {
		nam = m;
		m = m->m_next;
		nam->m_next = NULL;
	} else
		nam = NULL;
	nd = malloc(sizeof(struct nfsrv_descript), M_NFSRVDESC, M_WAITOK);
	nfs_realign(&m, 10 * NFSX_UNSIGNED);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, 1);
	if (error) {
		m_freem(nam);
		free(nd, M_NFSRVDESC);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}


/*
 * Search for a sleeping nfsd and wake it up.
 * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
 * running nfsds will go look for the work in the nfssvc_sock list.
 */
void
nfsrv_wakenfsd(struct nfssvc_sock *slp)
{
	struct nfsd *nfsd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;

	TAILQ_FOREACH(nfsd, &nfsd_head, nfsd_chain) {
		if (nfsd->nfsd_flag & NFSD_WAITING) {
			nfsd->nfsd_flag &= ~NFSD_WAITING;
			if (nfsd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nfsd->nfsd_slp = slp;
			wakeup_one(nfsd);
			return;
		}
	}

	slp->ns_flag |= SLP_DOREC;
	nfsd_head_flag |= NFSD_CHECKSLP;
}
#endif /* NFSSERVER */