/*	$OpenBSD: uipc_socket.c,v 1.134 2014/11/03 17:20:46 bluhm Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <net/if.h>
#include <sys/pool.h>

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
int	somove(struct socket *, int);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);

struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };


#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
#endif

void
soinit(void)
{

	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, 0, 0, "sosppl",
	    NULL);
#endif
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
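 *
 * For example, a userland socket(AF_INET, SOCK_STREAM, 0) call reaches
 * socreate() below via the sys_socket() system call, which in turn
 * hands the new socket to TCP through the protocol switch with a
 * PRU_ATTACH user request.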
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	struct protosw *prp;
	struct socket *so;
	int error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p, 0) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL,
	    (struct mbuf *)(long)proto, NULL, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int s = splsoftnet();
	int error;

	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int s, error;

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EOPNOTSUPP);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
	    curproc);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}

/*
 * Must be called at splsoftnet()
 */

void
sofree(struct socket *so)
{
	splsoftassert(IPL_SOFTNET);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so))
			sounsplice(so->so_sp->ssp_soback, so,
			    so->so_sp->ssp_soback != so);
		if (isspliced(so))
			sounsplice(so, so->so_sp->ssp_socket, 0);
		pool_put(&sosplice_pool, so->so_sp);
		so->so_sp = NULL;
	}
#endif /* SOCKET_SPLICE */
	sbrelease(&so->so_snd);
	sorflush(so);
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
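 *
 * With SO_LINGER set, for example, a blocking close sleeps on "netcls"
 * below for at most so_linger seconds while the disconnect completes;
 * a non-blocking socket skips the wait and drops straight through.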
 */
int
soclose(struct socket *so)
{
	struct socket *so2;
	int s = splsoftnet();		/* conservative */
	int error = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL,
		    NULL, NULL, curproc);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}

/*
 * Must be called at splsoftnet.
 */
int
soabort(struct socket *so)
{
	splsoftassert(IPL_SOFTNET);

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
	    curproc);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int s = splsoftnet();
	int error = 0;

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
		    nam, NULL, curproc);
	else
		error = ECONNABORTED;
	splx(s);
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, curproc);
	splx(s);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s = splsoftnet();
	int error;

	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
	    (struct mbuf *)so2, NULL, curproc);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int s = splsoftnet();
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
	    NULL, curproc);
bad:
	splx(s);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
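 *
 * For instance, on a SOCK_DGRAM socket (atomic by sosendallatonce())
 * a single send larger than the send buffer's high water mark fails
 * outright with EMSGSIZE, while a SOCK_STREAM send of the same size
 * is simply carved into pieces by the loop below.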
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len, mlen, clen = 0;
	quad_t resid;
	int error, s;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned (since uio->uio_resid is).
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 * MSG_EOR on a SOCK_STREAM socket is also invalid.
	 */
	if (resid < 0 ||
	    (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		clen = control->m_len;
		/* reserve extra space for AF_LOCAL's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_LOCAL &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct file *) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    (so->so_proto->pr_domain->dom_family != AF_LOCAL &&
		    clen > so->so_snd.sb_hiwat))
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					MCLGET(m, M_NOWAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
nopages:
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			s = splsoftnet();		/* XXX */
			if (resid <= 0)
				so->so_state &= ~SS_ISSENDING;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curproc);
			splx(s);
			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the callers locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
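 * A UDP datagram, for example, is queued as a single record: one
 * MT_SONAME mbuf holding the sender's address, then any MT_CONTROL
 * mbufs, then the payload mbufs.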
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	size_t orig_resid = uio->uio_resid;
	int uio_error = 0;
	int resid;

	mp = mp0;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (so->so_state & SS_NBIO)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
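	 *
	 * As an illustration, a caller issuing recv(s, buf, len,
	 * MSG_WAITALL) with len no larger than the receive high water
	 * mark falls under case 2 and sleeps in sbwait() until the full
	 * len bytes have arrived (or an error, timeout or signal cuts
	 * the wait short).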
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
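	 *
	 * Concretely, whenever the code below dequeues an address or
	 * control mbuf it calls sbsync() to push the cached 'nextrecord'
	 * back into the socket buffer before anything can sleep.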
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(cm, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(cm,
					    controllen, flags);
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose &&
				    mtod(cm, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			splx(s);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			s = splsoftnet();
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, curproc);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;

	switch (how) {
	case SHUT_RD:
	case SHUT_RDWR:
		sorflush(so);
		if (how == SHUT_RD)
			return (0);
		/* FALLTHROUGH */
	case SHUT_WR:
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
		    curproc);
	default:
		return (EINVAL);
	}
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	int s;
	struct sockbuf asb;

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	memset(sb, 0, sizeof (*sb));
	/* XXX - the memset stomps all over so_rcv */
	if (asb.sb_flags & SB_KNOTE) {
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;
	}
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
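
/*
 * Illustrative userland usage, not part of this file (tcp_in and
 * tcp_out are hypothetical connected TCP descriptors): splice data
 * arriving on tcp_in into tcp_out so the kernel forwards it without a
 * detour through userland.
 *
 *	struct splice sp;
 *
 *	memset(&sp, 0, sizeof(sp));
 *	sp.sp_fd = tcp_out;
 *	sp.sp_max = 0;
 *	timerclear(&sp.sp_idle);
 *	if (setsockopt(tcp_in, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
 *		err(1, "SO_SPLICE");
 *
 * Passing just an int instead of a struct splice gives the same effect
 * with no byte limit and no idle timeout; a negative fd unsplices.
 */
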
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	int s, error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(&so->so_rcv,
		    (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0)
			return (error);
		s = splsoftnet();
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 1);
		splx(s);
		sbunlock(&so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || tv->tv_usec < 0))
		return (EINVAL);

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc->p_fd, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);

	/* Lock both receive and send buffer. */
	if ((error = sblock(&so->so_rcv,
	    (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0) {
		FRELE(fp, curproc);
		return (error);
	}
	if ((error = sblock(&sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(&so->so_rcv);
		FRELE(fp, curproc);
		return (error);
	}
	s = splsoftnet();

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
		error = EPROTONOSUPPORT;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set(&so->so_idleto, soidle, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flagsintr |= SB_SPLICE;
		sosp->so_snd.sb_flagsintr |= SB_SPLICE;
	}

release:
	splx(s);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
	FRELE(fp, curproc);
	return (error);
}

void
sounsplice(struct socket *so, struct socket *sosp, int wakeup)
{
	splsoftassert(IPL_SOFTNET);

	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flagsintr &= ~SB_SPLICE;
	so->so_rcv.sb_flagsintr &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	if (wakeup && soreadable(so))
		sorwakeup(so);
}

void
soidle(void *arg)
{
	struct socket *so = arg;
	int s;

	s = splsoftnet();
	if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 1);
	}
	splx(s);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
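/*
 * somove() is driven from two places: sosplice() above calls it once
 * with M_WAIT to kick the transfer off, and sorwakeup()/sowwakeup()
 * call it with M_DONTWAIT from the softnet path whenever new data
 * arrives or send buffer space frees up.
 */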
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	short state;

	splsoftassert(IPL_SOFTNET);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(&sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname");
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(&so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
			(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)0L, NULL, NULL);
		goto nextpkt;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove pkthdr");
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(&so->so_rcv, m);
		MFREE(m, so->so_rcv.sb_mb);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(&so->so_rcv, m);
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type");
#endif
		if ((*mp)->m_len > size) {
			if (!maxreached || (*mp = m_copym(
			    so->so_rcv.sb_mb, 0, size, wait)) == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(&so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(&so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_tag_delete_chain(m);
		memset(&m->m_pkthdr, 0, sizeof(m->m_pkthdr));
		m->m_pkthdr.len = len;
		m->m_pkthdr.pf.prio = IFQ_DEFPRIO;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    NULL, NULL, NULL);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL, NULL,
	    NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 1);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#undef so_splicelen
#undef so_splicemax
#undef so_idletv
#undef so_idleto

#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flagsintr & SB_SPLICE)
		(void) somove(so, M_DONTWAIT);
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flagsintr & SB_SPLICE)
		(void) somove(so->so_sp->ssp_soback, M_DONTWAIT);
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int error = 0;
	struct mbuf *m = m0;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc, 0)) != 0)	/* XXX */
				goto bad;
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				goto bad;
			}
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE) {
					error = EINVAL;
					goto bad;
				}
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(&so->so_snd, cnt)) {
					error = ENOBUFS;
					goto bad;
				}
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE) {
					error = EINVAL;
					goto bad;
				}
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(&so->so_rcv, cnt)) {
					error = ENOBUFS;
					goto bad;
				}
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }
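
		/*
		 * Example (assuming hz is 100): a timeout of 1.5 seconds
		 * arrives as { tv_sec = 1, tv_usec = 500000 } and tvtohz()
		 * turns it into roughly 150 ticks; anything beyond
		 * USHRT_MAX ticks is rejected with EDOM below.
		 */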
		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			int val;

			if (m == NULL || m->m_len < sizeof (tv)) {
				error = EINVAL;
				goto bad;
			}
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			val = tvtohz(&tv);
			if (val > USHRT_MAX) {
				error = EDOM;
				goto bad;
			}

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto && so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				struct domain *dom = so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				return ((*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, &m0));
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
bad:
	if (m)
		(void) m_free(m);
	return (error);
}

int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			tv.tv_sec = val / hz;
			tv.tv_usec = (val % hz) * tick;
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			(void)m_free(m);
			if (so->so_proto && so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				struct domain *dom = so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				return ((*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, mp));
			}
			return (ENOPROTOOPT);
			break;

#ifdef SOCKET_SPLICE
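		/*
		 * Reading the option back (illustrative; s is a
		 * hypothetical spliced descriptor):
		 *	off_t moved;
		 *	socklen_t olen = sizeof(moved);
		 *	getsockopt(s, SOL_SOCKET, SO_SPLICE, &moved, &olen);
		 * yields the number of bytes spliced away so far.
		 */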
		case SO_SPLICE:
		    {
			off_t len;
			int s = splsoftnet();

			m->m_len = sizeof(off_t);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			splx(s);
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					bcopy(&(unp->unp_connid),
					    mtod(m, caddr_t), m->m_len);
					break;
				}
				(void)m_free(m);
				return (ENOTCONN);
			}
			(void)m_free(m);
			return (EOPNOTSUPP);
			break;

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	csignal(so->so_pgid, SIGURG, so->so_siguid, so->so_sigeuid);
	selwakeup(&so->so_rcv.sb_sel);
}
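
/*
 * Illustrative userland use of the kqueue filters below (s is a
 * hypothetical socket descriptor):
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * soo_kqfilter() picks soread_filtops or, for a listening socket,
 * solisten_filtops; kn_data then reports the bytes available to read
 * or the length of the completed connection queue, respectively.
 */
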
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;
	int s;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	s = splnet();
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	splx(s);
	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	splx(s);
}

/*ARGSUSED*/
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		return (0);
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	splx(s);
}

/*ARGSUSED*/
int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (so->so_qlen != 0);
}