/*	$OpenBSD: uipc_socket.c,v 1.357 2025/01/07 23:13:46 mvs Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_modify = filt_sormodify,
	.f_process = filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_modify = filt_sowmodify,
	.f_process = filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_soexcept,
	.f_modify = filt_sormodify,
	.f_process = filt_sorprocess,
};
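
/*
 * Note: these filterops back the kqueue filters attached in
 * soo_kqfilter() below; EVFILT_READ and EVFILT_EXCEPT hang off
 * `so_rcv.sb_klist', EVFILT_WRITE off `so_snd.sb_klist'.
 */
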
#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	const char *dom_name = dp->dom_name;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);

#ifdef WITNESS
	/*
	 * XXX: Make WITNESS happy. AF_INET and AF_INET6 sockets could be
	 * spliced together.
	 */
	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		dom_name = "inet46";
		break;
	}
#endif

	refcnt_init(&so->so_refcnt);
	rw_init_flags(&so->so_lock, dom_name, RWL_DUPOK);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	so->so_snd.sb_flags |= SB_MTXLOCK;
	so->so_rcv.sb_flags |= SB_MTXLOCK;

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = atomic_load_int(&somaxconn);
	int sominconn_local = atomic_load_int(&sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}
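
/*
 * A worked example of the backlog clamp above, assuming the defaults
 * (sominconn 80 and an unchanged somaxconn): listen(2) with backlog 5
 * yields so_qlimit 80, and a negative or huge backlog yields somaxconn,
 * because the maximum is applied first and the minimum second.
 */
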
void
sorele(struct socket *so)
{
	if (refcnt_rele(&so->so_refcnt) == 0)
		return;

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

#ifdef SOCKET_SPLICE
	if (so->so_sp)
		pool_put(&sosplice_pool, so->so_sp);
#endif
	pool_put(&socket_pool, so);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(so);
				sounlock(head);
				sorele(head);
				return;
			}
		}

		soqremque(so, 0);

		if (persocket) {
			sounlock(head);
			sorele(head);
		}
	}

	if (!keep_lock)
		sounlock(so);
	sorele(so);
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		struct socket *soback;

		sounlock(so);
		mtx_enter(&so->so_snd.sb_mtx);
		/*
		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
		 * both `so_snd' and `so_rcv' before unsplicing sockets.
		 */
		if ((soback = so->so_sp->ssp_soback) == NULL) {
			mtx_leave(&so->so_snd.sb_mtx);
			goto notsplicedback;
		}
		soref(soback);
		mtx_leave(&so->so_snd.sb_mtx);

		/*
		 * `so' can only be unspliced, and never spliced again.
		 * Thus if the issplicedback(so) check is positive, the
		 * socket is still spliced and `ssp_soback' points to the
		 * same socket as `soback'.
		 */
		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		sbunlock(&soback->so_rcv);
		sorele(soback);

notsplicedback:
		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (isspliced(so)) {
			struct socket *sosp;
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sosp = soref(so->so_sp->ssp_socket);
			sounsplice(so, so->so_sp->ssp_socket, freeing);
			sorele(sosp);
		}
		sbunlock(&so->so_rcv);

		timeout_del_barrier(&so->so_sp->ssp_idleto);
		task_del(sosplice_taskq, &so->so_sp->ssp_task);
		taskq_barrier(sosplice_taskq);

		solock(so);
	}
#endif /* SOCKET_SPLICE */

	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;

	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}
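
/*
 * A note on the linger wait in soclose() above: so_linger is in
 * seconds, so with SO_LINGER set and l_linger = 5 the close sleeps
 * in "netcls" for at most five seconds per iteration, while
 * solinger_nsec() maps l_linger = 0 to INFSLP, i.e. no timeout on
 * the sleep at all.
 */
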
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace_locked(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbufs together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}
/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
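
/*
 * Sketch of the sockbuf record layout that soreceive() below relies
 * on, as built by sbappend* (the address and control mbufs are
 * optional, depending on the protocol):
 *
 *	sb_mb -> [MT_SONAME] -m_next-> [MT_CONTROL] -m_next-> [MT_DATA] -> ...
 *	             |
 *	         m_nextpkt
 *	             |
 *	             v
 *	         [next record] -> ...
 */
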
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network for the entire time here, we
 * release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;
	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock_shared(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock_shared(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
			if (!isspliced(so))
#endif /* SOCKET_SPLICE */
				panic("receive 1: so %p, so_type %d, sb_cc %lu",
				    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		sb_mtx_unlock(&so->so_rcv);
		if (dosolock)
			sounlock_shared(so);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					if (dosolock)
						sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					if (dosolock)
						solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(so, &so->so_rcv)) {
				sb_mtx_unlock(&so->so_rcv);
				if (dosolock)
					sounlock_shared(so);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			if (!dosolock)
				solock_shared(so);
			pru_rcvd(so);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_rcv);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task
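
/*
 * Note: the pair-locking helpers below take the two socket locks in
 * a fixed order, by ascending address of the socket, so that two
 * concurrent splice operations on the same pair of sockets cannot
 * deadlock against each other.
 */
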
void
sosplice_solock_pair(struct socket *so1, struct socket *so2)
{
	NET_LOCK_SHARED();

	if (so1 == so2)
		rw_enter_write(&so1->so_lock);
	else if (so1 < so2) {
		rw_enter_write(&so1->so_lock);
		rw_enter_write(&so2->so_lock);
	} else {
		rw_enter_write(&so2->so_lock);
		rw_enter_write(&so1->so_lock);
	}
}

void
sosplice_sounlock_pair(struct socket *so1, struct socket *so2)
{
	if (so1 == so2)
		rw_exit_write(&so1->so_lock);
	else if (so1 < so2) {
		rw_exit_write(&so2->so_lock);
		rw_exit_write(&so1->so_lock);
	} else {
		rw_exit_write(&so1->so_lock);
		rw_exit_write(&so2->so_lock);
	}

	NET_UNLOCK_SHARED();
}

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		if (so->so_sp && so->so_sp->ssp_socket) {
			sosp = soref(so->so_sp->ssp_socket);
			sounsplice(so, so->so_sp->ssp_socket, 0);
			sorele(sosp);
		}
		sbunlock(&so->so_rcv);
		return (0);
	}

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket into which data will be spliced. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	sosplice_solock_pair(so, sosp);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL) {
		struct sosplice *so_sp;

		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		timeout_set_flags(&so_sp->ssp_idleto, soidle, so,
		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
		task_set(&so_sp->ssp_task, sotask, so);

		so->so_sp = so_sp;
	}
	if (sosp->so_sp == NULL) {
		struct sosplice *so_sp;

		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		timeout_set_flags(&so_sp->ssp_idleto, soidle, sosp,
		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
		task_set(&so_sp->ssp_task, sotask, sosp);

		sosp->so_sp = so_sp;
	}
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);

	/*
	 * To prevent sorwakeup() calling somove() before this somove()
	 * has finished, the socket buffers are not marked as spliced yet.
	 */

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	sosplice_sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);

	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}

	sbunlock(&so->so_rcv);
	FRELE(fp, curproc);
	return (0);

release:
	sosplice_sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);
	return (error);
}

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	sbassertlocked(&so->so_rcv);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0) {
		int readable;

		solock_shared(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		readable = soreadable(so);
		mtx_leave(&so->so_rcv.sb_mtx);
		if (readable)
			sorwakeup(so);
		sounlock_shared(so);
	}
	if ((freeing & SOSP_FREEING_WRITE) == 0) {
		solock_shared(sosp);
		if (sowriteable(sosp))
			sowwakeup(sosp);
		sounlock_shared(sosp);
	}
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		struct socket *sosp;

		WRITE_ONCE(so->so_error, ETIMEDOUT);
		sosp = soref(so->so_sp->ssp_socket);
		sounsplice(so, so->so_sp->ssp_socket, 0);
		sorele(sosp);
	}
	sbunlock(&so->so_rcv);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int doyield = 0;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			doyield = 1;
		somove(so, M_DONTWAIT);
	}
	sbunlock(&so->so_rcv);

	if (doyield) {
		/* Avoid userland starvation. */
		yield();
	}
}
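
/*
 * A sketch of the splice data path handled by somove() below:
 *
 *	so (source)                      sosp (drain)
 *	so_rcv.sb_mb --- somove() ---> so_snd via pru_send()
 *
 * somove() runs either synchronously from sosplice(), or from the
 * sosplice taskq via sotask() whenever a socket buffer wakeup fires.
 */
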
/*
 * Move data from the receive buffer of the spliced source socket to
 * the send buffer of the drain socket.  Try to move as much as
 * possible in one big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has finished; 1 means it should
 * continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0, unsplice = 0;
	unsigned int rcvstate;
	int sockdgram = ((so->so_proto->pr_flags &
	    PR_WANTRCVD) == 0);

	sbassertlocked(&so->so_rcv);

	if (!sockdgram) {
		sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
		solock(so);
	}

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

nextpkt:
	if ((error = READ_ONCE(so->so_error)))
		goto release;
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}

	error = READ_ONCE(sosp->so_error);
	if (error) {
		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
			goto release;
		error = 0;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace_locked(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD) {
			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			pru_rcvd(so);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);
		}
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve the splice with an error if a loop is detected by
	 * the counter.
	 *
	 * If we deal with a looped broadcast/multicast packet, we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD) {
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
		pru_rcvd(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
	}

	/* The receive buffer shrank by len bytes; adjust oob. */
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				mtx_leave(&sosp->so_snd.sb_mtx);
				mtx_leave(&so->so_rcv.sb_mtx);
				error = pru_send(sosp, m, NULL, NULL);
				mtx_enter(&so->so_rcv.sb_mtx);
				mtx_enter(&sosp->so_snd.sb_mtx);

				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);

			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			error = pru_sendoob(sosp, o, NULL, NULL);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);

			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (sockdgram)
		solock_shared(sosp);
	error = pru_send(sosp, m, NULL, NULL);
	if (sockdgram)
		sounlock_shared(sosp);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
		    sosp->so_pcb == NULL)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;

	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		WRITE_ONCE(so->so_error, error);

	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error)
		unsplice = 1;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (!sockdgram) {
		sbunlock(&so->so_snd);
		sounlock(so);
	}

	if (unsplice) {
		soref(sosp);
		sounsplice(so, sosp, 0);
		sorele(sosp);

		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_rcv);
		if (so->so_rcv.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq, &so->so_splicetask);
		if (isspliced(so)) {
			sb_mtx_unlock(&so->so_rcv);
			return;
		}
		sb_mtx_unlock(&so->so_rcv);
	}
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_snd);
		if (so->so_snd.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq,
			    &so->so_sp->ssp_soback->so_splicetask);
		if (issplicedback(so)) {
			sb_mtx_unlock(&so->so_snd);
			return;
		}
		sb_mtx_unlock(&so->so_snd);
	}
#endif
	sowakeup(so, &so->so_snd);
}
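
/*
 * Note that sowwakeup() above schedules the splice task of the socket
 * spliced into this one (`ssp_soback'), not its own: newly freed space
 * in this send buffer is what allows somove() on the source side to
 * make further progress.
 */
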
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
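
/*
 * The SO_SPLICE case above accepts three optval layouts: none
 * (unsplice), a plain int file descriptor, or a struct splice.
 * A userland sketch of the third form:
 *
 *	struct splice sp = { .sp_fd = drain, .sp_max = 0 };
 *	timerclear(&sp.sp_idle);
 *	setsockopt(source, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *
 * sp_max = 0 means no byte limit, an unset sp_idle disables the idle
 * timeout, and passing an int of -1 dissolves an existing splice.
 */
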
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
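			/*
			 * Reading SO_SPLICE reports how many bytes have
			 * been moved across the splice so far; a socket
			 * that was never spliced has no sosplice structure
			 * and reports 0.
			 */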
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		short qlen = READ_ONCE(so->so_qlen);

		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = qlen || soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

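	/*
	 * rv now reflects the plain-data rules: a spliced socket never
	 * appears readable, EOF or a pending error always does, and
	 * otherwise the byte count is compared against NOTE_LOWAT or the
	 * receive low water mark.
	 */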
	return rv;
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace_locked(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (((state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		u_int state = READ_ONCE(so->so_state);

		if (state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

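/*
 * Dump the counters, pointers and state of one socket buffer; called
 * from so_print() below for both the receive and the send side.
 */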
void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
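/*
 * Userland view of the SO_SPLICE paths handled in sosetopt() and
 * sogetopt() above.  This is an illustrative sketch only; "from" and
 * "to" are assumed to be already-connected socket descriptors:
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <err.h>
 *
 *	struct splice sp = { .sp_fd = to, .sp_max = 0 };
 *	off_t moved;
 *	socklen_t len = sizeof(moved);
 *
 *	// splice "from" into "to"; sp_max == 0 and a zeroed sp_idle
 *	// mean no byte limit and no idle timeout
 *	if (setsockopt(from, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
 *		err(1, "SO_SPLICE");
 *
 *	// ask how many bytes the kernel has moved so far (ssp_len)
 *	if (getsockopt(from, SOL_SOCKET, SO_SPLICE, &moved, &len) == -1)
 *		err(1, "SO_SPLICE get");
 *
 * Passing a plain int instead of a struct splice selects just the
 * destination descriptor; splicing to -1 dissolves an existing splice.
 */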