/*	$OpenBSD: uipc_socket.c,v 1.351 2024/12/30 12:12:35 mvs Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_modify = filt_sormodify,
	.f_process = filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_modify = filt_sowmodify,
	.f_process = filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_soexcept,
	.f_modify = filt_sormodify,
	.f_process = filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}
struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		switch (prp->pr_type) {
		case SOCK_RAW:
		case SOCK_DGRAM:
			so->so_snd.sb_flags |= SB_MTXLOCK;
			/* FALLTHROUGH */
		case SOCK_STREAM:
			so->so_rcv.sb_flags |= SB_MTXLOCK;
			break;
		}
		break;
	case AF_KEY:
	case AF_ROUTE:
	case AF_UNIX:
	case AF_FRAME:
		so->so_snd.sb_flags |= SB_MTXLOCK;
		so->so_rcv.sb_flags |= SB_MTXLOCK;
		break;
	}

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
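
/*
 * Example (editorial sketch, not part of the original file): a kernel
 * consumer such as the socket(2) system call path uses the function
 * above roughly like this.  Error handling is abbreviated.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, 0);
 *	if (error)
 *		return (error);
 *	// ... use the socket via sobind()/soconnect()/sosend() ...
 *	soclose(so, 0);
 */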
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = atomic_load_int(&somaxconn);
	int sominconn_local = atomic_load_int(&sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}

void
sorele(struct socket *so, int keep_lock)
{
	int need_lock = (((so->so_snd.sb_flags & SB_MTXLOCK) == 0) &&
	    keep_lock == 0);

	if (keep_lock == 0)
		sounlock(so);

	if (refcnt_rele(&so->so_refcnt) == 0)
		return;

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);

	if (need_lock)
		solock(so);
	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);
	if (need_lock)
		sounlock(so);

	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

#ifdef SOCKET_SPLICE
	if (so->so_sp)
		pool_put(&sosplice_pool, so->so_sp);
#endif
	pool_put(&socket_pool, so);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(so);
				sorele(head, 0);
				return;
			}
		}

		soqremque(so, 0);

		if (persocket)
			sorele(head, 0);
	}

	sorele(so, keep_lock);
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early.  There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		struct socket *soback;

		sounlock(so);
		mtx_enter(&so->so_snd.sb_mtx);
		/*
		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
		 * both `so_snd' and `so_rcv' before unsplicing sockets.
		 */
		if ((soback = so->so_sp->ssp_soback) == NULL) {
			mtx_leave(&so->so_snd.sb_mtx);
			goto notsplicedback;
		}
		soref(soback);
		mtx_leave(&so->so_snd.sb_mtx);

		/*
		 * `so' can only be unspliced, and never spliced again.
		 * Thus if the issplicedback(so) check is positive, the
		 * socket is still spliced and `ssp_soback' points to the
		 * same socket as `soback'.
		 */
		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			solock(soback);
			sounsplice(so->so_sp->ssp_soback, so, freeing);
			sounlock(soback);
		}
		sbunlock(&soback->so_rcv);
		solock(soback);
		sorele(soback, 0);

notsplicedback:
		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			solock(so);
			sounsplice(so, so->so_sp->ssp_socket, freeing);
			sounlock(so);
		}
		sbunlock(&so->so_rcv);

		timeout_del_barrier(&so->so_sp->ssp_idleto);
		task_del(sosplice_taskq, &so->so_sp->ssp_task);
		taskq_barrier(sosplice_taskq);

		solock(so);
	}
#endif /* SOCKET_SPLICE */

	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;

	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}
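
/*
 * Illustration of the SO_LINGER handling above (an editorial userland
 * sketch, not part of the original file): with l_onoff set, close(2)
 * may sleep in the "netcls" wait channel above for up to l_linger
 * seconds while the disconnect completes.
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(s);	// may block up to 5 seconds in soclose()
 */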
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace_locked(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}
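
/*
 * The contract above requires callers to check for short counts when
 * EINTR/ERESTART is returned.  An editorial userland analogue of that
 * contract (illustrative, not part of the original file):
 *
 *	ssize_t n;
 *	size_t off = 0;
 *
 *	while (off < len) {
 *		n = send(s, buf + off, len - off, 0);
 *		if (n == -1) {
 *			if (errno == EINTR)
 *				continue;	// interrupted, retry
 *			break;			// hard error
 *		}
 *		off += n;			// short write, keep going
 *	}
 */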
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network for the entire time here, we
 * release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
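/*
 * Illustrative sketch of the record layout described above (editorial
 * addition): mbufs of one record are linked through m_next, records
 * through m_nextpkt of each record's first mbuf.
 *
 *	sb_mb -> [MT_SONAME] -m_next-> [MT_CONTROL] -m_next-> [MT_DATA]
 *	              |
 *	          m_nextpkt
 *	              v
 *	         [MT_SONAME] -m_next-> [MT_DATA] -m_next-> [MT_DATA]
 */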
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;
	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock_shared(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock_shared(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		sb_mtx_unlock(&so->so_rcv);
		if (dosolock)
			sounlock_shared(so);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					if (dosolock)
						sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					if (dosolock)
						solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must be consistent here (sb_mb points to the
		 * current mbuf, nextrecord to the next record) when we drop
		 * priority; we must note any additions to the sockbuf when
		 * we block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(so, &so->so_rcv)) {
				sb_mtx_unlock(&so->so_rcv);
				if (dosolock)
					sounlock_shared(so);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			if (!dosolock)
				solock_shared(so);
			pru_rcvd(so);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_rcv);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
		if (so->so_options & SO_ACCEPTCONN) {
			error = EOPNOTSUPP;
			goto out;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto out;
		}

		if (so->so_sp && so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
out:
		sounlock(so);
		sbunlock(&so->so_rcv);
		return (error);
	}

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	solock(so);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_flags(&so->so_idleto, soidle, so,
	    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent sorwakeup() calling somove() before this somove()
	 * has finished, the socket buffers are not marked as spliced yet.
	 */

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		sounlock(so);
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}
	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		solock(so);

release:
	sounlock(so);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);

	return (error);
}
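
/*
 * Userland reaches the function above through setsockopt(2); see the
 * SO_SPLICE handling in sosetopt() below.  An editorial sketch of the
 * three supported forms (illustrative, not part of the original file):
 *
 *	struct splice sp = { .sp_fd = to, .sp_max = 0 };
 *	int off = -1;
 *
 *	// splice all data arriving on `from' into `to'
 *	setsockopt(from, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *	// or, without maximum and idle timeout:
 *	setsockopt(from, SOL_SOCKET, SO_SPLICE, &to, sizeof(int));
 *	// dissolve an existing splice:
 *	setsockopt(from, SOL_SOCKET, SO_SPLICE, &off, sizeof(int));
 */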
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	sbassertlocked(&so->so_rcv);
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	solock(so);
	/*
	 * Depending on socket type, sblock(&so->so_rcv) or solock()
	 * is always held while modifying SB_SPLICE and
	 * so->so_sp->ssp_socket.
	 */
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
	sbunlock(&so->so_rcv);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int doyield = 0;
	int sockstream = (so->so_proto->pr_flags & PR_WANTRCVD);

	/*
	 * sblock() on `so_rcv' protects sockets from being unspliced
	 * in the UDP case.  TCP sockets still rely on solock().
	 */

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		struct socket *sosp = so->so_sp->ssp_socket;

		if (sockstream) {
			sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
			solock(so);
			doyield = 1;
		}

		somove(so, M_DONTWAIT);

		if (sockstream) {
			sounlock(so);
			sbunlock(&sosp->so_snd);
		}
	}

	sbunlock(&so->so_rcv);

	if (doyield) {
		/* Avoid user land starvation. */
		yield();
	}
}

/*
 * Move data from the receive buffer of the spliced source socket to
 * the send buffer of the drain socket.  Try to move as much as possible
 * in one big chunk.  Originally a TCP-only implementation; datagram
 * sockets are handled as well.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0, unsplice = 0;
	unsigned int rcvstate;
	int sockdgram = ((so->so_proto->pr_flags &
	    PR_WANTRCVD) == 0);

	if (sockdgram)
		sbassertlocked(&so->so_rcv);
	else {
		sbassertlocked(&sosp->so_snd);
		soassertlocked(so);
	}

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

nextpkt:
	if ((error = READ_ONCE(so->so_error)))
		goto release;
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}

	error = READ_ONCE(sosp->so_error);
	if (error) {
		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
			goto release;
		error = 0;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace_locked(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD) {
			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			pru_rcvd(so);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);
		}
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if a loop is detected by the
	 * counter.
	 *
	 * If we deal with a looped broadcast/multicast packet we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD) {
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
		pru_rcvd(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
	}

	/* The receive buffer has shrunk by len bytes; adjust the oob mark. */
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				mtx_leave(&sosp->so_snd.sb_mtx);
				mtx_leave(&so->so_rcv.sb_mtx);
				error = pru_send(sosp, m, NULL, NULL);
				mtx_enter(&so->so_rcv.sb_mtx);
				mtx_enter(&sosp->so_snd.sb_mtx);

				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);

			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			error = pru_sendoob(sosp, o, NULL, NULL);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);

			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (sockdgram)
		solock_shared(sosp);
	error = pru_send(sosp, m, NULL, NULL);
	if (sockdgram)
		sounlock_shared(sosp);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
		    sosp->so_pcb == NULL)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;

	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		WRITE_ONCE(so->so_error, error);

	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error)
		unsplice = 1;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (unsplice) {
		if (sockdgram)
			solock(so);
		sounsplice(so, sosp, 0);
		if (sockdgram)
			sounlock(so);

		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_rcv);
		if (so->so_rcv.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq, &so->so_splicetask);
		if (isspliced(so)) {
			sb_mtx_unlock(&so->so_rcv);
			return;
		}
		sb_mtx_unlock(&so->so_rcv);
	}
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_snd);
		if (so->so_snd.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq,
			    &so->so_sp->ssp_soback->so_splicetask);
		if (issplicedback(so)) {
			sb_mtx_unlock(&so->so_snd);
			return;
		}
		sb_mtx_unlock(&so->so_snd);
	}
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
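
/*
 * Example for the SO_SNDTIMEO/SO_RCVTIMEO conversion above (an editorial
 * userland sketch, not part of the original file): a zero timeval maps
 * to INFSLP, i.e. no timeout at all.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	// recv(2) now fails with EWOULDBLOCK after 2.5 seconds
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */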
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

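/*
 * The remainder of this file is the kqueue(2) backend for sockets.
 * sofilt_lock() and sofilt_unlock() take whatever lock protects the
 * socket's state -- the shared net lock for inet sockets, an exclusive
 * hold on the per-socket rwlock for every other domain -- before
 * taking the buffer mutex, so the filter callbacks below can examine
 * socket and buffer state consistently.
 */
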
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		short qlen = READ_ONCE(so->so_qlen);

		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = qlen || soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

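/*
 * filt_soread() honours NOTE_LOWAT, letting userland override the
 * receive low-water mark per event.  Illustrative sketch, not part of
 * this file ("kq" and "s" are placeholder descriptors):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 * With this the read filter only fires once at least 512 bytes are
 * buffered, instead of at the socket's sb_lowat default.
 */
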
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace_locked(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (((state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		u_int state = READ_ONCE(so->so_state);

		if (state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

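/*
 * Out-of-band data is watched through EVFILT_EXCEPT with NOTE_OOB in
 * fflags, handled by filt_soexcept() above.  Illustrative sketch, not
 * part of this file ("kq" and "s" are placeholder descriptors):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_EXCEPT, EV_ADD, NOTE_OOB, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 */
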
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif