/*	$OpenBSD: uipc_socket.c,v 1.349 2024/12/27 10:18:04 mvs Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	switch (dp->dom_family) {
	case AF_INET:
	case AF_INET6:
		switch (prp->pr_type) {
		case SOCK_RAW:
		case SOCK_DGRAM:
			so->so_snd.sb_flags |= SB_MTXLOCK;
			/* FALLTHROUGH */
		case SOCK_STREAM:
			so->so_rcv.sb_flags |= SB_MTXLOCK;
			break;
		}
		break;
	case AF_KEY:
	case AF_ROUTE:
	case AF_UNIX:
	case AF_FRAME:
		so->so_snd.sb_flags |= SB_MTXLOCK;
		so->so_rcv.sb_flags |= SB_MTXLOCK;
		break;
	}

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
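
/*
 * Example (illustrative only, not part of this file's interfaces):
 * socreate() is normally entered through the socket(2) system call.
 * With proto 0 the protocol is chosen by pffindtype(); a protocol that
 * does not match the requested type yields EPROTOTYPE.  A sketch:
 *
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	if (s == -1)
 *		err(1, "socket");
 */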

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = atomic_load_int(&somaxconn);
	int sominconn_local = atomic_load_int(&sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (!keep_lock) {
		/*
		 * sofree() was called from soclose().  Sleep is safe
		 * even for tcp(4) sockets.
		 */
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	/*
	 * Unlocked dispose and cleanup is safe.  Socket is unlinked
	 * from everywhere.  Even a concurrent sotask() thread will not
	 * call somove().
	 */
	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

	if (!keep_lock) {
		sounlock(so);

#ifdef SOCKET_SPLICE
		if (so->so_sp) {
			timeout_del_barrier(&so->so_sp->ssp_idleto);
			task_del(sosplice_taskq, &so->so_sp->ssp_task);
			taskq_barrier(sosplice_taskq);
			pool_put(&sosplice_pool, so->so_sp);
		}
#endif /* SOCKET_SPLICE */
	}

	pool_put(&socket_pool, so);
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		struct socket *soback;

		sounlock(so);
		mtx_enter(&so->so_snd.sb_mtx);
		/*
		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
		 * both `so_snd' and `so_rcv' before unsplicing the sockets.
		 */
		if ((soback = so->so_sp->ssp_soback) == NULL) {
			mtx_leave(&so->so_snd.sb_mtx);
			goto notsplicedback;
		}
		soref(soback);
		mtx_leave(&so->so_snd.sb_mtx);

		/*
		 * `so' can only be unspliced, and never spliced again.
		 * Thus if the issplicedback(so) check is positive, the
		 * socket is still spliced and `ssp_soback' points to the
		 * same socket as `soback'.
		 */
		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			solock(soback);
			sounsplice(so->so_sp->ssp_soback, so, freeing);
			sounlock(soback);
		}
		sbunlock(&soback->so_rcv);
		sorele(soback);

notsplicedback:
		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			solock(so);
			sounsplice(so, so->so_sp->ssp_socket, freeing);
			sounlock(so);
		}
		sbunlock(&so->so_rcv);

		solock(so);
	}
#endif /* SOCKET_SPLICE */

	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;

	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}
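
/*
 * Example (illustrative only): the linger behaviour implemented by
 * soclose() above.  With SO_LINGER set, close(2) blocks for up to
 * l_linger seconds (see solinger_nsec()) while the disconnect drains.
 * A sketch:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *
 *	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1)
 *		err(1, "SO_LINGER");
 *	close(s);	(may sleep in "netcls" for up to 5 seconds)
 */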

void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}
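
/*
 * Example (illustrative only): because soconnect() above disconnects
 * first when the protocol is not PR_CONNREQUIRED, userland can re-target
 * a connected datagram socket simply by calling connect(2) again.
 * A sketch:
 *
 *	connect(s, (struct sockaddr *)&dst1, sizeof(dst1));
 *	...
 *	connect(s, (struct sockaddr *)&dst2, sizeof(dst2));
 */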

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace_locked(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}
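
/*
 * Example (illustrative only): the checks in sosend() above as seen from
 * userland.  On an atomic (datagram) socket a message larger than the
 * send buffer high-water mark fails with EMSGSIZE instead of blocking,
 * and MSG_DONTWAIT turns a full buffer into EWOULDBLOCK.  A sketch,
 * assuming a connected UDP socket `s':
 *
 *	char big[70000];
 *
 *	if (send(s, big, sizeof(big), 0) == -1)
 *		warn("send");		(expect EMSGSIZE)
 *	if (send(s, big, 1024, MSG_DONTWAIT) == -1 && errno == EWOULDBLOCK)
 *		warnx("send buffer full");
 */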

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbufs together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;
	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock_shared(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock_shared(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
			if (!isspliced(so))
#endif /* SOCKET_SPLICE */
				panic("receive 1: so %p, so_type %d, sb_cc %lu",
				    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		sb_mtx_unlock(&so->so_rcv);
		if (dosolock)
			sounlock_shared(so);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					if (dosolock)
						sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					if (dosolock)
						solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(so, &so->so_rcv)) {
				sb_mtx_unlock(&so->so_rcv);
				if (dosolock)
					sounlock_shared(so);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			if (!dosolock)
				solock_shared(so);
			pru_rcvd(so);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_rcv);
	return (error);
}
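
/*
 * Example (illustrative only): driving soreceive() from userland.
 * MSG_PEEK returns data without consuming it; MSG_WAITALL asks the loop
 * above to keep sleeping until the full amount has arrived, subject to
 * the short-count caveats in the comment before soreceive().  A sketch:
 *
 *	char buf[512];
 *	ssize_t n;
 *
 *	n = recv(s, buf, sizeof(buf), MSG_PEEK);	(data stays queued)
 *	n = recv(s, buf, sizeof(buf), MSG_WAITALL);	(may still be short)
 */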

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}
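
/*
 * Example (illustrative only): the three shutdown(2) modes handled by
 * soshutdown() above.  SHUT_RD only flushes the receive buffer via
 * sorflush(); SHUT_WR reaches the protocol through pru_shutdown().
 * A sketch:
 *
 *	shutdown(s, SHUT_WR);	(send FIN on TCP, keep reading)
 *	shutdown(s, SHUT_RD);	(discard queued receive data)
 *	shutdown(s, SHUT_RDWR);	(both)
 */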

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
		if (so->so_options & SO_ACCEPTCONN) {
			error = EOPNOTSUPP;
			goto out;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto out;
		}

		if (so->so_sp && so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
 out:
		sounlock(so);
		sbunlock(&so->so_rcv);
		return (error);
	}

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	solock(so);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_flags(&so->so_idleto, soidle, so,
	    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent sorwakeup() calling somove() before this somove()
	 * has finished, the socket buffers are not marked as spliced yet.
	 */

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		sounlock(so);
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}
	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		solock(so);

release:
	sounlock(so);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);

	return (error);
}
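
/*
 * Example (illustrative only): splicing two connected sockets from
 * userland via setsockopt(2), as handled by sosplice() above.  A sketch;
 * `from' and `to' are assumed to be connected stream sockets.
 *
 *	struct splice sp;
 *
 *	memset(&sp, 0, sizeof(sp));
 *	sp.sp_fd = to;			(drain socket)
 *	sp.sp_max = 0;			(no byte limit)
 *	timerclear(&sp.sp_idle);	(no idle timeout)
 *	if (setsockopt(from, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
 *		err(1, "SO_SPLICE");
 *
 * Passing an int with value -1 instead of a struct splice dissolves the
 * splice; getsockopt(SO_SPLICE) reports the number of bytes moved so far.
 */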

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	sbassertlocked(&so->so_rcv);
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	solock(so);
	/*
	 * Depending on socket type, sblock(&so->so_rcv) or solock()
	 * is always held while modifying SB_SPLICE and
	 * so->so_sp->ssp_socket.
	 */
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
	sbunlock(&so->so_rcv);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int doyield = 0;
	int sockstream = (so->so_proto->pr_flags & PR_WANTRCVD);

	/*
	 * sblock() on `so_rcv' protects sockets from being unspliced
	 * in the UDP case.  TCP sockets still rely on solock().
	 */

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		struct socket *sosp = so->so_sp->ssp_socket;

		if (sockstream) {
			sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
			solock(so);
			doyield = 1;
		}

		somove(so, M_DONTWAIT);

		if (sockstream) {
			sounlock(so);
			sbunlock(&sosp->so_snd);
		}
	}

	sbunlock(&so->so_rcv);

	if (doyield) {
		/* Avoid user land starvation. */
		yield();
	}
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0, unsplice = 0;
	unsigned int rcvstate;
	int sockdgram = ((so->so_proto->pr_flags & PR_WANTRCVD) == 0);

	if (sockdgram)
		sbassertlocked(&so->so_rcv);
	else {
		sbassertlocked(&sosp->so_snd);
		soassertlocked(so);
	}

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

nextpkt:
	if ((error = READ_ONCE(so->so_error)))
		goto release;
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}

	error = READ_ONCE(sosp->so_error);
	if (error) {
		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
			goto release;
		error = 0;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace_locked(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD) {
			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			pru_rcvd(so);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);
		}
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with a looped broadcast/multicast packet we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD) {
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
		pru_rcvd(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
	}

	/* Receive buffer did shrink by len bytes, adjust oob. */
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				mtx_leave(&sosp->so_snd.sb_mtx);
				mtx_leave(&so->so_rcv.sb_mtx);
				error = pru_send(sosp, m, NULL, NULL);
				mtx_enter(&so->so_rcv.sb_mtx);
				mtx_enter(&sosp->so_snd.sb_mtx);

				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);

			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			error = pru_sendoob(sosp, o, NULL, NULL);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);

			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (sockdgram)
		solock_shared(sosp);
	error = pru_send(sosp, m, NULL, NULL);
	if (sockdgram)
		sounlock_shared(sosp);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
		    sosp->so_pcb == NULL)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;

	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		WRITE_ONCE(so->so_error, error);

	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error)
		unsplice = 1;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (unsplice) {
		if (sockdgram)
			solock(so);
		sounsplice(so, sosp, 0);
		if (sockdgram)
			sounlock(so);

		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_rcv);
		if (so->so_rcv.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq, &so->so_splicetask);
		if (isspliced(so)) {
			sb_mtx_unlock(&so->so_rcv);
			return;
		}
		sb_mtx_unlock(&so->so_rcv);
	}
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_snd);
		if (so->so_snd.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq,
			    &so->so_sp->ssp_soback->so_splicetask);
		if (issplicedback(so)) {
			sb_mtx_unlock(&so->so_snd);
			return;
		}
		sb_mtx_unlock(&so->so_snd);
	}
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if ((sb->sb_flags & SB_MTXLOCK) == 0)
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if ((sb->sb_flags & SB_MTXLOCK) == 0)
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
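
/*
 * Example (illustrative only): tuning socket buffers through the
 * sosetopt() paths above.  A sketch; the values are arbitrary.
 * SO_RCVBUF fails with ENOBUFS if the reservation is rejected, and a
 * zero SO_RCVTIMEO timeval means "wait forever" (INFSLP).
 *
 *	int sz = 128 * 1024;
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)) == -1)
 *		warn("SO_RCVBUF");
 *	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
 *		warn("SO_RCVTIMEO");
 */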

int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}
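/*
 * Editor's illustration, not part of the original source: on the get
 * side, SO_SPLICE reports the number of bytes spliced so far as an
 * off_t, and SO_ERROR returns and clears the pending socket error.
 * With "from" an assumed spliced socket:
 *
 *	off_t len;
 *	int error;
 *	socklen_t optlen;
 *
 *	optlen = sizeof(len);
 *	getsockopt(from, SOL_SOCKET, SO_SPLICE, &len, &optlen);
 *	optlen = sizeof(error);
 *	getsockopt(from, SOL_SOCKET, SO_ERROR, &error, &optlen);
 *
 * The EFBIG that somove() records once a splice reaches its sp_max
 * limit is delivered to userland through this SO_ERROR path.
 */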
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}

void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		short qlen = READ_ONCE(so->so_qlen);

		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = qlen || soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}
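/*
 * Editor's illustration, not part of the original source: filt_soread()
 * above backs EVFILT_READ on sockets.  A userland consumer that only
 * wants to be woken once at least 512 bytes are buffered can request
 * NOTE_LOWAT, which the filter compares against kn_sdata instead of
 * the socket's sb_lowat.  "s" is an assumed readable socket:
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * On listening sockets the same filter instead reports the length of
 * the completed connection queue, so kevent(2) can drive accept(2).
 */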
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace_locked(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (((state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		u_int state = READ_ONCE(so->so_state);

		if (state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}
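/*
 * Editor's note, not part of the original source: filt_soexcept()
 * above reports urgent data through EVFILT_EXCEPT when NOTE_OOB is
 * requested, e.g.:
 *
 *	EV_SET(&kev, s, EVFILT_EXCEPT, EV_ADD, NOTE_OOB, 0, NULL);
 *
 * The filt_so{r,w}modify/process wrappers serialize knote updates with
 * sofilt_lock(): the shared net lock for inet sockets, the per-socket
 * rwlock otherwise, plus the relevant sockbuf mutex in either case.
 */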
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
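/*
 * Editor's note, not part of the original source: sobuf_print() and
 * so_print() above are compiled only with the kernel debugger and are
 * meant to be reached from ddb(4), e.g. via "show socket <addr>",
 * dumping a socket, its splice state, and both of its sockbufs.
 */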