/*	$OpenBSD: uipc_socket.c,v 1.353 2025/01/01 13:44:22 bluhm Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);

int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);

int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowmodify,
	.f_process	= filt_sowprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_sormodify,
	.f_process	= filt_sorprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(const struct protosw *prp, int wait)
{
	const struct domain *dp = prp->pr_domain;
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK);
	refcnt_init(&so->so_refcnt);
	rw_init(&so->so_rcv.sb_lock, "sbufrcv");
	rw_init(&so->so_snd.sb_lock, "sbufsnd");
	mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
	mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
	klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
	klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);

	so->so_snd.sb_flags |= SB_MTXLOCK;
	so->so_rcv.sb_flags |= SB_MTXLOCK;

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(prp, M_WAIT);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}
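/*
 * Illustrative sketch, not part of the kernel build: how the socket(2)
 * arguments map onto socreate() above.  A userland call like the one below
 * arrives here with dom == AF_INET, type == SOCK_STREAM and proto == 0, so
 * the protocol is looked up with pffindtype().  The example_* function name
 * is hypothetical; the calls themselves are the standard sockets API.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

int
example_socreate_mapping(void)
{
	/* proto == 0: socreate() uses pffindtype(AF_INET, SOCK_STREAM). */
	int s = socket(AF_INET, SOCK_STREAM, 0);

	if (s == -1) {
		/* EPROTONOSUPPORT/EPROTOTYPE mirror the checks above. */
		perror("socket");
		return (-1);
	}
	return (s);
}
#endif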
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int somaxconn_local = atomic_load_int(&somaxconn);
	int sominconn_local = atomic_load_int(&sominconn);
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
	case SOCK_SEQPACKET:
		break;
	default:
		return (EOPNOTSUPP);
	}

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn_local)
		backlog = somaxconn_local;
	if (backlog < sominconn_local)
		backlog = sominconn_local;
	so->so_qlimit = backlog;
	return (0);
}
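/*
 * Illustrative sketch, not part of the kernel build: the backlog clamping
 * performed by solisten() above, extracted into a pure function.  SOMINCONN
 * defaults to 80 (see the #define earlier in this file); somaxconn and
 * sominconn are the adjustable values actually consulted.
 */
#if 0
static int
example_clamp_backlog(int backlog, int maxconn, int minconn)
{
	if (backlog < 0 || backlog > maxconn)
		backlog = maxconn;	/* listen(s, -1) gets the maximum */
	if (backlog < minconn)
		backlog = minconn;	/* tiny backlogs are raised */
	return (backlog);
}
#endif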
void
sorele(struct socket *so, int keep_lock)
{
	if (keep_lock == 0)
		sounlock(so);

	if (refcnt_rele(&so->so_refcnt) == 0)
		return;

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_klist);
	klist_free(&so->so_snd.sb_klist);

	mtx_enter(&so->so_snd.sb_mtx);
	sbrelease(so, &so->so_snd);
	mtx_leave(&so->so_snd.sb_mtx);

	if (so->so_proto->pr_flags & PR_RIGHTS &&
	    so->so_proto->pr_domain->dom_dispose)
		(*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	m_purge(so->so_rcv.sb_mb);

#ifdef SOCKET_SPLICE
	if (so->so_sp)
		pool_put(&sosplice_pool, so->so_sp);
#endif
	pool_put(&socket_pool, so);
}

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(so);
				sorele(head, 0);
				return;
			}
		}

		soqremque(so, 0);

		if (persocket)
			sorele(head, 0);
	}

	sorele(so, keep_lock);
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		struct socket *soback;

		sounlock(so);
		mtx_enter(&so->so_snd.sb_mtx);
		/*
		 * Concurrent sounsplice() locks `sb_mtx' mutexes on
		 * both `so_snd' and `so_rcv' before unsplicing the sockets.
		 */
		if ((soback = so->so_sp->ssp_soback) == NULL) {
			mtx_leave(&so->so_snd.sb_mtx);
			goto notsplicedback;
		}
		soref(soback);
		mtx_leave(&so->so_snd.sb_mtx);

		/*
		 * `so' can only be unspliced, and never spliced again.
		 * Thus if the issplicedback(so) check is positive, the
		 * socket is still spliced and `ssp_soback' points to the
		 * same socket as `soback'.
		 */
		sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			solock(soback);
			sounsplice(so->so_sp->ssp_soback, so, freeing);
			sounlock(soback);
		}
		sbunlock(&soback->so_rcv);
		solock(soback);
		sorele(soback, 0);

notsplicedback:
		sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			solock(so);
			sounsplice(so, so->so_sp->ssp_socket, freeing);
			sounlock(so);
		}
		sbunlock(&so->so_rcv);

		timeout_del_barrier(&so->so_sp->ssp_idleto);
		task_del(sosplice_taskq, &so->so_sp->ssp_task);
		taskq_barrier(sosplice_taskq);

		solock(so);
	}
#endif /* SOCKET_SPLICE */

	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;

	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}
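/*
 * Illustrative sketch, not part of the kernel build: arming the SO_LINGER
 * path of soclose() above from userland.  With l_onoff set, close(2) can
 * block in the "netcls" sleep for up to l_linger seconds while the
 * disconnect completes; per solinger_nsec() above, l_linger == 0 yields an
 * infinite sleep (INFSLP).  The example_* name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static int
example_close_with_linger(int s)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };	/* 5 seconds */

	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1)
		return (-1);
	return (close(s));		/* may sleep in "netcls" */
}
#endif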
void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;
	int dosolock = ((so->so_snd.sb_flags & SB_MTXLOCK) == 0);

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_snd);
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if ((error = READ_ONCE(so->so_error))) {
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace_locked(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			sb_mtx_unlock(&so->so_snd);
			if (dosolock)
				sounlock_shared(so);
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sb_mtx_unlock(&so->so_snd);
				if (dosolock)
					sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				if (dosolock)
					solock_shared(so);
				sb_mtx_lock(&so->so_snd);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			sb_mtx_unlock(&so->so_snd);
			if (!dosolock)
				solock_shared(so);
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_snd);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sb_mtx_unlock(&so->so_snd);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_snd);
out:
	m_freem(top);
	m_freem(control);
	return (error);
}
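/*
 * Illustrative sketch, not part of the kernel build: the "must go all at
 * once" rule enforced by sosend() above.  On an atomic socket (one where
 * sosendallatonce() is true, e.g. SOCK_DGRAM), a datagram larger than the
 * send buffer high-water mark fails with EMSGSIZE instead of blocking.
 * The 1 MB buffer size is an assumption chosen to exceed so_snd.sb_hiwat.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>

static ssize_t
example_atomic_send(int udp_fd, const struct sockaddr *dst, socklen_t dstlen)
{
	static char big[1024 * 1024];	/* assumed > so_snd.sb_hiwat */
	ssize_t n;

	n = sendto(udp_fd, big, sizeof(big), 0, dst, dstlen);
	if (n == -1 && errno == EMSGSIZE) {
		/* snderr(EMSGSIZE) was taken in sosend(). */
	}
	return (n);
}
#endif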
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network stack for the entire time here,
 * we release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, error2, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;
	int dosolock = ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0);

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock_shared(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock_shared(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	if (dosolock)
		solock_shared(so);
	sb_mtx_lock(&so->so_rcv);

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if ((error2 = READ_ONCE(so->so_error))) {
			if (m)
				goto dontblock;
			error = error2;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		sb_mtx_unlock(&so->so_rcv);
		if (dosolock)
			sounlock_shared(so);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sb_mtx_unlock(&so->so_rcv);
					if (dosolock)
						sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					if (dosolock)
						solock_shared(so);
					sb_mtx_lock(&so->so_rcv);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose) {
					sb_mtx_unlock(&so->so_rcv);
					pr->pr_domain->dom_dispose(cm);
					sb_mtx_lock(&so->so_rcv);
				}
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_rcv.sb_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must be consistent here (it points to the
		 * current mbuf and to the next record) while we drop the
		 * locks; we must note any additions to the sockbuf made
		 * while we were not holding them.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sb_mtx_unlock(&so->so_rcv);
			if (dosolock)
				sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			if (dosolock)
				solock_shared(so);
			sb_mtx_lock(&so->so_rcv);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (sbwait(so, &so->so_rcv)) {
				sb_mtx_unlock(&so->so_rcv);
				if (dosolock)
					sounlock_shared(so);
				sbunlock(&so->so_rcv);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD) {
			sb_mtx_unlock(&so->so_rcv);
			if (!dosolock)
				solock_shared(so);
			pru_rcvd(so);
			if (!dosolock)
				sounlock_shared(so);
			sb_mtx_lock(&so->so_rcv);
		}
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sb_mtx_unlock(&so->so_rcv);
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sb_mtx_unlock(&so->so_rcv);
	if (dosolock)
		sounlock_shared(so);
	sbunlock(&so->so_rcv);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		solock(so);
		error = pru_shutdown(so);
		sounlock(so);
		break;
	default:
		error = EINVAL;
		break;
	}

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);

	solock_shared(so);
	socantrcvmore(so);
	mtx_enter(&sb->sb_mtx);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	mtx_leave(&sb->sb_mtx);
	sounlock_shared(so);
	sbunlock(sb);

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}
#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		solock(so);
		if (so->so_options & SO_ACCEPTCONN) {
			error = EOPNOTSUPP;
			goto out;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto out;
		}

		if (so->so_sp && so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
 out:
		sounlock(so);
		sbunlock(&so->so_rcv);
		return (error);
	}

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	solock(so);

	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	if (so->so_sp == NULL)
		so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (sosp->so_sp == NULL)
		sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_flags(&so->so_idleto, soidle, so,
	    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent sorwakeup() calling somove() before this somove()
	 * has finished, the socket buffers are not marked as spliced yet.
	 */

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		sounlock(so);
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}
	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		solock(so);

release:
	sounlock(so);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);

	return (error);
}
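/*
 * Illustrative sketch, not part of the kernel build: driving sosplice()
 * above from userland through setsockopt(2).  Passing a struct splice
 * selects the full three-argument form (drain fd, maximum byte count, idle
 * timeout); a plain int splices without limits, and -1 unsplices.  The
 * option parsing itself lives in the SO_SPLICE case of sosetopt() further
 * down.  The example_* name is hypothetical.
 */
#if 0
#include <sys/socket.h>

static int
example_splice(int source_fd, int drain_fd)
{
	struct splice sp = {
		.sp_fd = drain_fd,
		.sp_max = 0,			/* no byte limit */
		.sp_idle = { 30, 0 },		/* ETIMEDOUT after 30s idle */
	};

	return (setsockopt(source_fd, SOL_SOCKET, SO_SPLICE,
	    &sp, sizeof(sp)));
}
#endif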
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	sbassertlocked(&so->so_rcv);
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	solock(so);
	/*
	 * Depending on socket type, sblock(&so->so_rcv) or solock()
	 * is always held while modifying SB_SPLICE and
	 * so->so_sp->ssp_socket.
	 */
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
	sbunlock(&so->so_rcv);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int doyield = 0;
	int sockstream = (so->so_proto->pr_flags & PR_WANTRCVD);

	/*
	 * sblock() on `so_rcv' protects sockets from being unspliced
	 * in the UDP case.  TCP sockets still rely on solock().
	 */

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		struct socket *sosp = so->so_sp->ssp_socket;

		if (sockstream) {
			sblock(&sosp->so_snd, SBL_WAIT | SBL_NOINTR);
			solock(so);
			doyield = 1;
		}

		somove(so, M_DONTWAIT);

		if (sockstream) {
			sounlock(so);
			sbunlock(&sosp->so_snd);
		}
	}

	sbunlock(&so->so_rcv);

	if (doyield) {
		/* Avoid user land starvation. */
		yield();
	}
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0, unsplice = 0;
	unsigned int rcvstate;
	int sockdgram = ((so->so_proto->pr_flags &
	    PR_WANTRCVD) == 0);

	if (sockdgram)
		sbassertlocked(&so->so_rcv);
	else {
		sbassertlocked(&sosp->so_snd);
		soassertlocked(so);
	}

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

nextpkt:
	if ((error = READ_ONCE(so->so_error)))
		goto release;
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}

	error = READ_ONCE(sosp->so_error);
	if (error) {
		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
			goto release;
		error = 0;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace_locked(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD) {
			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			pru_rcvd(so);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);
		}
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve the splicing with an error if a loop is detected
	 * by the counter.
	 *
	 * If we deal with a looped broadcast/multicast packet, we bail out
	 * with no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD) {
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
		pru_rcvd(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
	}

	/* Receive buffer did shrink by len bytes, adjust oob. */
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				mtx_leave(&sosp->so_snd.sb_mtx);
				mtx_leave(&so->so_rcv.sb_mtx);
				error = pru_send(sosp, m, NULL, NULL);
				mtx_enter(&so->so_rcv.sb_mtx);
				mtx_enter(&sosp->so_snd.sb_mtx);

				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);

			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			error = pru_sendoob(sosp, o, NULL, NULL);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);

			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (sockdgram)
		solock_shared(sosp);
	error = pru_send(sosp, m, NULL, NULL);
	if (sockdgram)
		sounlock_shared(sosp);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
		    sosp->so_pcb == NULL)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;

	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		WRITE_ONCE(so->so_error, error);

	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error)
		unsplice = 1;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (unsplice) {
		if (sockdgram)
			solock(so);
		sounsplice(so, sosp, 0);
		if (sockdgram)
			sounlock(so);

		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_rcv);
		if (so->so_rcv.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq, &so->so_splicetask);
		if (isspliced(so)) {
			sb_mtx_unlock(&so->so_rcv);
			return;
		}
		sb_mtx_unlock(&so->so_rcv);
	}
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (so->so_proto->pr_flags & PR_SPLICE) {
		sb_mtx_lock(&so->so_snd);
		if (so->so_snd.sb_flags & SB_SPLICE)
			task_add(sosplice_taskq,
			    &so->so_sp->ssp_soback->so_splicetask);
		if (issplicedback(so)) {
			sb_mtx_unlock(&so->so_snd);
			return;
		}
		sb_mtx_unlock(&so->so_snd);
	}
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;
		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				solock(so);
			mtx_enter(&sb->sb_mtx);

			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}

			mtx_leave(&sb->sb_mtx);
			if (((sb->sb_flags & SB_MTXLOCK) == 0))
				sounlock(so);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
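/*
 * Illustrative sketch, not part of the kernel build: setting the receive
 * timeout handled by the SO_SNDTIMEO/SO_RCVTIMEO case of sosetopt() above.
 * The timeval is converted to nanoseconds for sb_timeo_nsecs; a zero
 * timeout means "wait forever" (INFSLP) and an unrepresentable one yields
 * EDOM.  The example_* name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
example_set_rcvtimeo(int s)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	/* recv(2) should fail with EWOULDBLOCK after 2.5 seconds. */
	return (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)));
}
#endif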
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	knote(&so->so_rcv.sb_klist, 0);
}
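
/*
 * sofilt_lock() and sofilt_unlock() take and release the locks that
 * the socket event filters below rely on: the shared net lock for
 * PF_INET and PF_INET6 sockets, the per-socket rwlock for every other
 * domain, plus, in both cases, the mutex of the sockbuf being polled.
 */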
void
sofilt_lock(struct socket *so, struct sockbuf *sb)
{
	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_LOCK_SHARED();
		break;
	default:
		rw_enter_write(&so->so_lock);
		break;
	}

	mtx_enter(&sb->sb_mtx);
}

void
sofilt_unlock(struct socket *so, struct sockbuf *sb)
{
	mtx_leave(&sb->sb_mtx);

	switch (so->so_proto->pr_domain->dom_family) {
	case PF_INET:
	case PF_INET6:
		NET_UNLOCK_SHARED();
		break;
	default:
		rw_exit_write(&so->so_lock);
		break;
	}
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert(&sb->sb_klist, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_klist, kn);
}
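
/*
 * EVFILT_READ filter.  For a listening socket kn_data is the number of
 * completed connections; otherwise it is the byte count in the receive
 * buffer, and the knote fires once that reaches the receive low water
 * mark, or kn_sdata if the caller asked for NOTE_LOWAT.  A userland
 * sketch of the NOTE_LOWAT case, for illustration only (kq and s are
 * placeholder descriptors):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */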
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	if (so->so_options & SO_ACCEPTCONN) {
		short qlen = READ_ONCE(so->so_qlen);

		if (so->so_rcv.sb_flags & SB_MTXLOCK)
			soassertlocked_readonly(so);

		kn->kn_data = qlen;
		rv = (kn->kn_data != 0);

		if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
			if (state & SS_ISDISCONNECTED) {
				kn->kn_flags |= __EV_HUP;
				rv = 1;
			} else {
				rv = qlen || soreadable(so);
			}
		}

		return rv;
	}

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	u_int state = READ_ONCE(so->so_state);
	u_int error = READ_ONCE(so->so_error);
	int rv;

	MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
	if ((so->so_snd.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

	kn->kn_data = sbspace_locked(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = error;
		rv = 1;
	} else if (error) {
		rv = 1;
	} else if (((state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
	if ((so->so_rcv.sb_flags & SB_MTXLOCK) == 0)
		soassertlocked_readonly(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		u_int state = READ_ONCE(so->so_state);

		if (state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_sowmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sowprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_snd);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_snd);

	return (rv);
}

int
filt_sormodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_modify(kev, kn);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}

int
filt_sorprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	sofilt_lock(so, &so->so_rcv);
	rv = knote_process(kn, kev);
	sofilt_unlock(so, &so->so_rcv);

	return (rv);
}
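
/*
 * What remains is only built with the kernel debugger: helpers that
 * dump sockbuf and socket state from ddb(4).
 */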
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif