1 /* $OpenBSD: uipc_socket.c,v 1.314 2024/01/12 10:48:03 bluhm Exp $ */ 2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 #include <sys/file.h> 39 #include <sys/filedesc.h> 40 #include <sys/malloc.h> 41 #include <sys/mbuf.h> 42 #include <sys/domain.h> 43 #include <sys/event.h> 44 #include <sys/protosw.h> 45 #include <sys/socket.h> 46 #include <sys/unpcb.h> 47 #include <sys/socketvar.h> 48 #include <sys/signalvar.h> 49 #include <sys/pool.h> 50 #include <sys/atomic.h> 51 #include <sys/rwlock.h> 52 #include <sys/time.h> 53 #include <sys/refcnt.h> 54 55 #ifdef DDB 56 #include <machine/db_machdep.h> 57 #endif 58 59 void sbsync(struct sockbuf *, struct mbuf *); 60 61 int sosplice(struct socket *, int, off_t, struct timeval *); 62 void sounsplice(struct socket *, struct socket *, int); 63 void soidle(void *); 64 void sotask(void *); 65 void soreaper(void *); 66 void soput(void *); 67 int somove(struct socket *, int); 68 void sorflush(struct socket *); 69 70 void filt_sordetach(struct knote *kn); 71 int filt_soread(struct knote *kn, long hint); 72 void filt_sowdetach(struct knote *kn); 73 int filt_sowrite(struct knote *kn, long hint); 74 int filt_soexcept(struct knote *kn, long hint); 75 int filt_solisten(struct knote *kn, long hint); 76 int filt_somodify(struct kevent *kev, struct knote *kn); 77 int filt_soprocess(struct knote *kn, struct kevent *kev); 78 79 const struct filterops solisten_filtops = { 80 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 81 .f_attach = NULL, 82 .f_detach = filt_sordetach, 83 .f_event = filt_solisten, 84 .f_modify = filt_somodify, 85 .f_process = filt_soprocess, 86 }; 87 88 const struct filterops soread_filtops = { 89 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 90 .f_attach = NULL, 91 .f_detach = filt_sordetach, 92 .f_event = filt_soread, 93 .f_modify = filt_somodify, 94 .f_process = filt_soprocess, 95 }; 96 97 const struct filterops sowrite_filtops = { 98 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 99 .f_attach = NULL, 100 .f_detach = filt_sowdetach, 101 .f_event = filt_sowrite, 102 .f_modify = filt_somodify, 103 .f_process = filt_soprocess, 104 }; 105 106 const struct filterops soexcept_filtops = { 107 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 108 .f_attach = NULL, 109 .f_detach = filt_sordetach, 110 .f_event = filt_soexcept, 111 .f_modify = filt_somodify, 112 .f_process = filt_soprocess, 113 }; 114 115 void klist_soassertlk(void *); 116 int klist_solock(void *); 117 void klist_sounlock(void *, int); 118 119 const struct klistops socket_klistops = { 120 .klo_assertlk = klist_soassertlk, 121 .klo_lock = klist_solock, 122 .klo_unlock = klist_sounlock, 123 }; 124 125 #ifndef SOMINCONN 126 #define SOMINCONN 80 127 #endif /* SOMINCONN */ 128 129 int somaxconn = SOMAXCONN; 130 int sominconn = SOMINCONN; 131 132 struct pool socket_pool; 133 #ifdef SOCKET_SPLICE 134 struct pool sosplice_pool; 135 struct taskq *sosplice_taskq; 136 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk"); 137 #endif 138 139 void 140 soinit(void) 141 { 142 pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0, 143 "sockpl", NULL); 144 #ifdef SOCKET_SPLICE 145 pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0, 146 "sosppl", NULL); 147 #endif 148 } 149 150 struct socket * 151 soalloc(const struct domain *dp, int wait) 152 { 153 struct socket *so; 154 155 so = pool_get(&socket_pool, (wait == M_WAIT ? 
PR_WAITOK : PR_NOWAIT) | 156 PR_ZERO); 157 if (so == NULL) 158 return (NULL); 159 rw_init_flags(&so->so_lock, dp->dom_name, RWL_DUPOK); 160 refcnt_init(&so->so_refcnt); 161 klist_init(&so->so_rcv.sb_klist, &socket_klistops, so); 162 klist_init(&so->so_snd.sb_klist, &socket_klistops, so); 163 sigio_init(&so->so_sigio); 164 TAILQ_INIT(&so->so_q0); 165 TAILQ_INIT(&so->so_q); 166 167 return (so); 168 } 169 170 /* 171 * Socket operation routines. 172 * These routines are called by the routines in 173 * sys_socket.c or from a system process, and 174 * implement the semantics of socket operations by 175 * switching out to the protocol specific routines. 176 */ 177 int 178 socreate(int dom, struct socket **aso, int type, int proto) 179 { 180 struct proc *p = curproc; /* XXX */ 181 const struct protosw *prp; 182 struct socket *so; 183 int error; 184 185 if (proto) 186 prp = pffindproto(dom, proto, type); 187 else 188 prp = pffindtype(dom, type); 189 if (prp == NULL || prp->pr_usrreqs == NULL) 190 return (EPROTONOSUPPORT); 191 if (prp->pr_type != type) 192 return (EPROTOTYPE); 193 so = soalloc(pffinddomain(dom), M_WAIT); 194 so->so_type = type; 195 if (suser(p) == 0) 196 so->so_state = SS_PRIV; 197 so->so_ruid = p->p_ucred->cr_ruid; 198 so->so_euid = p->p_ucred->cr_uid; 199 so->so_rgid = p->p_ucred->cr_rgid; 200 so->so_egid = p->p_ucred->cr_gid; 201 so->so_cpid = p->p_p->ps_pid; 202 so->so_proto = prp; 203 so->so_snd.sb_timeo_nsecs = INFSLP; 204 so->so_rcv.sb_timeo_nsecs = INFSLP; 205 206 solock(so); 207 error = pru_attach(so, proto, M_WAIT); 208 if (error) { 209 so->so_state |= SS_NOFDREF; 210 /* sofree() calls sounlock(). */ 211 sofree(so, 0); 212 return (error); 213 } 214 sounlock(so); 215 *aso = so; 216 return (0); 217 } 218 219 int 220 sobind(struct socket *so, struct mbuf *nam, struct proc *p) 221 { 222 soassertlocked(so); 223 return pru_bind(so, nam, p); 224 } 225 226 int 227 solisten(struct socket *so, int backlog) 228 { 229 int error; 230 231 soassertlocked(so); 232 233 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 234 return (EINVAL); 235 #ifdef SOCKET_SPLICE 236 if (isspliced(so) || issplicedback(so)) 237 return (EOPNOTSUPP); 238 #endif /* SOCKET_SPLICE */ 239 error = pru_listen(so); 240 if (error) 241 return (error); 242 if (TAILQ_FIRST(&so->so_q) == NULL) 243 so->so_options |= SO_ACCEPTCONN; 244 if (backlog < 0 || backlog > somaxconn) 245 backlog = somaxconn; 246 if (backlog < sominconn) 247 backlog = sominconn; 248 so->so_qlimit = backlog; 249 return (0); 250 } 251 252 #define SOSP_FREEING_READ 1 253 #define SOSP_FREEING_WRITE 2 254 void 255 sofree(struct socket *so, int keep_lock) 256 { 257 int persocket = solock_persocket(so); 258 259 soassertlocked(so); 260 261 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { 262 if (!keep_lock) 263 sounlock(so); 264 return; 265 } 266 if (so->so_head) { 267 struct socket *head = so->so_head; 268 269 /* 270 * We must not decommission a socket that's on the accept(2) 271 * queue. If we do, then accept(2) may hang after select(2) 272 * indicated that the listening socket was ready. 273 */ 274 if (so->so_onq == &head->so_q) { 275 if (!keep_lock) 276 sounlock(so); 277 return; 278 } 279 280 if (persocket) { 281 /* 282 * Concurrent close of `head' could 283 * abort `so' due to re-lock. 
284 */ 285 soref(so); 286 soref(head); 287 sounlock(so); 288 solock(head); 289 solock(so); 290 291 if (so->so_onq != &head->so_q0) { 292 sounlock(head); 293 sounlock(so); 294 sorele(head); 295 sorele(so); 296 return; 297 } 298 299 sorele(head); 300 sorele(so); 301 } 302 303 soqremque(so, 0); 304 305 if (persocket) 306 sounlock(head); 307 } 308 309 if (persocket) { 310 sounlock(so); 311 refcnt_finalize(&so->so_refcnt, "sofinal"); 312 solock(so); 313 } 314 315 sigio_free(&so->so_sigio); 316 klist_free(&so->so_rcv.sb_klist); 317 klist_free(&so->so_snd.sb_klist); 318 #ifdef SOCKET_SPLICE 319 if (issplicedback(so)) { 320 int freeing = SOSP_FREEING_WRITE; 321 322 if (so->so_sp->ssp_soback == so) 323 freeing |= SOSP_FREEING_READ; 324 sounsplice(so->so_sp->ssp_soback, so, freeing); 325 } 326 if (isspliced(so)) { 327 int freeing = SOSP_FREEING_READ; 328 329 if (so == so->so_sp->ssp_socket) 330 freeing |= SOSP_FREEING_WRITE; 331 sounsplice(so, so->so_sp->ssp_socket, freeing); 332 } 333 #endif /* SOCKET_SPLICE */ 334 sbrelease(so, &so->so_snd); 335 sorflush(so); 336 if (!keep_lock) 337 sounlock(so); 338 #ifdef SOCKET_SPLICE 339 if (so->so_sp) { 340 /* Reuse splice idle, sounsplice() has been called before. */ 341 timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so); 342 timeout_add(&so->so_sp->ssp_idleto, 0); 343 } else 344 #endif /* SOCKET_SPLICE */ 345 { 346 pool_put(&socket_pool, so); 347 } 348 } 349 350 static inline uint64_t 351 solinger_nsec(struct socket *so) 352 { 353 if (so->so_linger == 0) 354 return INFSLP; 355 356 return SEC_TO_NSEC(so->so_linger); 357 } 358 359 /* 360 * Close a socket on last file table reference removal. 361 * Initiate disconnect if connected. 362 * Free socket when disconnect complete. 363 */ 364 int 365 soclose(struct socket *so, int flags) 366 { 367 struct socket *so2; 368 int error = 0; 369 370 solock(so); 371 /* Revoke async IO early. There is a final revocation in sofree(). */ 372 sigio_free(&so->so_sigio); 373 if (so->so_state & SS_ISCONNECTED) { 374 if (so->so_pcb == NULL) 375 goto discard; 376 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 377 error = sodisconnect(so); 378 if (error) 379 goto drop; 380 } 381 if (so->so_options & SO_LINGER) { 382 if ((so->so_state & SS_ISDISCONNECTING) && 383 (flags & MSG_DONTWAIT)) 384 goto drop; 385 while (so->so_state & SS_ISCONNECTED) { 386 error = sosleep_nsec(so, &so->so_timeo, 387 PSOCK | PCATCH, "netcls", 388 solinger_nsec(so)); 389 if (error) 390 break; 391 } 392 } 393 } 394 drop: 395 if (so->so_pcb) { 396 int error2; 397 error2 = pru_detach(so); 398 if (error == 0) 399 error = error2; 400 } 401 if (so->so_options & SO_ACCEPTCONN) { 402 int persocket = solock_persocket(so); 403 404 if (persocket) { 405 /* Wait concurrent sonewconn() threads. */ 406 while (so->so_newconn > 0) { 407 so->so_state |= SS_NEWCONN_WAIT; 408 sosleep_nsec(so, &so->so_newconn, PSOCK, 409 "newcon", INFSLP); 410 } 411 } 412 413 while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { 414 if (persocket) 415 solock(so2); 416 (void) soqremque(so2, 0); 417 if (persocket) 418 sounlock(so); 419 soabort(so2); 420 if (persocket) 421 solock(so); 422 } 423 while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { 424 if (persocket) 425 solock(so2); 426 (void) soqremque(so2, 1); 427 if (persocket) 428 sounlock(so); 429 soabort(so2); 430 if (persocket) 431 solock(so); 432 } 433 } 434 discard: 435 if (so->so_state & SS_NOFDREF) 436 panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type); 437 so->so_state |= SS_NOFDREF; 438 /* sofree() calls sounlock(). 
*/ 439 sofree(so, 0); 440 return (error); 441 } 442 443 void 444 soabort(struct socket *so) 445 { 446 soassertlocked(so); 447 pru_abort(so); 448 } 449 450 int 451 soaccept(struct socket *so, struct mbuf *nam) 452 { 453 int error = 0; 454 455 soassertlocked(so); 456 457 if ((so->so_state & SS_NOFDREF) == 0) 458 panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type); 459 so->so_state &= ~SS_NOFDREF; 460 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 461 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 462 error = pru_accept(so, nam); 463 else 464 error = ECONNABORTED; 465 return (error); 466 } 467 468 int 469 soconnect(struct socket *so, struct mbuf *nam) 470 { 471 int error; 472 473 soassertlocked(so); 474 475 if (so->so_options & SO_ACCEPTCONN) 476 return (EOPNOTSUPP); 477 /* 478 * If protocol is connection-based, can only connect once. 479 * Otherwise, if connected, try to disconnect first. 480 * This allows user to disconnect by connecting to, e.g., 481 * a null address. 482 */ 483 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 484 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 485 (error = sodisconnect(so)))) 486 error = EISCONN; 487 else 488 error = pru_connect(so, nam); 489 return (error); 490 } 491 492 int 493 soconnect2(struct socket *so1, struct socket *so2) 494 { 495 int persocket, error; 496 497 if ((persocket = solock_persocket(so1))) 498 solock_pair(so1, so2); 499 else 500 solock(so1); 501 502 error = pru_connect2(so1, so2); 503 504 if (persocket) 505 sounlock(so2); 506 sounlock(so1); 507 return (error); 508 } 509 510 int 511 sodisconnect(struct socket *so) 512 { 513 int error; 514 515 soassertlocked(so); 516 517 if ((so->so_state & SS_ISCONNECTED) == 0) 518 return (ENOTCONN); 519 if (so->so_state & SS_ISDISCONNECTING) 520 return (EALREADY); 521 error = pru_disconnect(so); 522 return (error); 523 } 524 525 int m_getuio(struct mbuf **, int, long, struct uio *); 526 527 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 528 /* 529 * Send on a socket. 530 * If send must go all at once and message is larger than 531 * send buffering, then hard error. 532 * Lock against other senders. 533 * If must go all at once and not enough room now, then 534 * inform user that this would block and do nothing. 535 * Otherwise, if nonblocking, send as much as possible. 536 * The data to be sent is described by "uio" if nonzero, 537 * otherwise by the mbuf chain "top" (which must be null 538 * if uio is not). Data provided in mbuf chain must be small 539 * enough to send all at once. 540 * 541 * Returns nonzero on error, timeout or signal; callers 542 * must check for short counts if EINTR/ERESTART are returned. 543 * Data and control buffers are freed on return. 544 */ 545 int 546 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 547 struct mbuf *control, int flags) 548 { 549 long space, clen = 0; 550 size_t resid; 551 int error; 552 int atomic = sosendallatonce(so) || top; 553 554 if (uio) 555 resid = uio->uio_resid; 556 else 557 resid = top->m_pkthdr.len; 558 /* MSG_EOR on a SOCK_STREAM socket is invalid. */ 559 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { 560 m_freem(top); 561 m_freem(control); 562 return (EINVAL); 563 } 564 if (uio && uio->uio_procp) 565 uio->uio_procp->p_ru.ru_msgsnd++; 566 if (control) { 567 /* 568 * In theory clen should be unsigned (since control->m_len is). 569 * However, space must be signed, as it might be less than 0 570 * if we over-committed, and we must use a signed comparison 571 * of space and clen. 
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_snd.sb_state |= SS_ISSENDING;
	do {
		if (so->so_snd.sb_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_snd.sb_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock_shared(so);
				error = m_getuio(&top, atomic, space, uio);
				solock_shared(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_snd.sb_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_snd.sb_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock_shared(so);
	m_freem(top);
	m_freem(control);
	return (error);
}

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network for the entire time here, we
 * release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
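 *
 * MSG_OOB is handled up front, before the receive buffer is examined:
 * the out-of-band byte is fetched with pru_rcvoob() and copied out,
 * bypassing all of the record processing below.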
792 */ 793 int 794 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 795 struct mbuf **mp0, struct mbuf **controlp, int *flagsp, 796 socklen_t controllen) 797 { 798 struct mbuf *m, **mp; 799 struct mbuf *cm; 800 u_long len, offset, moff; 801 int flags, error, type, uio_error = 0; 802 const struct protosw *pr = so->so_proto; 803 struct mbuf *nextrecord; 804 size_t resid, orig_resid = uio->uio_resid; 805 806 mp = mp0; 807 if (paddr) 808 *paddr = NULL; 809 if (controlp) 810 *controlp = NULL; 811 if (flagsp) 812 flags = *flagsp &~ MSG_EOR; 813 else 814 flags = 0; 815 if (flags & MSG_OOB) { 816 m = m_get(M_WAIT, MT_DATA); 817 solock(so); 818 error = pru_rcvoob(so, m, flags & MSG_PEEK); 819 sounlock(so); 820 if (error) 821 goto bad; 822 do { 823 error = uiomove(mtod(m, caddr_t), 824 ulmin(uio->uio_resid, m->m_len), uio); 825 m = m_free(m); 826 } while (uio->uio_resid && error == 0 && m); 827 bad: 828 m_freem(m); 829 return (error); 830 } 831 if (mp) 832 *mp = NULL; 833 834 solock_shared(so); 835 restart: 836 if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { 837 sounlock_shared(so); 838 return (error); 839 } 840 pru_lock(so); 841 842 m = so->so_rcv.sb_mb; 843 #ifdef SOCKET_SPLICE 844 if (isspliced(so)) 845 m = NULL; 846 #endif /* SOCKET_SPLICE */ 847 /* 848 * If we have less data than requested, block awaiting more 849 * (subject to any timeout) if: 850 * 1. the current count is less than the low water mark, 851 * 2. MSG_WAITALL is set, and it is possible to do the entire 852 * receive operation at once if we block (resid <= hiwat), or 853 * 3. MSG_DONTWAIT is not set. 854 * If MSG_WAITALL is set but resid is larger than the receive buffer, 855 * we have to do the receive in sections, and thus risk returning 856 * a short count if a timeout or signal occurs after we start. 857 */ 858 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 859 so->so_rcv.sb_cc < uio->uio_resid) && 860 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 861 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 862 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 863 #ifdef DIAGNOSTIC 864 if (m == NULL && so->so_rcv.sb_cc) 865 #ifdef SOCKET_SPLICE 866 if (!isspliced(so)) 867 #endif /* SOCKET_SPLICE */ 868 panic("receive 1: so %p, so_type %d, sb_cc %lu", 869 so, so->so_type, so->so_rcv.sb_cc); 870 #endif 871 if (so->so_error) { 872 if (m) 873 goto dontblock; 874 error = so->so_error; 875 if ((flags & MSG_PEEK) == 0) 876 so->so_error = 0; 877 goto release; 878 } 879 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 880 if (m) 881 goto dontblock; 882 else if (so->so_rcv.sb_cc == 0) 883 goto release; 884 } 885 for (; m; m = m->m_next) 886 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 887 m = so->so_rcv.sb_mb; 888 goto dontblock; 889 } 890 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 891 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 892 error = ENOTCONN; 893 goto release; 894 } 895 if (uio->uio_resid == 0 && controlp == NULL) 896 goto release; 897 if (flags & MSG_DONTWAIT) { 898 error = EWOULDBLOCK; 899 goto release; 900 } 901 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 902 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 903 sbunlock(so, &so->so_rcv); 904 pru_unlock(so); 905 error = sbwait(so, &so->so_rcv); 906 if (error) { 907 sounlock_shared(so); 908 return (error); 909 } 910 goto restart; 911 } 912 dontblock: 913 /* 914 * On entry here, m points to the first record of the socket buffer. 
915 * From this point onward, we maintain 'nextrecord' as a cache of the 916 * pointer to the next record in the socket buffer. We must keep the 917 * various socket buffer pointers and local stack versions of the 918 * pointers in sync, pushing out modifications before operations that 919 * may sleep, and re-reading them afterwards. 920 * 921 * Otherwise, we will race with the network stack appending new data 922 * or records onto the socket buffer by using inconsistent/stale 923 * versions of the field, possibly resulting in socket buffer 924 * corruption. 925 */ 926 if (uio->uio_procp) 927 uio->uio_procp->p_ru.ru_msgrcv++; 928 KASSERT(m == so->so_rcv.sb_mb); 929 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 930 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 931 nextrecord = m->m_nextpkt; 932 if (pr->pr_flags & PR_ADDR) { 933 #ifdef DIAGNOSTIC 934 if (m->m_type != MT_SONAME) 935 panic("receive 1a: so %p, so_type %d, m %p, m_type %d", 936 so, so->so_type, m, m->m_type); 937 #endif 938 orig_resid = 0; 939 if (flags & MSG_PEEK) { 940 if (paddr) 941 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT); 942 m = m->m_next; 943 } else { 944 sbfree(so, &so->so_rcv, m); 945 if (paddr) { 946 *paddr = m; 947 so->so_rcv.sb_mb = m->m_next; 948 m->m_next = NULL; 949 m = so->so_rcv.sb_mb; 950 } else { 951 so->so_rcv.sb_mb = m_free(m); 952 m = so->so_rcv.sb_mb; 953 } 954 sbsync(&so->so_rcv, nextrecord); 955 } 956 } 957 while (m && m->m_type == MT_CONTROL && error == 0) { 958 int skip = 0; 959 if (flags & MSG_PEEK) { 960 if (mtod(m, struct cmsghdr *)->cmsg_type == 961 SCM_RIGHTS) { 962 /* don't leak internalized SCM_RIGHTS msgs */ 963 skip = 1; 964 } else if (controlp) 965 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); 966 m = m->m_next; 967 } else { 968 sbfree(so, &so->so_rcv, m); 969 so->so_rcv.sb_mb = m->m_next; 970 m->m_nextpkt = m->m_next = NULL; 971 cm = m; 972 m = so->so_rcv.sb_mb; 973 sbsync(&so->so_rcv, nextrecord); 974 if (controlp) { 975 if (pr->pr_domain->dom_externalize) { 976 pru_unlock(so); 977 sounlock_shared(so); 978 error = 979 (*pr->pr_domain->dom_externalize) 980 (cm, controllen, flags); 981 solock_shared(so); 982 pru_lock(so); 983 } 984 *controlp = cm; 985 } else { 986 /* 987 * Dispose of any SCM_RIGHTS message that went 988 * through the read path rather than recv. 989 */ 990 if (pr->pr_domain->dom_dispose) 991 pr->pr_domain->dom_dispose(cm); 992 m_free(cm); 993 } 994 } 995 if (m != NULL) 996 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 997 else 998 nextrecord = so->so_rcv.sb_mb; 999 if (controlp && !skip) 1000 controlp = &(*controlp)->m_next; 1001 orig_resid = 0; 1002 } 1003 1004 /* If m is non-NULL, we have some data to read. */ 1005 if (m) { 1006 type = m->m_type; 1007 if (type == MT_OOBDATA) 1008 flags |= MSG_OOB; 1009 if (m->m_flags & M_BCAST) 1010 flags |= MSG_BCAST; 1011 if (m->m_flags & M_MCAST) 1012 flags |= MSG_MCAST; 1013 } 1014 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 1015 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 1016 1017 moff = 0; 1018 offset = 0; 1019 while (m && uio->uio_resid > 0 && error == 0) { 1020 if (m->m_type == MT_OOBDATA) { 1021 if (type != MT_OOBDATA) 1022 break; 1023 } else if (type == MT_OOBDATA) { 1024 break; 1025 } else if (m->m_type == MT_CONTROL) { 1026 /* 1027 * If there is more than one control message in the 1028 * stream, we do a short read. Next can be received 1029 * or disposed by another system call. 
1030 */ 1031 break; 1032 #ifdef DIAGNOSTIC 1033 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) { 1034 panic("receive 3: so %p, so_type %d, m %p, m_type %d", 1035 so, so->so_type, m, m->m_type); 1036 #endif 1037 } 1038 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1039 len = uio->uio_resid; 1040 if (so->so_oobmark && len > so->so_oobmark - offset) 1041 len = so->so_oobmark - offset; 1042 if (len > m->m_len - moff) 1043 len = m->m_len - moff; 1044 /* 1045 * If mp is set, just pass back the mbufs. 1046 * Otherwise copy them out via the uio, then free. 1047 * Sockbuf must be consistent here (points to current mbuf, 1048 * it points to next record) when we drop priority; 1049 * we must note any additions to the sockbuf when we 1050 * block interrupts again. 1051 */ 1052 if (mp == NULL && uio_error == 0) { 1053 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 1054 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 1055 resid = uio->uio_resid; 1056 pru_unlock(so); 1057 sounlock_shared(so); 1058 uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); 1059 solock_shared(so); 1060 pru_lock(so); 1061 if (uio_error) 1062 uio->uio_resid = resid - len; 1063 } else 1064 uio->uio_resid -= len; 1065 if (len == m->m_len - moff) { 1066 if (m->m_flags & M_EOR) 1067 flags |= MSG_EOR; 1068 if (flags & MSG_PEEK) { 1069 m = m->m_next; 1070 moff = 0; 1071 orig_resid = 0; 1072 } else { 1073 nextrecord = m->m_nextpkt; 1074 sbfree(so, &so->so_rcv, m); 1075 if (mp) { 1076 *mp = m; 1077 mp = &m->m_next; 1078 so->so_rcv.sb_mb = m = m->m_next; 1079 *mp = NULL; 1080 } else { 1081 so->so_rcv.sb_mb = m_free(m); 1082 m = so->so_rcv.sb_mb; 1083 } 1084 /* 1085 * If m != NULL, we also know that 1086 * so->so_rcv.sb_mb != NULL. 1087 */ 1088 KASSERT(so->so_rcv.sb_mb == m); 1089 if (m) { 1090 m->m_nextpkt = nextrecord; 1091 if (nextrecord == NULL) 1092 so->so_rcv.sb_lastrecord = m; 1093 } else { 1094 so->so_rcv.sb_mb = nextrecord; 1095 SB_EMPTY_FIXUP(&so->so_rcv); 1096 } 1097 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 1098 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 1099 } 1100 } else { 1101 if (flags & MSG_PEEK) { 1102 moff += len; 1103 orig_resid = 0; 1104 } else { 1105 if (mp) 1106 *mp = m_copym(m, 0, len, M_WAIT); 1107 m->m_data += len; 1108 m->m_len -= len; 1109 so->so_rcv.sb_cc -= len; 1110 so->so_rcv.sb_datacc -= len; 1111 } 1112 } 1113 if (so->so_oobmark) { 1114 if ((flags & MSG_PEEK) == 0) { 1115 so->so_oobmark -= len; 1116 if (so->so_oobmark == 0) { 1117 so->so_rcv.sb_state |= SS_RCVATMARK; 1118 break; 1119 } 1120 } else { 1121 offset += len; 1122 if (offset == so->so_oobmark) 1123 break; 1124 } 1125 } 1126 if (flags & MSG_EOR) 1127 break; 1128 /* 1129 * If the MSG_WAITALL flag is set (for non-atomic socket), 1130 * we must not quit until "uio->uio_resid == 0" or an error 1131 * termination. If a signal/timeout occurs, return 1132 * with a short count but without error. 1133 * Keep sockbuf locked against other readers. 
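		 *
		 * pru_unlock() is called before sleeping in sbwait() below
		 * and pru_lock() is retaken afterwards, so both `m' and
		 * `nextrecord' are reloaded from the socket buffer on wakeup.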
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
			    so->so_error)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			pru_unlock(so);
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock_shared(so);
				return (0);
			}
			pru_lock(so);
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		pru_unlock(so);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	pru_unlock(so);
	sounlock_shared(so);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = pru_shutdown(so);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so);

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	error = sblock(so, sb, SBL_WAIT | SBL_NOINTR);
	/* with SBL_WAIT and SBL_NOINTR sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct sosplice *sp;
	struct taskq *tq;
	int error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
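				/* taskq_create() failed, give up. */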
rw_exit_write(&sosplice_lock); 1272 return (ENOMEM); 1273 } 1274 /* Ensure the taskq is fully visible to other CPUs. */ 1275 membar_producer(); 1276 sosplice_taskq = tq; 1277 } 1278 rw_exit_write(&sosplice_lock); 1279 } else { 1280 /* Ensure the taskq is fully visible on this CPU. */ 1281 membar_consumer(); 1282 } 1283 1284 if ((so->so_proto->pr_flags & PR_SPLICE) == 0) 1285 return (EPROTONOSUPPORT); 1286 if (so->so_options & SO_ACCEPTCONN) 1287 return (EOPNOTSUPP); 1288 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1289 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1290 return (ENOTCONN); 1291 if (so->so_sp == NULL) { 1292 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1293 if (so->so_sp == NULL) 1294 so->so_sp = sp; 1295 else 1296 pool_put(&sosplice_pool, sp); 1297 } 1298 1299 /* If no fd is given, unsplice by removing existing link. */ 1300 if (fd < 0) { 1301 /* Lock receive buffer. */ 1302 if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { 1303 return (error); 1304 } 1305 if (so->so_sp->ssp_socket) 1306 sounsplice(so, so->so_sp->ssp_socket, 0); 1307 sbunlock(so, &so->so_rcv); 1308 return (0); 1309 } 1310 1311 if (max && max < 0) 1312 return (EINVAL); 1313 1314 if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) 1315 return (EINVAL); 1316 1317 /* Find sosp, the drain socket where data will be spliced into. */ 1318 if ((error = getsock(curproc, fd, &fp)) != 0) 1319 return (error); 1320 sosp = fp->f_data; 1321 if (sosp->so_proto->pr_usrreqs->pru_send != 1322 so->so_proto->pr_usrreqs->pru_send) { 1323 error = EPROTONOSUPPORT; 1324 goto frele; 1325 } 1326 if (sosp->so_sp == NULL) { 1327 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1328 if (sosp->so_sp == NULL) 1329 sosp->so_sp = sp; 1330 else 1331 pool_put(&sosplice_pool, sp); 1332 } 1333 1334 /* Lock both receive and send buffer. */ 1335 if ((error = sblock(so, &so->so_rcv, SBL_WAIT)) != 0) { 1336 goto frele; 1337 } 1338 if ((error = sblock(so, &sosp->so_snd, SBL_WAIT)) != 0) { 1339 sbunlock(so, &so->so_rcv); 1340 goto frele; 1341 } 1342 1343 if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { 1344 error = EBUSY; 1345 goto release; 1346 } 1347 if (sosp->so_options & SO_ACCEPTCONN) { 1348 error = EOPNOTSUPP; 1349 goto release; 1350 } 1351 if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) { 1352 error = ENOTCONN; 1353 goto release; 1354 } 1355 1356 /* Splice so and sosp together. */ 1357 so->so_sp->ssp_socket = sosp; 1358 sosp->so_sp->ssp_soback = so; 1359 so->so_splicelen = 0; 1360 so->so_splicemax = max; 1361 if (tv) 1362 so->so_idletv = *tv; 1363 else 1364 timerclear(&so->so_idletv); 1365 timeout_set_proc(&so->so_idleto, soidle, so); 1366 task_set(&so->so_splicetask, sotask, so); 1367 1368 /* 1369 * To prevent softnet interrupt from calling somove() while 1370 * we sleep, the socket buffers are not marked as spliced yet. 1371 */ 1372 if (somove(so, M_WAIT)) { 1373 so->so_rcv.sb_flags |= SB_SPLICE; 1374 sosp->so_snd.sb_flags |= SB_SPLICE; 1375 } 1376 1377 release: 1378 sbunlock(sosp, &sosp->so_snd); 1379 sbunlock(so, &so->so_rcv); 1380 frele: 1381 /* 1382 * FRELE() must not be called with the socket lock held. It is safe to 1383 * release the lock here as long as no other operation happen on the 1384 * socket when sosplice() returns. The dance could be avoided by 1385 * grabbing the socket lock inside this function. 
	 */
	sounlock(so);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * until all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from the receive buffer of the spliced source socket to
 * the send buffer of the drain socket.  Try to move as much as possible
 * in one big chunk.  It is a TCP only implementation.
 * A return value of 0 means splicing has finished, 1 means continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int rcvstate;

	soassertlocked(so);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now.
*/ 1505 len = so->so_rcv.sb_datacc; 1506 if (so->so_splicemax) { 1507 KASSERT(so->so_splicelen < so->so_splicemax); 1508 if (so->so_splicemax <= so->so_splicelen + len) { 1509 len = so->so_splicemax - so->so_splicelen; 1510 maxreached = 1; 1511 } 1512 } 1513 space = sbspace(sosp, &sosp->so_snd); 1514 if (so->so_oobmark && so->so_oobmark < len && 1515 so->so_oobmark < space + 1024) 1516 space += 1024; 1517 if (space <= 0) { 1518 maxreached = 0; 1519 goto release; 1520 } 1521 if (space < len) { 1522 maxreached = 0; 1523 if (space < sosp->so_snd.sb_lowat) 1524 goto release; 1525 len = space; 1526 } 1527 sosp->so_snd.sb_state |= SS_ISSENDING; 1528 1529 SBLASTRECORDCHK(&so->so_rcv, "somove 1"); 1530 SBLASTMBUFCHK(&so->so_rcv, "somove 1"); 1531 m = so->so_rcv.sb_mb; 1532 if (m == NULL) 1533 goto release; 1534 nextrecord = m->m_nextpkt; 1535 1536 /* Drop address and control information not used with splicing. */ 1537 if (so->so_proto->pr_flags & PR_ADDR) { 1538 #ifdef DIAGNOSTIC 1539 if (m->m_type != MT_SONAME) 1540 panic("somove soname: so %p, so_type %d, m %p, " 1541 "m_type %d", so, so->so_type, m, m->m_type); 1542 #endif 1543 m = m->m_next; 1544 } 1545 while (m && m->m_type == MT_CONTROL) 1546 m = m->m_next; 1547 if (m == NULL) { 1548 sbdroprecord(so, &so->so_rcv); 1549 if (so->so_proto->pr_flags & PR_WANTRCVD) 1550 pru_rcvd(so); 1551 goto nextpkt; 1552 } 1553 1554 /* 1555 * By splicing sockets connected to localhost, userland might create a 1556 * loop. Dissolve splicing with error if loop is detected by counter. 1557 * 1558 * If we deal with looped broadcast/multicast packet we bail out with 1559 * no error to suppress splice termination. 1560 */ 1561 if ((m->m_flags & M_PKTHDR) && 1562 ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) || 1563 ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) { 1564 error = ELOOP; 1565 goto release; 1566 } 1567 1568 if (so->so_proto->pr_flags & PR_ATOMIC) { 1569 if ((m->m_flags & M_PKTHDR) == 0) 1570 panic("somove !PKTHDR: so %p, so_type %d, m %p, " 1571 "m_type %d", so, so->so_type, m, m->m_type); 1572 if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) { 1573 error = EMSGSIZE; 1574 goto release; 1575 } 1576 if (len < m->m_pkthdr.len) 1577 goto release; 1578 if (m->m_pkthdr.len < len) { 1579 maxreached = 0; 1580 len = m->m_pkthdr.len; 1581 } 1582 /* 1583 * Throw away the name mbuf after it has been assured 1584 * that the whole first record can be processed. 1585 */ 1586 m = so->so_rcv.sb_mb; 1587 sbfree(so, &so->so_rcv, m); 1588 so->so_rcv.sb_mb = m_free(m); 1589 sbsync(&so->so_rcv, nextrecord); 1590 } 1591 /* 1592 * Throw away the control mbufs after it has been assured 1593 * that the whole first record can be processed. 1594 */ 1595 m = so->so_rcv.sb_mb; 1596 while (m && m->m_type == MT_CONTROL) { 1597 sbfree(so, &so->so_rcv, m); 1598 so->so_rcv.sb_mb = m_free(m); 1599 m = so->so_rcv.sb_mb; 1600 sbsync(&so->so_rcv, nextrecord); 1601 } 1602 1603 SBLASTRECORDCHK(&so->so_rcv, "somove 2"); 1604 SBLASTMBUFCHK(&so->so_rcv, "somove 2"); 1605 1606 /* Take at most len mbufs out of receive buffer. 
*/ 1607 for (off = 0, mp = &m; off <= len && *mp; 1608 off += (*mp)->m_len, mp = &(*mp)->m_next) { 1609 u_long size = len - off; 1610 1611 #ifdef DIAGNOSTIC 1612 if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER) 1613 panic("somove type: so %p, so_type %d, m %p, " 1614 "m_type %d", so, so->so_type, *mp, (*mp)->m_type); 1615 #endif 1616 if ((*mp)->m_len > size) { 1617 /* 1618 * Move only a partial mbuf at maximum splice length or 1619 * if the drain buffer is too small for this large mbuf. 1620 */ 1621 if (!maxreached && so->so_snd.sb_datacc > 0) { 1622 len -= size; 1623 break; 1624 } 1625 *mp = m_copym(so->so_rcv.sb_mb, 0, size, wait); 1626 if (*mp == NULL) { 1627 len -= size; 1628 break; 1629 } 1630 so->so_rcv.sb_mb->m_data += size; 1631 so->so_rcv.sb_mb->m_len -= size; 1632 so->so_rcv.sb_cc -= size; 1633 so->so_rcv.sb_datacc -= size; 1634 } else { 1635 *mp = so->so_rcv.sb_mb; 1636 sbfree(so, &so->so_rcv, *mp); 1637 so->so_rcv.sb_mb = (*mp)->m_next; 1638 sbsync(&so->so_rcv, nextrecord); 1639 } 1640 } 1641 *mp = NULL; 1642 1643 SBLASTRECORDCHK(&so->so_rcv, "somove 3"); 1644 SBLASTMBUFCHK(&so->so_rcv, "somove 3"); 1645 SBCHECK(so, &so->so_rcv); 1646 if (m == NULL) 1647 goto release; 1648 m->m_nextpkt = NULL; 1649 if (m->m_flags & M_PKTHDR) { 1650 m_resethdr(m); 1651 m->m_pkthdr.len = len; 1652 } 1653 1654 /* Send window update to source peer as receive buffer has changed. */ 1655 if (so->so_proto->pr_flags & PR_WANTRCVD) 1656 pru_rcvd(so); 1657 1658 /* Receive buffer did shrink by len bytes, adjust oob. */ 1659 rcvstate = so->so_rcv.sb_state; 1660 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1661 oobmark = so->so_oobmark; 1662 so->so_oobmark = oobmark > len ? oobmark - len : 0; 1663 if (oobmark) { 1664 if (oobmark == len) 1665 so->so_rcv.sb_state |= SS_RCVATMARK; 1666 if (oobmark >= len) 1667 oobmark = 0; 1668 } 1669 1670 /* 1671 * Handle oob data. If any malloc fails, ignore error. 1672 * TCP urgent data is not very reliable anyway. 1673 */ 1674 while (((rcvstate & SS_RCVATMARK) || oobmark) && 1675 (so->so_options & SO_OOBINLINE)) { 1676 struct mbuf *o = NULL; 1677 1678 if (rcvstate & SS_RCVATMARK) { 1679 o = m_get(wait, MT_DATA); 1680 rcvstate &= ~SS_RCVATMARK; 1681 } else if (oobmark) { 1682 o = m_split(m, oobmark, wait); 1683 if (o) { 1684 error = pru_send(sosp, m, NULL, NULL); 1685 if (error) { 1686 if (sosp->so_snd.sb_state & 1687 SS_CANTSENDMORE) 1688 error = EPIPE; 1689 m_freem(o); 1690 goto release; 1691 } 1692 len -= oobmark; 1693 so->so_splicelen += oobmark; 1694 m = o; 1695 o = m_get(wait, MT_DATA); 1696 } 1697 oobmark = 0; 1698 } 1699 if (o) { 1700 o->m_len = 1; 1701 *mtod(o, caddr_t) = *mtod(m, caddr_t); 1702 error = pru_sendoob(sosp, o, NULL, NULL); 1703 if (error) { 1704 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1705 error = EPIPE; 1706 m_freem(m); 1707 goto release; 1708 } 1709 len -= 1; 1710 so->so_splicelen += 1; 1711 if (oobmark) { 1712 oobmark -= 1; 1713 if (oobmark == 0) 1714 rcvstate |= SS_RCVATMARK; 1715 } 1716 m_adj(m, 1); 1717 } 1718 } 1719 1720 /* Append all remaining data to drain socket. */ 1721 if (so->so_rcv.sb_cc == 0 || maxreached) 1722 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1723 error = pru_send(sosp, m, NULL, NULL); 1724 if (error) { 1725 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1726 error = EPIPE; 1727 goto release; 1728 } 1729 so->so_splicelen += len; 1730 1731 /* Move several packets if possible. 
*/ 1732 if (!maxreached && nextrecord) 1733 goto nextpkt; 1734 1735 release: 1736 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1737 if (!error && maxreached && so->so_splicemax == so->so_splicelen) 1738 error = EFBIG; 1739 if (error) 1740 so->so_error = error; 1741 if (((so->so_rcv.sb_state & SS_CANTRCVMORE) && 1742 so->so_rcv.sb_cc == 0) || 1743 (sosp->so_snd.sb_state & SS_CANTSENDMORE) || 1744 maxreached || error) { 1745 sounsplice(so, sosp, 0); 1746 return (0); 1747 } 1748 if (timerisset(&so->so_idletv)) 1749 timeout_add_tv(&so->so_idleto, &so->so_idletv); 1750 return (1); 1751 } 1752 1753 #endif /* SOCKET_SPLICE */ 1754 1755 void 1756 sorwakeup(struct socket *so) 1757 { 1758 soassertlocked(so); 1759 1760 #ifdef SOCKET_SPLICE 1761 if (so->so_rcv.sb_flags & SB_SPLICE) { 1762 /* 1763 * TCP has a sendbuffer that can handle multiple packets 1764 * at once. So queue the stream a bit to accumulate data. 1765 * The sosplice thread will call somove() later and send 1766 * the packets calling tcp_output() only once. 1767 * In the UDP case, send out the packets immediately. 1768 * Using a thread would make things slower. 1769 */ 1770 if (so->so_proto->pr_flags & PR_WANTRCVD) 1771 task_add(sosplice_taskq, &so->so_splicetask); 1772 else 1773 somove(so, M_DONTWAIT); 1774 } 1775 if (isspliced(so)) 1776 return; 1777 #endif 1778 sowakeup(so, &so->so_rcv); 1779 if (so->so_upcall) 1780 (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT); 1781 } 1782 1783 void 1784 sowwakeup(struct socket *so) 1785 { 1786 soassertlocked(so); 1787 1788 #ifdef SOCKET_SPLICE 1789 if (so->so_snd.sb_flags & SB_SPLICE) 1790 task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask); 1791 if (issplicedback(so)) 1792 return; 1793 #endif 1794 sowakeup(so, &so->so_snd); 1795 } 1796 1797 int 1798 sosetopt(struct socket *so, int level, int optname, struct mbuf *m) 1799 { 1800 int error = 0; 1801 1802 if (level != SOL_SOCKET) { 1803 if (so->so_proto->pr_ctloutput) { 1804 solock(so); 1805 error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, 1806 level, optname, m); 1807 sounlock(so); 1808 return (error); 1809 } 1810 error = ENOPROTOOPT; 1811 } else { 1812 switch (optname) { 1813 1814 case SO_LINGER: 1815 if (m == NULL || m->m_len != sizeof (struct linger) || 1816 mtod(m, struct linger *)->l_linger < 0 || 1817 mtod(m, struct linger *)->l_linger > SHRT_MAX) 1818 return (EINVAL); 1819 1820 solock(so); 1821 so->so_linger = mtod(m, struct linger *)->l_linger; 1822 if (*mtod(m, int *)) 1823 so->so_options |= optname; 1824 else 1825 so->so_options &= ~optname; 1826 sounlock(so); 1827 1828 break; 1829 case SO_BINDANY: 1830 if ((error = suser(curproc)) != 0) /* XXX */ 1831 return (error); 1832 /* FALLTHROUGH */ 1833 1834 case SO_DEBUG: 1835 case SO_KEEPALIVE: 1836 case SO_USELOOPBACK: 1837 case SO_BROADCAST: 1838 case SO_REUSEADDR: 1839 case SO_REUSEPORT: 1840 case SO_OOBINLINE: 1841 case SO_TIMESTAMP: 1842 case SO_ZEROIZE: 1843 if (m == NULL || m->m_len < sizeof (int)) 1844 return (EINVAL); 1845 1846 solock(so); 1847 if (*mtod(m, int *)) 1848 so->so_options |= optname; 1849 else 1850 so->so_options &= ~optname; 1851 sounlock(so); 1852 1853 break; 1854 case SO_DONTROUTE: 1855 if (m == NULL || m->m_len < sizeof (int)) 1856 return (EINVAL); 1857 if (*mtod(m, int *)) 1858 error = EOPNOTSUPP; 1859 break; 1860 1861 case SO_SNDBUF: 1862 case SO_RCVBUF: 1863 case SO_SNDLOWAT: 1864 case SO_RCVLOWAT: 1865 { 1866 struct sockbuf *sb = (optname == SO_SNDBUF || 1867 optname == SO_SNDLOWAT ? 
1868 &so->so_snd : &so->so_rcv); 1869 u_long cnt; 1870 1871 if (m == NULL || m->m_len < sizeof (int)) 1872 return (EINVAL); 1873 cnt = *mtod(m, int *); 1874 if ((long)cnt <= 0) 1875 cnt = 1; 1876 1877 solock(so); 1878 switch (optname) { 1879 case SO_SNDBUF: 1880 case SO_RCVBUF: 1881 if (sb->sb_state & 1882 (SS_CANTSENDMORE | SS_CANTRCVMORE)) { 1883 error = EINVAL; 1884 break; 1885 } 1886 if (sbcheckreserve(cnt, sb->sb_wat) || 1887 sbreserve(so, sb, cnt)) { 1888 error = ENOBUFS; 1889 break; 1890 } 1891 sb->sb_wat = cnt; 1892 break; 1893 case SO_SNDLOWAT: 1894 case SO_RCVLOWAT: 1895 sb->sb_lowat = (cnt > sb->sb_hiwat) ? 1896 sb->sb_hiwat : cnt; 1897 break; 1898 } 1899 sounlock(so); 1900 break; 1901 } 1902 1903 case SO_SNDTIMEO: 1904 case SO_RCVTIMEO: 1905 { 1906 struct sockbuf *sb = (optname == SO_SNDTIMEO ? 1907 &so->so_snd : &so->so_rcv); 1908 struct timeval tv; 1909 uint64_t nsecs; 1910 1911 if (m == NULL || m->m_len < sizeof (tv)) 1912 return (EINVAL); 1913 memcpy(&tv, mtod(m, struct timeval *), sizeof tv); 1914 if (!timerisvalid(&tv)) 1915 return (EINVAL); 1916 nsecs = TIMEVAL_TO_NSEC(&tv); 1917 if (nsecs == UINT64_MAX) 1918 return (EDOM); 1919 if (nsecs == 0) 1920 nsecs = INFSLP; 1921 1922 solock(so); 1923 sb->sb_timeo_nsecs = nsecs; 1924 sounlock(so); 1925 break; 1926 } 1927 1928 case SO_RTABLE: 1929 if (so->so_proto->pr_domain && 1930 so->so_proto->pr_domain->dom_protosw && 1931 so->so_proto->pr_ctloutput) { 1932 const struct domain *dom = 1933 so->so_proto->pr_domain; 1934 1935 level = dom->dom_protosw->pr_protocol; 1936 solock(so); 1937 error = (*so->so_proto->pr_ctloutput) 1938 (PRCO_SETOPT, so, level, optname, m); 1939 sounlock(so); 1940 } else 1941 error = ENOPROTOOPT; 1942 break; 1943 #ifdef SOCKET_SPLICE 1944 case SO_SPLICE: 1945 solock(so); 1946 if (m == NULL) { 1947 error = sosplice(so, -1, 0, NULL); 1948 } else if (m->m_len < sizeof(int)) { 1949 error = EINVAL; 1950 } else if (m->m_len < sizeof(struct splice)) { 1951 error = sosplice(so, *mtod(m, int *), 0, NULL); 1952 } else { 1953 error = sosplice(so, 1954 mtod(m, struct splice *)->sp_fd, 1955 mtod(m, struct splice *)->sp_max, 1956 &mtod(m, struct splice *)->sp_idle); 1957 } 1958 sounlock(so); 1959 break; 1960 #endif /* SOCKET_SPLICE */ 1961 1962 default: 1963 error = ENOPROTOOPT; 1964 break; 1965 } 1966 } 1967 1968 return (error); 1969 } 1970 1971 int 1972 sogetopt(struct socket *so, int level, int optname, struct mbuf *m) 1973 { 1974 int error = 0; 1975 1976 if (level != SOL_SOCKET) { 1977 if (so->so_proto->pr_ctloutput) { 1978 m->m_len = 0; 1979 1980 solock(so); 1981 error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so, 1982 level, optname, m); 1983 sounlock(so); 1984 return (error); 1985 } else 1986 return (ENOPROTOOPT); 1987 } else { 1988 m->m_len = sizeof (int); 1989 1990 switch (optname) { 1991 1992 case SO_LINGER: 1993 m->m_len = sizeof (struct linger); 1994 solock_shared(so); 1995 mtod(m, struct linger *)->l_onoff = 1996 so->so_options & SO_LINGER; 1997 mtod(m, struct linger *)->l_linger = so->so_linger; 1998 sounlock_shared(so); 1999 break; 2000 2001 case SO_BINDANY: 2002 case SO_USELOOPBACK: 2003 case SO_DEBUG: 2004 case SO_KEEPALIVE: 2005 case SO_REUSEADDR: 2006 case SO_REUSEPORT: 2007 case SO_BROADCAST: 2008 case SO_OOBINLINE: 2009 case SO_TIMESTAMP: 2010 case SO_ZEROIZE: 2011 *mtod(m, int *) = so->so_options & optname; 2012 break; 2013 2014 case SO_DONTROUTE: 2015 *mtod(m, int *) = 0; 2016 break; 2017 2018 case SO_TYPE: 2019 *mtod(m, int *) = so->so_type; 2020 break; 2021 2022 case SO_ERROR: 2023 
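			/* Return the pending socket error and clear it. */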
solock(so); 2024 *mtod(m, int *) = so->so_error; 2025 so->so_error = 0; 2026 sounlock(so); 2027 2028 break; 2029 2030 case SO_DOMAIN: 2031 *mtod(m, int *) = so->so_proto->pr_domain->dom_family; 2032 break; 2033 2034 case SO_PROTOCOL: 2035 *mtod(m, int *) = so->so_proto->pr_protocol; 2036 break; 2037 2038 case SO_SNDBUF: 2039 *mtod(m, int *) = so->so_snd.sb_hiwat; 2040 break; 2041 2042 case SO_RCVBUF: 2043 *mtod(m, int *) = so->so_rcv.sb_hiwat; 2044 break; 2045 2046 case SO_SNDLOWAT: 2047 *mtod(m, int *) = so->so_snd.sb_lowat; 2048 break; 2049 2050 case SO_RCVLOWAT: 2051 *mtod(m, int *) = so->so_rcv.sb_lowat; 2052 break; 2053 2054 case SO_SNDTIMEO: 2055 case SO_RCVTIMEO: 2056 { 2057 struct sockbuf *sb = (optname == SO_SNDTIMEO ? 2058 &so->so_snd : &so->so_rcv); 2059 struct timeval tv; 2060 uint64_t nsecs; 2061 2062 solock_shared(so); 2063 nsecs = sb->sb_timeo_nsecs; 2064 sounlock_shared(so); 2065 2066 m->m_len = sizeof(struct timeval); 2067 memset(&tv, 0, sizeof(tv)); 2068 if (nsecs != INFSLP) 2069 NSEC_TO_TIMEVAL(nsecs, &tv); 2070 memcpy(mtod(m, struct timeval *), &tv, sizeof tv); 2071 break; 2072 } 2073 2074 case SO_RTABLE: 2075 if (so->so_proto->pr_domain && 2076 so->so_proto->pr_domain->dom_protosw && 2077 so->so_proto->pr_ctloutput) { 2078 const struct domain *dom = 2079 so->so_proto->pr_domain; 2080 2081 level = dom->dom_protosw->pr_protocol; 2082 solock(so); 2083 error = (*so->so_proto->pr_ctloutput) 2084 (PRCO_GETOPT, so, level, optname, m); 2085 sounlock(so); 2086 if (error) 2087 return (error); 2088 break; 2089 } 2090 return (ENOPROTOOPT); 2091 2092 #ifdef SOCKET_SPLICE 2093 case SO_SPLICE: 2094 { 2095 off_t len; 2096 2097 m->m_len = sizeof(off_t); 2098 solock_shared(so); 2099 len = so->so_sp ? so->so_sp->ssp_len : 0; 2100 sounlock_shared(so); 2101 memcpy(mtod(m, off_t *), &len, sizeof(off_t)); 2102 break; 2103 } 2104 #endif /* SOCKET_SPLICE */ 2105 2106 case SO_PEERCRED: 2107 if (so->so_proto->pr_protocol == AF_UNIX) { 2108 struct unpcb *unp = sotounpcb(so); 2109 2110 solock(so); 2111 if (unp->unp_flags & UNP_FEIDS) { 2112 m->m_len = sizeof(unp->unp_connid); 2113 memcpy(mtod(m, caddr_t), 2114 &(unp->unp_connid), m->m_len); 2115 sounlock(so); 2116 break; 2117 } 2118 sounlock(so); 2119 2120 return (ENOTCONN); 2121 } 2122 return (EOPNOTSUPP); 2123 2124 default: 2125 return (ENOPROTOOPT); 2126 } 2127 return (0); 2128 } 2129 } 2130 2131 void 2132 sohasoutofband(struct socket *so) 2133 { 2134 pgsigio(&so->so_sigio, SIGURG, 0); 2135 knote_locked(&so->so_rcv.sb_klist, 0); 2136 } 2137 2138 int 2139 soo_kqfilter(struct file *fp, struct knote *kn) 2140 { 2141 struct socket *so = kn->kn_fp->f_data; 2142 struct sockbuf *sb; 2143 2144 solock(so); 2145 switch (kn->kn_filter) { 2146 case EVFILT_READ: 2147 if (so->so_options & SO_ACCEPTCONN) 2148 kn->kn_fop = &solisten_filtops; 2149 else 2150 kn->kn_fop = &soread_filtops; 2151 sb = &so->so_rcv; 2152 break; 2153 case EVFILT_WRITE: 2154 kn->kn_fop = &sowrite_filtops; 2155 sb = &so->so_snd; 2156 break; 2157 case EVFILT_EXCEPT: 2158 kn->kn_fop = &soexcept_filtops; 2159 sb = &so->so_rcv; 2160 break; 2161 default: 2162 sounlock(so); 2163 return (EINVAL); 2164 } 2165 2166 klist_insert_locked(&sb->sb_klist, kn); 2167 sounlock(so); 2168 2169 return (0); 2170 } 2171 2172 void 2173 filt_sordetach(struct knote *kn) 2174 { 2175 struct socket *so = kn->kn_fp->f_data; 2176 2177 klist_remove(&so->so_rcv.sb_klist, kn); 2178 } 2179 2180 int 2181 filt_soread(struct knote *kn, long hint) 2182 { 2183 struct socket *so = kn->kn_fp->f_data; 2184 int rv = 0; 2185 2186 
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_klist, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	soassertlocked(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int active;

	soassertlocked(so);

	kn->kn_data = so->so_qlen;
	active = (kn->kn_data != 0);

	if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		} else {
			active = soreadable(so);
		}
	}

	return (active);
}

int
filt_somodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	solock(so);
	rv = knote_modify(kev, kn);
	sounlock(so);

	return (rv);
}

int
filt_soprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	solock(so);
	rv = knote_process(kn, kev);
	sounlock(so);

	return (rv);
}

/*
 * klist operations that serialize access to socket knotes with the
 * socket lock.
 */
void
klist_soassertlk(void *arg)
{
	struct socket *so = arg;

	soassertlocked(so);
}

int
klist_solock(void *arg)
{
	struct socket *so = arg;

	solock(so);
	return (1);
}

void
klist_sounlock(void *arg, int ls)
{
	struct socket *so = arg;

	sounlock(so);
}

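/*
 * The helpers below are only built with the kernel debugger.  As an
 * illustrative note (assumption, not taken from this file): so_print()
 * is reached from ddb(4) with something like
 *
 *	ddb> show socket 0xffff800000123456
 *
 * where the address is a hypothetical struct socket pointer.
 */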
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %04x\n", sb->sb_flags);
	(*pr)("\tsb_state: %04x\n", sb->sb_state);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif