1 /* $OpenBSD: uipc_socket.c,v 1.297 2023/01/23 18:34:24 mvs Exp $ */ 2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 #include <sys/file.h> 39 #include <sys/filedesc.h> 40 #include <sys/malloc.h> 41 #include <sys/mbuf.h> 42 #include <sys/domain.h> 43 #include <sys/event.h> 44 #include <sys/protosw.h> 45 #include <sys/socket.h> 46 #include <sys/unpcb.h> 47 #include <sys/socketvar.h> 48 #include <sys/signalvar.h> 49 #include <sys/pool.h> 50 #include <sys/atomic.h> 51 #include <sys/rwlock.h> 52 #include <sys/time.h> 53 #include <sys/refcnt.h> 54 55 #ifdef DDB 56 #include <machine/db_machdep.h> 57 #endif 58 59 void sbsync(struct sockbuf *, struct mbuf *); 60 61 int sosplice(struct socket *, int, off_t, struct timeval *); 62 void sounsplice(struct socket *, struct socket *, int); 63 void soidle(void *); 64 void sotask(void *); 65 void soreaper(void *); 66 void soput(void *); 67 int somove(struct socket *, int); 68 void sorflush(struct socket *); 69 70 void filt_sordetach(struct knote *kn); 71 int filt_soread(struct knote *kn, long hint); 72 void filt_sowdetach(struct knote *kn); 73 int filt_sowrite(struct knote *kn, long hint); 74 int filt_soexcept(struct knote *kn, long hint); 75 int filt_solisten(struct knote *kn, long hint); 76 int filt_somodify(struct kevent *kev, struct knote *kn); 77 int filt_soprocess(struct knote *kn, struct kevent *kev); 78 79 const struct filterops solisten_filtops = { 80 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 81 .f_attach = NULL, 82 .f_detach = filt_sordetach, 83 .f_event = filt_solisten, 84 .f_modify = filt_somodify, 85 .f_process = filt_soprocess, 86 }; 87 88 const struct filterops soread_filtops = { 89 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 90 .f_attach = NULL, 91 .f_detach = filt_sordetach, 92 .f_event = filt_soread, 93 .f_modify = filt_somodify, 94 .f_process = filt_soprocess, 95 }; 96 97 const struct filterops sowrite_filtops = { 98 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 99 .f_attach = NULL, 100 .f_detach = filt_sowdetach, 101 .f_event = filt_sowrite, 102 .f_modify = filt_somodify, 103 .f_process = filt_soprocess, 104 }; 105 106 const struct filterops soexcept_filtops = { 107 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 108 .f_attach = NULL, 109 .f_detach = filt_sordetach, 110 .f_event = filt_soexcept, 111 .f_modify = filt_somodify, 112 .f_process = filt_soprocess, 113 }; 114 115 #ifndef SOMINCONN 116 #define SOMINCONN 80 117 #endif /* SOMINCONN */ 118 119 int somaxconn = SOMAXCONN; 120 int sominconn = SOMINCONN; 121 122 struct pool socket_pool; 123 #ifdef SOCKET_SPLICE 124 struct pool sosplice_pool; 125 struct taskq *sosplice_taskq; 126 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk"); 127 #endif 128 129 void 130 soinit(void) 131 { 132 pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0, 133 "sockpl", NULL); 134 #ifdef SOCKET_SPLICE 135 pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0, 136 "sosppl", NULL); 137 #endif 138 } 139 140 struct socket * 141 soalloc(int wait) 142 { 143 struct socket *so; 144 145 so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) | 146 PR_ZERO); 147 if (so == NULL) 148 return (NULL); 149 rw_init_flags(&so->so_lock, "solock", RWL_DUPOK); 150 refcnt_init(&so->so_refcnt); 151 152 return (so); 153 } 154 155 /* 156 * Socket operation routines. 
157 * These routines are called by the routines in 158 * sys_socket.c or from a system process, and 159 * implement the semantics of socket operations by 160 * switching out to the protocol specific routines. 161 */ 162 int 163 socreate(int dom, struct socket **aso, int type, int proto) 164 { 165 struct proc *p = curproc; /* XXX */ 166 const struct protosw *prp; 167 struct socket *so; 168 int error; 169 170 if (proto) 171 prp = pffindproto(dom, proto, type); 172 else 173 prp = pffindtype(dom, type); 174 if (prp == NULL || prp->pr_usrreqs == NULL) 175 return (EPROTONOSUPPORT); 176 if (prp->pr_type != type) 177 return (EPROTOTYPE); 178 so = soalloc(M_WAIT); 179 klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so); 180 klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so); 181 sigio_init(&so->so_sigio); 182 TAILQ_INIT(&so->so_q0); 183 TAILQ_INIT(&so->so_q); 184 so->so_type = type; 185 if (suser(p) == 0) 186 so->so_state = SS_PRIV; 187 so->so_ruid = p->p_ucred->cr_ruid; 188 so->so_euid = p->p_ucred->cr_uid; 189 so->so_rgid = p->p_ucred->cr_rgid; 190 so->so_egid = p->p_ucred->cr_gid; 191 so->so_cpid = p->p_p->ps_pid; 192 so->so_proto = prp; 193 so->so_snd.sb_timeo_nsecs = INFSLP; 194 so->so_rcv.sb_timeo_nsecs = INFSLP; 195 196 solock(so); 197 error = pru_attach(so, proto, M_WAIT); 198 if (error) { 199 so->so_state |= SS_NOFDREF; 200 /* sofree() calls sounlock(). */ 201 sofree(so, 0); 202 return (error); 203 } 204 sounlock(so); 205 *aso = so; 206 return (0); 207 } 208 209 int 210 sobind(struct socket *so, struct mbuf *nam, struct proc *p) 211 { 212 soassertlocked(so); 213 return pru_bind(so, nam, p); 214 } 215 216 int 217 solisten(struct socket *so, int backlog) 218 { 219 int error; 220 221 soassertlocked(so); 222 223 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 224 return (EINVAL); 225 #ifdef SOCKET_SPLICE 226 if (isspliced(so) || issplicedback(so)) 227 return (EOPNOTSUPP); 228 #endif /* SOCKET_SPLICE */ 229 error = pru_listen(so); 230 if (error) 231 return (error); 232 if (TAILQ_FIRST(&so->so_q) == NULL) 233 so->so_options |= SO_ACCEPTCONN; 234 if (backlog < 0 || backlog > somaxconn) 235 backlog = somaxconn; 236 if (backlog < sominconn) 237 backlog = sominconn; 238 so->so_qlimit = backlog; 239 return (0); 240 } 241 242 #define SOSP_FREEING_READ 1 243 #define SOSP_FREEING_WRITE 2 244 void 245 sofree(struct socket *so, int keep_lock) 246 { 247 int persocket = solock_persocket(so); 248 249 soassertlocked(so); 250 251 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { 252 if (!keep_lock) 253 sounlock(so); 254 return; 255 } 256 if (so->so_head) { 257 struct socket *head = so->so_head; 258 259 /* 260 * We must not decommission a socket that's on the accept(2) 261 * queue. If we do, then accept(2) may hang after select(2) 262 * indicated that the listening socket was ready. 263 */ 264 if (so->so_onq == &head->so_q) { 265 if (!keep_lock) 266 sounlock(so); 267 return; 268 } 269 270 if (persocket) { 271 /* 272 * Concurrent close of `head' could 273 * abort `so' due to re-lock. 
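 * To keep the head-before-socket lock order, take a reference on
 * both sockets, drop `so', then lock `head' first and re-lock `so'.
 * The so_onq re-check below catches a socket that was aborted or
 * moved to another queue while it was unlocked.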
274 */ 275 soref(so); 276 soref(head); 277 sounlock(so); 278 solock(head); 279 solock(so); 280 281 if (so->so_onq != &head->so_q0) { 282 sounlock(head); 283 sounlock(so); 284 sorele(head); 285 sorele(so); 286 return; 287 } 288 289 sorele(head); 290 sorele(so); 291 } 292 293 soqremque(so, 0); 294 295 if (persocket) 296 sounlock(head); 297 } 298 299 if (persocket) { 300 sounlock(so); 301 refcnt_finalize(&so->so_refcnt, "sofinal"); 302 solock(so); 303 } 304 305 sigio_free(&so->so_sigio); 306 klist_free(&so->so_rcv.sb_sel.si_note); 307 klist_free(&so->so_snd.sb_sel.si_note); 308 #ifdef SOCKET_SPLICE 309 if (so->so_sp) { 310 if (issplicedback(so)) { 311 int freeing = SOSP_FREEING_WRITE; 312 313 if (so->so_sp->ssp_soback == so) 314 freeing |= SOSP_FREEING_READ; 315 sounsplice(so->so_sp->ssp_soback, so, freeing); 316 } 317 if (isspliced(so)) { 318 int freeing = SOSP_FREEING_READ; 319 320 if (so == so->so_sp->ssp_socket) 321 freeing |= SOSP_FREEING_WRITE; 322 sounsplice(so, so->so_sp->ssp_socket, freeing); 323 } 324 } 325 #endif /* SOCKET_SPLICE */ 326 sbrelease(so, &so->so_snd); 327 sorflush(so); 328 if (!keep_lock) 329 sounlock(so); 330 #ifdef SOCKET_SPLICE 331 if (so->so_sp) { 332 /* Reuse splice idle, sounsplice() has been called before. */ 333 timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so); 334 timeout_add(&so->so_sp->ssp_idleto, 0); 335 } else 336 #endif /* SOCKET_SPLICE */ 337 { 338 pool_put(&socket_pool, so); 339 } 340 } 341 342 static inline uint64_t 343 solinger_nsec(struct socket *so) 344 { 345 if (so->so_linger == 0) 346 return INFSLP; 347 348 return SEC_TO_NSEC(so->so_linger); 349 } 350 351 /* 352 * Close a socket on last file table reference removal. 353 * Initiate disconnect if connected. 354 * Free socket when disconnect complete. 355 */ 356 int 357 soclose(struct socket *so, int flags) 358 { 359 struct socket *so2; 360 int error = 0; 361 362 solock(so); 363 /* Revoke async IO early. There is a final revocation in sofree(). */ 364 sigio_free(&so->so_sigio); 365 if (so->so_state & SS_ISCONNECTED) { 366 if (so->so_pcb == NULL) 367 goto discard; 368 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 369 error = sodisconnect(so); 370 if (error) 371 goto drop; 372 } 373 if (so->so_options & SO_LINGER) { 374 if ((so->so_state & SS_ISDISCONNECTING) && 375 (flags & MSG_DONTWAIT)) 376 goto drop; 377 while (so->so_state & SS_ISCONNECTED) { 378 error = sosleep_nsec(so, &so->so_timeo, 379 PSOCK | PCATCH, "netcls", 380 solinger_nsec(so)); 381 if (error) 382 break; 383 } 384 } 385 } 386 drop: 387 if (so->so_pcb) { 388 int error2; 389 error2 = pru_detach(so); 390 if (error == 0) 391 error = error2; 392 } 393 if (so->so_options & SO_ACCEPTCONN) { 394 int persocket = solock_persocket(so); 395 396 if (persocket) { 397 /* Wait concurrent sonewconn() threads. 
*/ 398 while (so->so_newconn > 0) { 399 so->so_state |= SS_NEWCONN_WAIT; 400 sosleep_nsec(so, &so->so_newconn, PSOCK, 401 "netlck", INFSLP); 402 } 403 } 404 405 while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { 406 if (persocket) 407 solock(so2); 408 (void) soqremque(so2, 0); 409 if (persocket) 410 sounlock(so); 411 soabort(so2); 412 if (persocket) 413 solock(so); 414 } 415 while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { 416 if (persocket) 417 solock(so2); 418 (void) soqremque(so2, 1); 419 if (persocket) 420 sounlock(so); 421 soabort(so2); 422 if (persocket) 423 solock(so); 424 } 425 } 426 discard: 427 if (so->so_state & SS_NOFDREF) 428 panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type); 429 so->so_state |= SS_NOFDREF; 430 /* sofree() calls sounlock(). */ 431 sofree(so, 0); 432 return (error); 433 } 434 435 void 436 soabort(struct socket *so) 437 { 438 soassertlocked(so); 439 pru_abort(so); 440 } 441 442 int 443 soaccept(struct socket *so, struct mbuf *nam) 444 { 445 int error = 0; 446 447 soassertlocked(so); 448 449 if ((so->so_state & SS_NOFDREF) == 0) 450 panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type); 451 so->so_state &= ~SS_NOFDREF; 452 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 453 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 454 error = pru_accept(so, nam); 455 else 456 error = ECONNABORTED; 457 return (error); 458 } 459 460 int 461 soconnect(struct socket *so, struct mbuf *nam) 462 { 463 int error; 464 465 soassertlocked(so); 466 467 if (so->so_options & SO_ACCEPTCONN) 468 return (EOPNOTSUPP); 469 /* 470 * If protocol is connection-based, can only connect once. 471 * Otherwise, if connected, try to disconnect first. 472 * This allows user to disconnect by connecting to, e.g., 473 * a null address. 474 */ 475 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 476 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 477 (error = sodisconnect(so)))) 478 error = EISCONN; 479 else 480 error = pru_connect(so, nam); 481 return (error); 482 } 483 484 int 485 soconnect2(struct socket *so1, struct socket *so2) 486 { 487 int persocket, error; 488 489 if ((persocket = solock_persocket(so1))) 490 solock_pair(so1, so2); 491 else 492 solock(so1); 493 494 error = pru_connect2(so1, so2); 495 496 if (persocket) 497 sounlock(so2); 498 sounlock(so1); 499 return (error); 500 } 501 502 int 503 sodisconnect(struct socket *so) 504 { 505 int error; 506 507 soassertlocked(so); 508 509 if ((so->so_state & SS_ISCONNECTED) == 0) 510 return (ENOTCONN); 511 if (so->so_state & SS_ISDISCONNECTING) 512 return (EALREADY); 513 error = pru_disconnect(so); 514 return (error); 515 } 516 517 int m_getuio(struct mbuf **, int, long, struct uio *); 518 519 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 520 /* 521 * Send on a socket. 522 * If send must go all at once and message is larger than 523 * send buffering, then hard error. 524 * Lock against other senders. 525 * If must go all at once and not enough room now, then 526 * inform user that this would block and do nothing. 527 * Otherwise, if nonblocking, send as much as possible. 528 * The data to be sent is described by "uio" if nonzero, 529 * otherwise by the mbuf chain "top" (which must be null 530 * if uio is not). Data provided in mbuf chain must be small 531 * enough to send all at once. 532 * 533 * Returns nonzero on error, timeout or signal; callers 534 * must check for short counts if EINTR/ERESTART are returned. 535 * Data and control buffers are freed on return. 
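 * A typical caller hands user data in through a uio (a sketch,
 * cf. sendit() in uipc_syscalls.c):
 *
 *	error = sosend(so, to, &auio, NULL, control, flags);
 *
 * while in-kernel senders pass a ready-made mbuf chain in "top".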
536 */ 537 int 538 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 539 struct mbuf *control, int flags) 540 { 541 long space, clen = 0; 542 size_t resid; 543 int error; 544 int atomic = sosendallatonce(so) || top; 545 546 if (uio) 547 resid = uio->uio_resid; 548 else 549 resid = top->m_pkthdr.len; 550 /* MSG_EOR on a SOCK_STREAM socket is invalid. */ 551 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { 552 m_freem(top); 553 m_freem(control); 554 return (EINVAL); 555 } 556 if (uio && uio->uio_procp) 557 uio->uio_procp->p_ru.ru_msgsnd++; 558 if (control) { 559 /* 560 * In theory clen should be unsigned (since control->m_len is). 561 * However, space must be signed, as it might be less than 0 562 * if we over-committed, and we must use a signed comparison 563 * of space and clen. 564 */ 565 clen = control->m_len; 566 /* reserve extra space for AF_UNIX's internalize */ 567 if (so->so_proto->pr_domain->dom_family == AF_UNIX && 568 clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) && 569 mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) 570 clen = CMSG_SPACE( 571 (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) * 572 (sizeof(struct fdpass) / sizeof(int))); 573 } 574 575 #define snderr(errno) { error = errno; goto release; } 576 577 solock(so); 578 restart: 579 if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) 580 goto out; 581 so->so_snd.sb_state |= SS_ISSENDING; 582 do { 583 if (so->so_snd.sb_state & SS_CANTSENDMORE) 584 snderr(EPIPE); 585 if (so->so_error) { 586 error = so->so_error; 587 so->so_error = 0; 588 snderr(error); 589 } 590 if ((so->so_state & SS_ISCONNECTED) == 0) { 591 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 592 if (!(resid == 0 && clen != 0)) 593 snderr(ENOTCONN); 594 } else if (addr == NULL) 595 snderr(EDESTADDRREQ); 596 } 597 space = sbspace(so, &so->so_snd); 598 if (flags & MSG_OOB) 599 space += 1024; 600 if (so->so_proto->pr_domain->dom_family == AF_UNIX) { 601 if (atomic && resid > so->so_snd.sb_hiwat) 602 snderr(EMSGSIZE); 603 } else { 604 if (clen > so->so_snd.sb_hiwat || 605 (atomic && resid > so->so_snd.sb_hiwat - clen)) 606 snderr(EMSGSIZE); 607 } 608 if (space < clen || 609 (space - clen < resid && 610 (atomic || space < so->so_snd.sb_lowat))) { 611 if (flags & MSG_DONTWAIT) 612 snderr(EWOULDBLOCK); 613 sbunlock(so, &so->so_snd); 614 error = sbwait(so, &so->so_snd); 615 so->so_snd.sb_state &= ~SS_ISSENDING; 616 if (error) 617 goto out; 618 goto restart; 619 } 620 space -= clen; 621 do { 622 if (uio == NULL) { 623 /* 624 * Data is prepackaged in "top". 
625 */ 626 resid = 0; 627 if (flags & MSG_EOR) 628 top->m_flags |= M_EOR; 629 } else { 630 sounlock(so); 631 error = m_getuio(&top, atomic, space, uio); 632 solock(so); 633 if (error) 634 goto release; 635 space -= top->m_pkthdr.len; 636 resid = uio->uio_resid; 637 if (flags & MSG_EOR) 638 top->m_flags |= M_EOR; 639 } 640 if (resid == 0) 641 so->so_snd.sb_state &= ~SS_ISSENDING; 642 if (top && so->so_options & SO_ZEROIZE) 643 top->m_flags |= M_ZEROIZE; 644 if (flags & MSG_OOB) 645 error = pru_sendoob(so, top, addr, control); 646 else 647 error = pru_send(so, top, addr, control); 648 clen = 0; 649 control = NULL; 650 top = NULL; 651 if (error) 652 goto release; 653 } while (resid && space > 0); 654 } while (resid); 655 656 release: 657 so->so_snd.sb_state &= ~SS_ISSENDING; 658 sbunlock(so, &so->so_snd); 659 out: 660 sounlock(so); 661 m_freem(top); 662 m_freem(control); 663 return (error); 664 } 665 666 int 667 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio) 668 { 669 struct mbuf *m, *top = NULL; 670 struct mbuf **nextp = ⊤ 671 u_long len, mlen; 672 size_t resid = uio->uio_resid; 673 int error; 674 675 do { 676 if (top == NULL) { 677 MGETHDR(m, M_WAIT, MT_DATA); 678 mlen = MHLEN; 679 m->m_pkthdr.len = 0; 680 m->m_pkthdr.ph_ifidx = 0; 681 } else { 682 MGET(m, M_WAIT, MT_DATA); 683 mlen = MLEN; 684 } 685 /* chain mbuf together */ 686 *nextp = m; 687 nextp = &m->m_next; 688 689 resid = ulmin(resid, space); 690 if (resid >= MINCLSIZE) { 691 MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES)); 692 if ((m->m_flags & M_EXT) == 0) 693 MCLGETL(m, M_NOWAIT, MCLBYTES); 694 if ((m->m_flags & M_EXT) == 0) 695 goto nopages; 696 mlen = m->m_ext.ext_size; 697 len = ulmin(mlen, resid); 698 /* 699 * For datagram protocols, leave room 700 * for protocol headers in first mbuf. 701 */ 702 if (atomic && m == top && len < mlen - max_hdr) 703 m->m_data += max_hdr; 704 } else { 705 nopages: 706 len = ulmin(mlen, resid); 707 /* 708 * For datagram protocols, leave room 709 * for protocol headers in first mbuf. 710 */ 711 if (atomic && m == top && len < mlen - max_hdr) 712 m_align(m, len); 713 } 714 715 error = uiomove(mtod(m, caddr_t), len, uio); 716 if (error) { 717 m_freem(top); 718 return (error); 719 } 720 721 /* adjust counters */ 722 resid = uio->uio_resid; 723 space -= len; 724 m->m_len = len; 725 top->m_pkthdr.len += len; 726 727 /* Is there more space and more data? */ 728 } while (space > 0 && resid > 0); 729 730 *mp = top; 731 return 0; 732 } 733 734 /* 735 * Following replacement or removal of the first mbuf on the first 736 * mbuf chain of a socket buffer, push necessary state changes back 737 * into the socket buffer so that other consumers see the values 738 * consistently. 'nextrecord' is the callers locally stored value of 739 * the original value of sb->sb_mb->m_nextpkt which must be restored 740 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. 741 */ 742 void 743 sbsync(struct sockbuf *sb, struct mbuf *nextrecord) 744 { 745 746 /* 747 * First, update for the new value of nextrecord. If necessary, 748 * make it the first record. 749 */ 750 if (sb->sb_mb != NULL) 751 sb->sb_mb->m_nextpkt = nextrecord; 752 else 753 sb->sb_mb = nextrecord; 754 755 /* 756 * Now update any dependent socket buffer fields to reflect 757 * the new state. This is an inline of SB_EMPTY_FIXUP, with 758 * the addition of a second clause that takes care of the 759 * case where sb_mb has been updated, but remains the last 760 * record. 
761 */ 762 if (sb->sb_mb == NULL) { 763 sb->sb_mbtail = NULL; 764 sb->sb_lastrecord = NULL; 765 } else if (sb->sb_mb->m_nextpkt == NULL) 766 sb->sb_lastrecord = sb->sb_mb; 767 } 768 769 /* 770 * Implement receive operations on a socket. 771 * We depend on the way that records are added to the sockbuf 772 * by sbappend*. In particular, each record (mbufs linked through m_next) 773 * must begin with an address if the protocol so specifies, 774 * followed by an optional mbuf or mbufs containing ancillary data, 775 * and then zero or more mbufs of data. 776 * In order to avoid blocking network for the entire time here, we release 777 * the solock() while doing the actual copy to user space. 778 * Although the sockbuf is locked, new data may still be appended, 779 * and thus we must maintain consistency of the sockbuf during that time. 780 * 781 * The caller may receive the data as a single mbuf chain by supplying 782 * an mbuf **mp0 for use in returning the chain. The uio is then used 783 * only for the count in uio_resid. 784 */ 785 int 786 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 787 struct mbuf **mp0, struct mbuf **controlp, int *flagsp, 788 socklen_t controllen) 789 { 790 struct mbuf *m, **mp; 791 struct mbuf *cm; 792 u_long len, offset, moff; 793 int flags, error, type, uio_error = 0; 794 const struct protosw *pr = so->so_proto; 795 struct mbuf *nextrecord; 796 size_t resid, orig_resid = uio->uio_resid; 797 798 mp = mp0; 799 if (paddr) 800 *paddr = NULL; 801 if (controlp) 802 *controlp = NULL; 803 if (flagsp) 804 flags = *flagsp &~ MSG_EOR; 805 else 806 flags = 0; 807 if (flags & MSG_OOB) { 808 m = m_get(M_WAIT, MT_DATA); 809 solock(so); 810 error = pru_rcvoob(so, m, flags & MSG_PEEK); 811 sounlock(so); 812 if (error) 813 goto bad; 814 do { 815 error = uiomove(mtod(m, caddr_t), 816 ulmin(uio->uio_resid, m->m_len), uio); 817 m = m_free(m); 818 } while (uio->uio_resid && error == 0 && m); 819 bad: 820 m_freem(m); 821 return (error); 822 } 823 if (mp) 824 *mp = NULL; 825 826 solock_shared(so); 827 restart: 828 if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { 829 sounlock_shared(so); 830 return (error); 831 } 832 833 m = so->so_rcv.sb_mb; 834 #ifdef SOCKET_SPLICE 835 if (isspliced(so)) 836 m = NULL; 837 #endif /* SOCKET_SPLICE */ 838 /* 839 * If we have less data than requested, block awaiting more 840 * (subject to any timeout) if: 841 * 1. the current count is less than the low water mark, 842 * 2. MSG_WAITALL is set, and it is possible to do the entire 843 * receive operation at once if we block (resid <= hiwat), or 844 * 3. MSG_DONTWAIT is not set. 845 * If MSG_WAITALL is set but resid is larger than the receive buffer, 846 * we have to do the receive in sections, and thus risk returning 847 * a short count if a timeout or signal occurs after we start. 
848 */ 849 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 850 so->so_rcv.sb_cc < uio->uio_resid) && 851 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 852 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 853 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 854 #ifdef DIAGNOSTIC 855 if (m == NULL && so->so_rcv.sb_cc) 856 #ifdef SOCKET_SPLICE 857 if (!isspliced(so)) 858 #endif /* SOCKET_SPLICE */ 859 panic("receive 1: so %p, so_type %d, sb_cc %lu", 860 so, so->so_type, so->so_rcv.sb_cc); 861 #endif 862 if (so->so_error) { 863 if (m) 864 goto dontblock; 865 error = so->so_error; 866 if ((flags & MSG_PEEK) == 0) 867 so->so_error = 0; 868 goto release; 869 } 870 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 871 if (m) 872 goto dontblock; 873 else if (so->so_rcv.sb_cc == 0) 874 goto release; 875 } 876 for (; m; m = m->m_next) 877 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 878 m = so->so_rcv.sb_mb; 879 goto dontblock; 880 } 881 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 882 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 883 error = ENOTCONN; 884 goto release; 885 } 886 if (uio->uio_resid == 0 && controlp == NULL) 887 goto release; 888 if (flags & MSG_DONTWAIT) { 889 error = EWOULDBLOCK; 890 goto release; 891 } 892 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 893 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 894 sbunlock(so, &so->so_rcv); 895 error = sbwait(so, &so->so_rcv); 896 if (error) { 897 sounlock_shared(so); 898 return (error); 899 } 900 goto restart; 901 } 902 dontblock: 903 /* 904 * On entry here, m points to the first record of the socket buffer. 905 * From this point onward, we maintain 'nextrecord' as a cache of the 906 * pointer to the next record in the socket buffer. We must keep the 907 * various socket buffer pointers and local stack versions of the 908 * pointers in sync, pushing out modifications before operations that 909 * may sleep, and re-reading them afterwards. 910 * 911 * Otherwise, we will race with the network stack appending new data 912 * or records onto the socket buffer by using inconsistent/stale 913 * versions of the field, possibly resulting in socket buffer 914 * corruption. 
915 */ 916 if (uio->uio_procp) 917 uio->uio_procp->p_ru.ru_msgrcv++; 918 KASSERT(m == so->so_rcv.sb_mb); 919 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 920 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 921 nextrecord = m->m_nextpkt; 922 if (pr->pr_flags & PR_ADDR) { 923 #ifdef DIAGNOSTIC 924 if (m->m_type != MT_SONAME) 925 panic("receive 1a: so %p, so_type %d, m %p, m_type %d", 926 so, so->so_type, m, m->m_type); 927 #endif 928 orig_resid = 0; 929 if (flags & MSG_PEEK) { 930 if (paddr) 931 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT); 932 m = m->m_next; 933 } else { 934 sbfree(so, &so->so_rcv, m); 935 if (paddr) { 936 *paddr = m; 937 so->so_rcv.sb_mb = m->m_next; 938 m->m_next = NULL; 939 m = so->so_rcv.sb_mb; 940 } else { 941 so->so_rcv.sb_mb = m_free(m); 942 m = so->so_rcv.sb_mb; 943 } 944 sbsync(&so->so_rcv, nextrecord); 945 } 946 } 947 while (m && m->m_type == MT_CONTROL && error == 0) { 948 int skip = 0; 949 if (flags & MSG_PEEK) { 950 if (mtod(m, struct cmsghdr *)->cmsg_type == 951 SCM_RIGHTS) { 952 /* don't leak internalized SCM_RIGHTS msgs */ 953 skip = 1; 954 } else if (controlp) 955 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); 956 m = m->m_next; 957 } else { 958 sbfree(so, &so->so_rcv, m); 959 so->so_rcv.sb_mb = m->m_next; 960 m->m_nextpkt = m->m_next = NULL; 961 cm = m; 962 m = so->so_rcv.sb_mb; 963 sbsync(&so->so_rcv, nextrecord); 964 if (controlp) { 965 if (pr->pr_domain->dom_externalize) { 966 sounlock_shared(so); 967 error = 968 (*pr->pr_domain->dom_externalize) 969 (cm, controllen, flags); 970 solock_shared(so); 971 } 972 *controlp = cm; 973 } else { 974 /* 975 * Dispose of any SCM_RIGHTS message that went 976 * through the read path rather than recv. 977 */ 978 if (pr->pr_domain->dom_dispose) 979 pr->pr_domain->dom_dispose(cm); 980 m_free(cm); 981 } 982 } 983 if (m != NULL) 984 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 985 else 986 nextrecord = so->so_rcv.sb_mb; 987 if (controlp && !skip) 988 controlp = &(*controlp)->m_next; 989 orig_resid = 0; 990 } 991 992 /* If m is non-NULL, we have some data to read. */ 993 if (m) { 994 type = m->m_type; 995 if (type == MT_OOBDATA) 996 flags |= MSG_OOB; 997 if (m->m_flags & M_BCAST) 998 flags |= MSG_BCAST; 999 if (m->m_flags & M_MCAST) 1000 flags |= MSG_MCAST; 1001 } 1002 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 1003 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 1004 1005 moff = 0; 1006 offset = 0; 1007 while (m && uio->uio_resid > 0 && error == 0) { 1008 if (m->m_type == MT_OOBDATA) { 1009 if (type != MT_OOBDATA) 1010 break; 1011 } else if (type == MT_OOBDATA) { 1012 break; 1013 } else if (m->m_type == MT_CONTROL) { 1014 /* 1015 * If there is more than one control message in the 1016 * stream, we do a short read. Next can be received 1017 * or disposed by another system call. 1018 */ 1019 break; 1020 #ifdef DIAGNOSTIC 1021 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) { 1022 panic("receive 3: so %p, so_type %d, m %p, m_type %d", 1023 so, so->so_type, m, m->m_type); 1024 #endif 1025 } 1026 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1027 len = uio->uio_resid; 1028 if (so->so_oobmark && len > so->so_oobmark - offset) 1029 len = so->so_oobmark - offset; 1030 if (len > m->m_len - moff) 1031 len = m->m_len - moff; 1032 /* 1033 * If mp is set, just pass back the mbufs. 1034 * Otherwise copy them out via the uio, then free. 1035 * Sockbuf must be consistent here (points to current mbuf, 1036 * it points to next record) when we drop priority; 1037 * we must note any additions to the sockbuf when we 1038 * block interrupts again. 
1039 */ 1040 if (mp == NULL && uio_error == 0) { 1041 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 1042 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 1043 resid = uio->uio_resid; 1044 sounlock_shared(so); 1045 uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); 1046 solock_shared(so); 1047 if (uio_error) 1048 uio->uio_resid = resid - len; 1049 } else 1050 uio->uio_resid -= len; 1051 if (len == m->m_len - moff) { 1052 if (m->m_flags & M_EOR) 1053 flags |= MSG_EOR; 1054 if (flags & MSG_PEEK) { 1055 m = m->m_next; 1056 moff = 0; 1057 orig_resid = 0; 1058 } else { 1059 nextrecord = m->m_nextpkt; 1060 sbfree(so, &so->so_rcv, m); 1061 if (mp) { 1062 *mp = m; 1063 mp = &m->m_next; 1064 so->so_rcv.sb_mb = m = m->m_next; 1065 *mp = NULL; 1066 } else { 1067 so->so_rcv.sb_mb = m_free(m); 1068 m = so->so_rcv.sb_mb; 1069 } 1070 /* 1071 * If m != NULL, we also know that 1072 * so->so_rcv.sb_mb != NULL. 1073 */ 1074 KASSERT(so->so_rcv.sb_mb == m); 1075 if (m) { 1076 m->m_nextpkt = nextrecord; 1077 if (nextrecord == NULL) 1078 so->so_rcv.sb_lastrecord = m; 1079 } else { 1080 so->so_rcv.sb_mb = nextrecord; 1081 SB_EMPTY_FIXUP(&so->so_rcv); 1082 } 1083 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 1084 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 1085 } 1086 } else { 1087 if (flags & MSG_PEEK) { 1088 moff += len; 1089 orig_resid = 0; 1090 } else { 1091 if (mp) 1092 *mp = m_copym(m, 0, len, M_WAIT); 1093 m->m_data += len; 1094 m->m_len -= len; 1095 so->so_rcv.sb_cc -= len; 1096 so->so_rcv.sb_datacc -= len; 1097 } 1098 } 1099 if (so->so_oobmark) { 1100 if ((flags & MSG_PEEK) == 0) { 1101 so->so_oobmark -= len; 1102 if (so->so_oobmark == 0) { 1103 so->so_rcv.sb_state |= SS_RCVATMARK; 1104 break; 1105 } 1106 } else { 1107 offset += len; 1108 if (offset == so->so_oobmark) 1109 break; 1110 } 1111 } 1112 if (flags & MSG_EOR) 1113 break; 1114 /* 1115 * If the MSG_WAITALL flag is set (for non-atomic socket), 1116 * we must not quit until "uio->uio_resid == 0" or an error 1117 * termination. If a signal/timeout occurs, return 1118 * with a short count but without error. 1119 * Keep sockbuf locked against other readers. 1120 */ 1121 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1122 !sosendallatonce(so) && !nextrecord) { 1123 if (so->so_rcv.sb_state & SS_CANTRCVMORE || 1124 so->so_error) 1125 break; 1126 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 1127 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 1128 error = sbwait(so, &so->so_rcv); 1129 if (error) { 1130 sbunlock(so, &so->so_rcv); 1131 sounlock_shared(so); 1132 return (0); 1133 } 1134 if ((m = so->so_rcv.sb_mb) != NULL) 1135 nextrecord = m->m_nextpkt; 1136 } 1137 } 1138 1139 if (m && pr->pr_flags & PR_ATOMIC) { 1140 flags |= MSG_TRUNC; 1141 if ((flags & MSG_PEEK) == 0) 1142 (void) sbdroprecord(so, &so->so_rcv); 1143 } 1144 if ((flags & MSG_PEEK) == 0) { 1145 if (m == NULL) { 1146 /* 1147 * First part is an inline SB_EMPTY_FIXUP(). Second 1148 * part makes sure sb_lastrecord is up-to-date if 1149 * there is still data in the socket buffer. 
1150 */ 1151 so->so_rcv.sb_mb = nextrecord; 1152 if (so->so_rcv.sb_mb == NULL) { 1153 so->so_rcv.sb_mbtail = NULL; 1154 so->so_rcv.sb_lastrecord = NULL; 1155 } else if (nextrecord->m_nextpkt == NULL) 1156 so->so_rcv.sb_lastrecord = nextrecord; 1157 } 1158 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 1159 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 1160 if (pr->pr_flags & PR_WANTRCVD) 1161 pru_rcvd(so); 1162 } 1163 if (orig_resid == uio->uio_resid && orig_resid && 1164 (flags & MSG_EOR) == 0 && 1165 (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) { 1166 sbunlock(so, &so->so_rcv); 1167 goto restart; 1168 } 1169 1170 if (uio_error) 1171 error = uio_error; 1172 1173 if (flagsp) 1174 *flagsp |= flags; 1175 release: 1176 sbunlock(so, &so->so_rcv); 1177 sounlock_shared(so); 1178 return (error); 1179 } 1180 1181 int 1182 soshutdown(struct socket *so, int how) 1183 { 1184 int error = 0; 1185 1186 solock(so); 1187 switch (how) { 1188 case SHUT_RD: 1189 sorflush(so); 1190 break; 1191 case SHUT_RDWR: 1192 sorflush(so); 1193 /* FALLTHROUGH */ 1194 case SHUT_WR: 1195 error = pru_shutdown(so); 1196 break; 1197 default: 1198 error = EINVAL; 1199 break; 1200 } 1201 sounlock(so); 1202 1203 return (error); 1204 } 1205 1206 void 1207 sorflush(struct socket *so) 1208 { 1209 struct sockbuf *sb = &so->so_rcv; 1210 struct mbuf *m; 1211 const struct protosw *pr = so->so_proto; 1212 int error; 1213 1214 sb->sb_flags |= SB_NOINTR; 1215 error = sblock(so, sb, M_WAITOK); 1216 /* with SB_NOINTR and M_WAITOK sblock() must not fail */ 1217 KASSERT(error == 0); 1218 socantrcvmore(so); 1219 m = sb->sb_mb; 1220 memset(&sb->sb_startzero, 0, 1221 (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); 1222 sb->sb_timeo_nsecs = INFSLP; 1223 sbunlock(so, sb); 1224 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 1225 (*pr->pr_domain->dom_dispose)(m); 1226 m_purge(m); 1227 } 1228 1229 #ifdef SOCKET_SPLICE 1230 1231 #define so_splicelen so_sp->ssp_len 1232 #define so_splicemax so_sp->ssp_max 1233 #define so_idletv so_sp->ssp_idletv 1234 #define so_idleto so_sp->ssp_idleto 1235 #define so_splicetask so_sp->ssp_task 1236 1237 int 1238 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) 1239 { 1240 struct file *fp; 1241 struct socket *sosp; 1242 struct sosplice *sp; 1243 struct taskq *tq; 1244 int error = 0; 1245 1246 soassertlocked(so); 1247 1248 if (sosplice_taskq == NULL) { 1249 rw_enter_write(&sosplice_lock); 1250 if (sosplice_taskq == NULL) { 1251 tq = taskq_create("sosplice", 1, IPL_SOFTNET, 1252 TASKQ_MPSAFE); 1253 /* Ensure the taskq is fully visible to other CPUs. */ 1254 membar_producer(); 1255 sosplice_taskq = tq; 1256 } 1257 rw_exit_write(&sosplice_lock); 1258 } 1259 if (sosplice_taskq == NULL) 1260 return (ENOMEM); 1261 1262 if ((so->so_proto->pr_flags & PR_SPLICE) == 0) 1263 return (EPROTONOSUPPORT); 1264 if (so->so_options & SO_ACCEPTCONN) 1265 return (EOPNOTSUPP); 1266 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1267 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1268 return (ENOTCONN); 1269 if (so->so_sp == NULL) { 1270 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1271 if (so->so_sp == NULL) 1272 so->so_sp = sp; 1273 else 1274 pool_put(&sosplice_pool, sp); 1275 } 1276 1277 /* If no fd is given, unsplice by removing existing link. */ 1278 if (fd < 0) { 1279 /* Lock receive buffer. 
*/ 1280 if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) { 1281 return (error); 1282 } 1283 if (so->so_sp->ssp_socket) 1284 sounsplice(so, so->so_sp->ssp_socket, 0); 1285 sbunlock(so, &so->so_rcv); 1286 return (0); 1287 } 1288 1289 if (max && max < 0) 1290 return (EINVAL); 1291 1292 if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) 1293 return (EINVAL); 1294 1295 /* Find sosp, the drain socket where data will be spliced into. */ 1296 if ((error = getsock(curproc, fd, &fp)) != 0) 1297 return (error); 1298 sosp = fp->f_data; 1299 if (sosp->so_proto->pr_usrreqs->pru_send != 1300 so->so_proto->pr_usrreqs->pru_send) { 1301 error = EPROTONOSUPPORT; 1302 goto frele; 1303 } 1304 if (sosp->so_sp == NULL) { 1305 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1306 if (sosp->so_sp == NULL) 1307 sosp->so_sp = sp; 1308 else 1309 pool_put(&sosplice_pool, sp); 1310 } 1311 1312 /* Lock both receive and send buffer. */ 1313 if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) { 1314 goto frele; 1315 } 1316 if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) { 1317 sbunlock(so, &so->so_rcv); 1318 goto frele; 1319 } 1320 1321 if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { 1322 error = EBUSY; 1323 goto release; 1324 } 1325 if (sosp->so_options & SO_ACCEPTCONN) { 1326 error = EOPNOTSUPP; 1327 goto release; 1328 } 1329 if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) { 1330 error = ENOTCONN; 1331 goto release; 1332 } 1333 1334 /* Splice so and sosp together. */ 1335 so->so_sp->ssp_socket = sosp; 1336 sosp->so_sp->ssp_soback = so; 1337 so->so_splicelen = 0; 1338 so->so_splicemax = max; 1339 if (tv) 1340 so->so_idletv = *tv; 1341 else 1342 timerclear(&so->so_idletv); 1343 timeout_set_proc(&so->so_idleto, soidle, so); 1344 task_set(&so->so_splicetask, sotask, so); 1345 1346 /* 1347 * To prevent softnet interrupt from calling somove() while 1348 * we sleep, the socket buffers are not marked as spliced yet. 1349 */ 1350 if (somove(so, M_WAIT)) { 1351 so->so_rcv.sb_flags |= SB_SPLICE; 1352 sosp->so_snd.sb_flags |= SB_SPLICE; 1353 } 1354 1355 release: 1356 sbunlock(sosp, &sosp->so_snd); 1357 sbunlock(so, &so->so_rcv); 1358 frele: 1359 /* 1360 * FRELE() must not be called with the socket lock held. It is safe to 1361 * release the lock here as long as no other operation happen on the 1362 * socket when sosplice() returns. The dance could be avoided by 1363 * grabbing the socket lock inside this function. 1364 */ 1365 sounlock(so); 1366 FRELE(fp, curproc); 1367 solock(so); 1368 return (error); 1369 } 1370 1371 void 1372 sounsplice(struct socket *so, struct socket *sosp, int freeing) 1373 { 1374 soassertlocked(so); 1375 1376 task_del(sosplice_taskq, &so->so_splicetask); 1377 timeout_del(&so->so_idleto); 1378 sosp->so_snd.sb_flags &= ~SB_SPLICE; 1379 so->so_rcv.sb_flags &= ~SB_SPLICE; 1380 so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; 1381 /* Do not wakeup a socket that is about to be freed. 
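sofree() passes SOSP_FREEING_READ/WRITE for the side being torn down so that only the surviving peer is woken.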
*/ 1382 if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so)) 1383 sorwakeup(so); 1384 if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp)) 1385 sowwakeup(sosp); 1386 } 1387 1388 void 1389 soidle(void *arg) 1390 { 1391 struct socket *so = arg; 1392 1393 solock(so); 1394 if (so->so_rcv.sb_flags & SB_SPLICE) { 1395 so->so_error = ETIMEDOUT; 1396 sounsplice(so, so->so_sp->ssp_socket, 0); 1397 } 1398 sounlock(so); 1399 } 1400 1401 void 1402 sotask(void *arg) 1403 { 1404 struct socket *so = arg; 1405 1406 solock(so); 1407 if (so->so_rcv.sb_flags & SB_SPLICE) { 1408 /* 1409 * We may not sleep here as sofree() and unsplice() may be 1410 * called from softnet interrupt context. This would remove 1411 * the socket during somove(). 1412 */ 1413 somove(so, M_DONTWAIT); 1414 } 1415 sounlock(so); 1416 1417 /* Avoid user land starvation. */ 1418 yield(); 1419 } 1420 1421 /* 1422 * The socket splicing task or idle timeout may sleep while grabbing the net 1423 * lock. As sofree() can be called anytime, sotask() or soidle() could access 1424 * the socket memory of a freed socket after wakeup. So delay the pool_put() 1425 * after all pending socket splicing tasks or timeouts have finished. Do this 1426 * by scheduling it on the same threads. 1427 */ 1428 void 1429 soreaper(void *arg) 1430 { 1431 struct socket *so = arg; 1432 1433 /* Reuse splice task, sounsplice() has been called before. */ 1434 task_set(&so->so_sp->ssp_task, soput, so); 1435 task_add(sosplice_taskq, &so->so_sp->ssp_task); 1436 } 1437 1438 void 1439 soput(void *arg) 1440 { 1441 struct socket *so = arg; 1442 1443 pool_put(&sosplice_pool, so->so_sp); 1444 pool_put(&socket_pool, so); 1445 } 1446 1447 /* 1448 * Move data from receive buffer of spliced source socket to send 1449 * buffer of drain socket. Try to move as much as possible in one 1450 * big chunk. It is a TCP only implementation. 1451 * Return value 0 means splicing has been finished, 1 continue. 1452 */ 1453 int 1454 somove(struct socket *so, int wait) 1455 { 1456 struct socket *sosp = so->so_sp->ssp_socket; 1457 struct mbuf *m, **mp, *nextrecord; 1458 u_long len, off, oobmark; 1459 long space; 1460 int error = 0, maxreached = 0; 1461 unsigned int rcvstate; 1462 1463 soassertlocked(so); 1464 1465 nextpkt: 1466 if (so->so_error) { 1467 error = so->so_error; 1468 goto release; 1469 } 1470 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) { 1471 error = EPIPE; 1472 goto release; 1473 } 1474 if (sosp->so_error && sosp->so_error != ETIMEDOUT && 1475 sosp->so_error != EFBIG && sosp->so_error != ELOOP) { 1476 error = sosp->so_error; 1477 goto release; 1478 } 1479 if ((sosp->so_state & SS_ISCONNECTED) == 0) 1480 goto release; 1481 1482 /* Calculate how many bytes can be copied now. 
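For example, with 5000 bytes queued, ssp_max 4096 and ssp_len 1024, len becomes 3072 and maxreached is set; if the drain offers less space, len shrinks to it and maxreached is cleared again.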
*/ 1483 len = so->so_rcv.sb_datacc; 1484 if (so->so_splicemax) { 1485 KASSERT(so->so_splicelen < so->so_splicemax); 1486 if (so->so_splicemax <= so->so_splicelen + len) { 1487 len = so->so_splicemax - so->so_splicelen; 1488 maxreached = 1; 1489 } 1490 } 1491 space = sbspace(sosp, &sosp->so_snd); 1492 if (so->so_oobmark && so->so_oobmark < len && 1493 so->so_oobmark < space + 1024) 1494 space += 1024; 1495 if (space <= 0) { 1496 maxreached = 0; 1497 goto release; 1498 } 1499 if (space < len) { 1500 maxreached = 0; 1501 if (space < sosp->so_snd.sb_lowat) 1502 goto release; 1503 len = space; 1504 } 1505 sosp->so_snd.sb_state |= SS_ISSENDING; 1506 1507 SBLASTRECORDCHK(&so->so_rcv, "somove 1"); 1508 SBLASTMBUFCHK(&so->so_rcv, "somove 1"); 1509 m = so->so_rcv.sb_mb; 1510 if (m == NULL) 1511 goto release; 1512 nextrecord = m->m_nextpkt; 1513 1514 /* Drop address and control information not used with splicing. */ 1515 if (so->so_proto->pr_flags & PR_ADDR) { 1516 #ifdef DIAGNOSTIC 1517 if (m->m_type != MT_SONAME) 1518 panic("somove soname: so %p, so_type %d, m %p, " 1519 "m_type %d", so, so->so_type, m, m->m_type); 1520 #endif 1521 m = m->m_next; 1522 } 1523 while (m && m->m_type == MT_CONTROL) 1524 m = m->m_next; 1525 if (m == NULL) { 1526 sbdroprecord(so, &so->so_rcv); 1527 if (so->so_proto->pr_flags & PR_WANTRCVD) 1528 pru_rcvd(so); 1529 goto nextpkt; 1530 } 1531 1532 /* 1533 * By splicing sockets connected to localhost, userland might create a 1534 * loop. Dissolve splicing with error if loop is detected by counter. 1535 * 1536 * If we deal with looped broadcast/multicast packet we bail out with 1537 * no error to suppress splice termination. 1538 */ 1539 if ((m->m_flags & M_PKTHDR) && 1540 ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) || 1541 ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) { 1542 error = ELOOP; 1543 goto release; 1544 } 1545 1546 if (so->so_proto->pr_flags & PR_ATOMIC) { 1547 if ((m->m_flags & M_PKTHDR) == 0) 1548 panic("somove !PKTHDR: so %p, so_type %d, m %p, " 1549 "m_type %d", so, so->so_type, m, m->m_type); 1550 if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) { 1551 error = EMSGSIZE; 1552 goto release; 1553 } 1554 if (len < m->m_pkthdr.len) 1555 goto release; 1556 if (m->m_pkthdr.len < len) { 1557 maxreached = 0; 1558 len = m->m_pkthdr.len; 1559 } 1560 /* 1561 * Throw away the name mbuf after it has been assured 1562 * that the whole first record can be processed. 1563 */ 1564 m = so->so_rcv.sb_mb; 1565 sbfree(so, &so->so_rcv, m); 1566 so->so_rcv.sb_mb = m_free(m); 1567 sbsync(&so->so_rcv, nextrecord); 1568 } 1569 /* 1570 * Throw away the control mbufs after it has been assured 1571 * that the whole first record can be processed. 1572 */ 1573 m = so->so_rcv.sb_mb; 1574 while (m && m->m_type == MT_CONTROL) { 1575 sbfree(so, &so->so_rcv, m); 1576 so->so_rcv.sb_mb = m_free(m); 1577 m = so->so_rcv.sb_mb; 1578 sbsync(&so->so_rcv, nextrecord); 1579 } 1580 1581 SBLASTRECORDCHK(&so->so_rcv, "somove 2"); 1582 SBLASTMBUFCHK(&so->so_rcv, "somove 2"); 1583 1584 /* Take at most len mbufs out of receive buffer. 
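An mbuf straddling the limit is either left for a later pass or split off with m_copym() below, depending on maxreached and the drain's fill level.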
*/ 1585 for (off = 0, mp = &m; off <= len && *mp; 1586 off += (*mp)->m_len, mp = &(*mp)->m_next) { 1587 u_long size = len - off; 1588 1589 #ifdef DIAGNOSTIC 1590 if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER) 1591 panic("somove type: so %p, so_type %d, m %p, " 1592 "m_type %d", so, so->so_type, *mp, (*mp)->m_type); 1593 #endif 1594 if ((*mp)->m_len > size) { 1595 /* 1596 * Move only a partial mbuf at maximum splice length or 1597 * if the drain buffer is too small for this large mbuf. 1598 */ 1599 if (!maxreached && sosp->so_snd.sb_datacc > 0) { 1600 len -= size; 1601 break; 1602 } 1603 *mp = m_copym(so->so_rcv.sb_mb, 0, size, wait); 1604 if (*mp == NULL) { 1605 len -= size; 1606 break; 1607 } 1608 so->so_rcv.sb_mb->m_data += size; 1609 so->so_rcv.sb_mb->m_len -= size; 1610 so->so_rcv.sb_cc -= size; 1611 so->so_rcv.sb_datacc -= size; 1612 } else { 1613 *mp = so->so_rcv.sb_mb; 1614 sbfree(so, &so->so_rcv, *mp); 1615 so->so_rcv.sb_mb = (*mp)->m_next; 1616 sbsync(&so->so_rcv, nextrecord); 1617 } 1618 } 1619 *mp = NULL; 1620 1621 SBLASTRECORDCHK(&so->so_rcv, "somove 3"); 1622 SBLASTMBUFCHK(&so->so_rcv, "somove 3"); 1623 SBCHECK(so, &so->so_rcv); 1624 if (m == NULL) 1625 goto release; 1626 m->m_nextpkt = NULL; 1627 if (m->m_flags & M_PKTHDR) { 1628 m_resethdr(m); 1629 m->m_pkthdr.len = len; 1630 } 1631 1632 /* Send window update to source peer as receive buffer has changed. */ 1633 if (so->so_proto->pr_flags & PR_WANTRCVD) 1634 pru_rcvd(so); 1635 1636 /* Receive buffer did shrink by len bytes, adjust oob. */ 1637 rcvstate = so->so_rcv.sb_state; 1638 so->so_rcv.sb_state &= ~SS_RCVATMARK; 1639 oobmark = so->so_oobmark; 1640 so->so_oobmark = oobmark > len ? oobmark - len : 0; 1641 if (oobmark) { 1642 if (oobmark == len) 1643 so->so_rcv.sb_state |= SS_RCVATMARK; 1644 if (oobmark >= len) 1645 oobmark = 0; 1646 } 1647 1648 /* 1649 * Handle oob data. If any malloc fails, ignore error. 1650 * TCP urgent data is not very reliable anyway. 1651 */ 1652 while (((rcvstate & SS_RCVATMARK) || oobmark) && 1653 (so->so_options & SO_OOBINLINE)) { 1654 struct mbuf *o = NULL; 1655 1656 if (rcvstate & SS_RCVATMARK) { 1657 o = m_get(wait, MT_DATA); 1658 rcvstate &= ~SS_RCVATMARK; 1659 } else if (oobmark) { 1660 o = m_split(m, oobmark, wait); 1661 if (o) { 1662 error = pru_send(sosp, m, NULL, NULL); 1663 if (error) { 1664 if (sosp->so_snd.sb_state & 1665 SS_CANTSENDMORE) 1666 error = EPIPE; 1667 m_freem(o); 1668 goto release; 1669 } 1670 len -= oobmark; 1671 so->so_splicelen += oobmark; 1672 m = o; 1673 o = m_get(wait, MT_DATA); 1674 } 1675 oobmark = 0; 1676 } 1677 if (o) { 1678 o->m_len = 1; 1679 *mtod(o, caddr_t) = *mtod(m, caddr_t); 1680 error = pru_sendoob(sosp, o, NULL, NULL); 1681 if (error) { 1682 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1683 error = EPIPE; 1684 m_freem(m); 1685 goto release; 1686 } 1687 len -= 1; 1688 so->so_splicelen += 1; 1689 if (oobmark) { 1690 oobmark -= 1; 1691 if (oobmark == 0) 1692 rcvstate |= SS_RCVATMARK; 1693 } 1694 m_adj(m, 1); 1695 } 1696 } 1697 1698 /* Append all remaining data to drain socket. */ 1699 if (so->so_rcv.sb_cc == 0 || maxreached) 1700 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1701 error = pru_send(sosp, m, NULL, NULL); 1702 if (error) { 1703 if (sosp->so_snd.sb_state & SS_CANTSENDMORE) 1704 error = EPIPE; 1705 goto release; 1706 } 1707 so->so_splicelen += len; 1708 1709 /* Move several packets if possible.
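As long as the maximum has not been reached and another record is queued, loop back to nextpkt.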
*/ 1710 if (!maxreached && nextrecord) 1711 goto nextpkt; 1712 1713 release: 1714 sosp->so_snd.sb_state &= ~SS_ISSENDING; 1715 if (!error && maxreached && so->so_splicemax == so->so_splicelen) 1716 error = EFBIG; 1717 if (error) 1718 so->so_error = error; 1719 if (((so->so_rcv.sb_state & SS_CANTRCVMORE) && 1720 so->so_rcv.sb_cc == 0) || 1721 (sosp->so_snd.sb_state & SS_CANTSENDMORE) || 1722 maxreached || error) { 1723 sounsplice(so, sosp, 0); 1724 return (0); 1725 } 1726 if (timerisset(&so->so_idletv)) 1727 timeout_add_tv(&so->so_idleto, &so->so_idletv); 1728 return (1); 1729 } 1730 1731 #endif /* SOCKET_SPLICE */ 1732 1733 void 1734 sorwakeup(struct socket *so) 1735 { 1736 soassertlocked(so); 1737 1738 #ifdef SOCKET_SPLICE 1739 if (so->so_rcv.sb_flags & SB_SPLICE) { 1740 /* 1741 * TCP has a sendbuffer that can handle multiple packets 1742 * at once. So queue the stream a bit to accumulate data. 1743 * The sosplice thread will call somove() later and send 1744 * the packets calling tcp_output() only once. 1745 * In the UDP case, send out the packets immediately. 1746 * Using a thread would make things slower. 1747 */ 1748 if (so->so_proto->pr_flags & PR_WANTRCVD) 1749 task_add(sosplice_taskq, &so->so_splicetask); 1750 else 1751 somove(so, M_DONTWAIT); 1752 } 1753 if (isspliced(so)) 1754 return; 1755 #endif 1756 sowakeup(so, &so->so_rcv); 1757 if (so->so_upcall) 1758 (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT); 1759 } 1760 1761 void 1762 sowwakeup(struct socket *so) 1763 { 1764 soassertlocked(so); 1765 1766 #ifdef SOCKET_SPLICE 1767 if (so->so_snd.sb_flags & SB_SPLICE) 1768 task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask); 1769 if (issplicedback(so)) 1770 return; 1771 #endif 1772 sowakeup(so, &so->so_snd); 1773 } 1774 1775 int 1776 sosetopt(struct socket *so, int level, int optname, struct mbuf *m) 1777 { 1778 int error = 0; 1779 1780 soassertlocked(so); 1781 1782 if (level != SOL_SOCKET) { 1783 if (so->so_proto->pr_ctloutput) { 1784 error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, 1785 level, optname, m); 1786 return (error); 1787 } 1788 error = ENOPROTOOPT; 1789 } else { 1790 switch (optname) { 1791 case SO_BINDANY: 1792 if ((error = suser(curproc)) != 0) /* XXX */ 1793 return (error); 1794 break; 1795 } 1796 1797 switch (optname) { 1798 1799 case SO_LINGER: 1800 if (m == NULL || m->m_len != sizeof (struct linger) || 1801 mtod(m, struct linger *)->l_linger < 0 || 1802 mtod(m, struct linger *)->l_linger > SHRT_MAX) 1803 return (EINVAL); 1804 so->so_linger = mtod(m, struct linger *)->l_linger; 1805 /* FALLTHROUGH */ 1806 1807 case SO_BINDANY: 1808 case SO_DEBUG: 1809 case SO_KEEPALIVE: 1810 case SO_USELOOPBACK: 1811 case SO_BROADCAST: 1812 case SO_REUSEADDR: 1813 case SO_REUSEPORT: 1814 case SO_OOBINLINE: 1815 case SO_TIMESTAMP: 1816 case SO_ZEROIZE: 1817 if (m == NULL || m->m_len < sizeof (int)) 1818 return (EINVAL); 1819 if (*mtod(m, int *)) 1820 so->so_options |= optname; 1821 else 1822 so->so_options &= ~optname; 1823 break; 1824 1825 case SO_DONTROUTE: 1826 if (m == NULL || m->m_len < sizeof (int)) 1827 return (EINVAL); 1828 if (*mtod(m, int *)) 1829 error = EOPNOTSUPP; 1830 break; 1831 1832 case SO_SNDBUF: 1833 case SO_RCVBUF: 1834 case SO_SNDLOWAT: 1835 case SO_RCVLOWAT: 1836 { 1837 u_long cnt; 1838 1839 if (m == NULL || m->m_len < sizeof (int)) 1840 return (EINVAL); 1841 cnt = *mtod(m, int *); 1842 if ((long)cnt <= 0) 1843 cnt = 1; 1844 switch (optname) { 1845 1846 case SO_SNDBUF: 1847 if (so->so_snd.sb_state & SS_CANTSENDMORE) 1848 return (EINVAL); 
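/* sbcheckreserve() rejects oversized reservations before sbreserve() installs the new size. */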
1849 if (sbcheckreserve(cnt, so->so_snd.sb_wat) || 1850 sbreserve(so, &so->so_snd, cnt)) 1851 return (ENOBUFS); 1852 so->so_snd.sb_wat = cnt; 1853 break; 1854 1855 case SO_RCVBUF: 1856 if (so->so_rcv.sb_state & SS_CANTRCVMORE) 1857 return (EINVAL); 1858 if (sbcheckreserve(cnt, so->so_rcv.sb_wat) || 1859 sbreserve(so, &so->so_rcv, cnt)) 1860 return (ENOBUFS); 1861 so->so_rcv.sb_wat = cnt; 1862 break; 1863 1864 case SO_SNDLOWAT: 1865 so->so_snd.sb_lowat = 1866 (cnt > so->so_snd.sb_hiwat) ? 1867 so->so_snd.sb_hiwat : cnt; 1868 break; 1869 case SO_RCVLOWAT: 1870 so->so_rcv.sb_lowat = 1871 (cnt > so->so_rcv.sb_hiwat) ? 1872 so->so_rcv.sb_hiwat : cnt; 1873 break; 1874 } 1875 break; 1876 } 1877 1878 case SO_SNDTIMEO: 1879 case SO_RCVTIMEO: 1880 { 1881 struct timeval tv; 1882 uint64_t nsecs; 1883 1884 if (m == NULL || m->m_len < sizeof (tv)) 1885 return (EINVAL); 1886 memcpy(&tv, mtod(m, struct timeval *), sizeof tv); 1887 if (!timerisvalid(&tv)) 1888 return (EINVAL); 1889 nsecs = TIMEVAL_TO_NSEC(&tv); 1890 if (nsecs == UINT64_MAX) 1891 return (EDOM); 1892 if (nsecs == 0) 1893 nsecs = INFSLP; 1894 switch (optname) { 1895 1896 case SO_SNDTIMEO: 1897 so->so_snd.sb_timeo_nsecs = nsecs; 1898 break; 1899 case SO_RCVTIMEO: 1900 so->so_rcv.sb_timeo_nsecs = nsecs; 1901 break; 1902 } 1903 break; 1904 } 1905 1906 case SO_RTABLE: 1907 if (so->so_proto->pr_domain && 1908 so->so_proto->pr_domain->dom_protosw && 1909 so->so_proto->pr_ctloutput) { 1910 const struct domain *dom = 1911 so->so_proto->pr_domain; 1912 1913 level = dom->dom_protosw->pr_protocol; 1914 error = (*so->so_proto->pr_ctloutput) 1915 (PRCO_SETOPT, so, level, optname, m); 1916 return (error); 1917 } 1918 error = ENOPROTOOPT; 1919 break; 1920 1921 #ifdef SOCKET_SPLICE 1922 case SO_SPLICE: 1923 if (m == NULL) { 1924 error = sosplice(so, -1, 0, NULL); 1925 } else if (m->m_len < sizeof(int)) { 1926 return (EINVAL); 1927 } else if (m->m_len < sizeof(struct splice)) { 1928 error = sosplice(so, *mtod(m, int *), 0, NULL); 1929 } else { 1930 error = sosplice(so, 1931 mtod(m, struct splice *)->sp_fd, 1932 mtod(m, struct splice *)->sp_max, 1933 &mtod(m, struct splice *)->sp_idle); 1934 } 1935 break; 1936 #endif /* SOCKET_SPLICE */ 1937 1938 default: 1939 error = ENOPROTOOPT; 1940 break; 1941 } 1942 if (error == 0 && so->so_proto->pr_ctloutput) { 1943 (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, 1944 level, optname, m); 1945 } 1946 } 1947 1948 return (error); 1949 } 1950 1951 int 1952 sogetopt(struct socket *so, int level, int optname, struct mbuf *m) 1953 { 1954 int error = 0; 1955 1956 soassertlocked(so); 1957 1958 if (level != SOL_SOCKET) { 1959 if (so->so_proto->pr_ctloutput) { 1960 m->m_len = 0; 1961 1962 error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so, 1963 level, optname, m); 1964 return (error); 1965 } else 1966 return (ENOPROTOOPT); 1967 } else { 1968 m->m_len = sizeof (int); 1969 1970 switch (optname) { 1971 1972 case SO_LINGER: 1973 m->m_len = sizeof (struct linger); 1974 mtod(m, struct linger *)->l_onoff = 1975 so->so_options & SO_LINGER; 1976 mtod(m, struct linger *)->l_linger = so->so_linger; 1977 break; 1978 1979 case SO_BINDANY: 1980 case SO_USELOOPBACK: 1981 case SO_DEBUG: 1982 case SO_KEEPALIVE: 1983 case SO_REUSEADDR: 1984 case SO_REUSEPORT: 1985 case SO_BROADCAST: 1986 case SO_OOBINLINE: 1987 case SO_TIMESTAMP: 1988 case SO_ZEROIZE: 1989 *mtod(m, int *) = so->so_options & optname; 1990 break; 1991 1992 case SO_DONTROUTE: 1993 *mtod(m, int *) = 0; 1994 break; 1995 1996 case SO_TYPE: 1997 *mtod(m, int *) = so->so_type; 1998 
break; 1999 2000 case SO_ERROR: 2001 *mtod(m, int *) = so->so_error; 2002 so->so_error = 0; 2003 break; 2004 2005 case SO_DOMAIN: 2006 *mtod(m, int *) = so->so_proto->pr_domain->dom_family; 2007 break; 2008 2009 case SO_PROTOCOL: 2010 *mtod(m, int *) = so->so_proto->pr_protocol; 2011 break; 2012 2013 case SO_SNDBUF: 2014 *mtod(m, int *) = so->so_snd.sb_hiwat; 2015 break; 2016 2017 case SO_RCVBUF: 2018 *mtod(m, int *) = so->so_rcv.sb_hiwat; 2019 break; 2020 2021 case SO_SNDLOWAT: 2022 *mtod(m, int *) = so->so_snd.sb_lowat; 2023 break; 2024 2025 case SO_RCVLOWAT: 2026 *mtod(m, int *) = so->so_rcv.sb_lowat; 2027 break; 2028 2029 case SO_SNDTIMEO: 2030 case SO_RCVTIMEO: 2031 { 2032 struct timeval tv; 2033 uint64_t nsecs = (optname == SO_SNDTIMEO ? 2034 so->so_snd.sb_timeo_nsecs : 2035 so->so_rcv.sb_timeo_nsecs); 2036 2037 m->m_len = sizeof(struct timeval); 2038 memset(&tv, 0, sizeof(tv)); 2039 if (nsecs != INFSLP) 2040 NSEC_TO_TIMEVAL(nsecs, &tv); 2041 memcpy(mtod(m, struct timeval *), &tv, sizeof tv); 2042 break; 2043 } 2044 2045 case SO_RTABLE: 2046 if (so->so_proto->pr_domain && 2047 so->so_proto->pr_domain->dom_protosw && 2048 so->so_proto->pr_ctloutput) { 2049 const struct domain *dom = 2050 so->so_proto->pr_domain; 2051 2052 level = dom->dom_protosw->pr_protocol; 2053 error = (*so->so_proto->pr_ctloutput) 2054 (PRCO_GETOPT, so, level, optname, m); 2055 if (error) 2056 return (error); 2057 break; 2058 } 2059 return (ENOPROTOOPT); 2060 2061 #ifdef SOCKET_SPLICE 2062 case SO_SPLICE: 2063 { 2064 off_t len; 2065 2066 m->m_len = sizeof(off_t); 2067 len = so->so_sp ? so->so_sp->ssp_len : 0; 2068 memcpy(mtod(m, off_t *), &len, sizeof(off_t)); 2069 break; 2070 } 2071 #endif /* SOCKET_SPLICE */ 2072 2073 case SO_PEERCRED: 2074 if (so->so_proto->pr_protocol == AF_UNIX) { 2075 struct unpcb *unp = sotounpcb(so); 2076 2077 if (unp->unp_flags & UNP_FEIDS) { 2078 m->m_len = sizeof(unp->unp_connid); 2079 memcpy(mtod(m, caddr_t), 2080 &(unp->unp_connid), m->m_len); 2081 break; 2082 } 2083 return (ENOTCONN); 2084 } 2085 return (EOPNOTSUPP); 2086 2087 default: 2088 return (ENOPROTOOPT); 2089 } 2090 return (0); 2091 } 2092 } 2093 2094 void 2095 sohasoutofband(struct socket *so) 2096 { 2097 pgsigio(&so->so_sigio, SIGURG, 0); 2098 KNOTE(&so->so_rcv.sb_sel.si_note, 0); 2099 } 2100 2101 int 2102 soo_kqfilter(struct file *fp, struct knote *kn) 2103 { 2104 struct socket *so = kn->kn_fp->f_data; 2105 struct sockbuf *sb; 2106 2107 solock(so); 2108 switch (kn->kn_filter) { 2109 case EVFILT_READ: 2110 if (so->so_options & SO_ACCEPTCONN) 2111 kn->kn_fop = &solisten_filtops; 2112 else 2113 kn->kn_fop = &soread_filtops; 2114 sb = &so->so_rcv; 2115 break; 2116 case EVFILT_WRITE: 2117 kn->kn_fop = &sowrite_filtops; 2118 sb = &so->so_snd; 2119 break; 2120 case EVFILT_EXCEPT: 2121 kn->kn_fop = &soexcept_filtops; 2122 sb = &so->so_rcv; 2123 break; 2124 default: 2125 sounlock(so); 2126 return (EINVAL); 2127 } 2128 2129 klist_insert_locked(&sb->sb_sel.si_note, kn); 2130 sounlock(so); 2131 2132 return (0); 2133 } 2134 2135 void 2136 filt_sordetach(struct knote *kn) 2137 { 2138 struct socket *so = kn->kn_fp->f_data; 2139 2140 klist_remove(&so->so_rcv.sb_sel.si_note, kn); 2141 } 2142 2143 int 2144 filt_soread(struct knote *kn, long hint) 2145 { 2146 struct socket *so = kn->kn_fp->f_data; 2147 int rv = 0; 2148 2149 soassertlocked(so); 2150 2151 kn->kn_data = so->so_rcv.sb_cc; 2152 #ifdef SOCKET_SPLICE 2153 if (isspliced(so)) { 2154 rv = 0; 2155 } else 2156 #endif /* SOCKET_SPLICE */ 2157 if (so->so_rcv.sb_state & SS_CANTRCVMORE) { 
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_sel.si_note, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	soassertlocked(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_snd.sb_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int active;

	soassertlocked(so);

	kn->kn_data = so->so_qlen;
	active = (kn->kn_data != 0);

	if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		} else {
			active = soreadable(so);
		}
	}

	return (active);
}

int
filt_somodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	solock(so);
	rv = knote_modify(kev, kn);
	sounlock(so);

	return (rv);
}

int
filt_soprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	solock(so);
	rv = knote_process(kn, kev);
	sounlock(so);

	return (rv);
}

void
klist_soassertlk(void *arg)
{
	struct socket *so = arg;

	soassertlocked(so);
}

int
klist_solock(void *arg)
{
	struct socket *so = arg;

	solock(so);
	return (1);
}

void
klist_sounlock(void *arg, int ls)
{
	struct socket *so = arg;

	sounlock(so);
}

const struct klistops socket_klistops = {
	.klo_assertlk	= klist_soassertlk,
	.klo_lock	= klist_solock,
	.klo_unlock	= klist_sounlock,
};
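/*
 * socket_klistops lets the generic knote code take and assert the
 * per-socket lock while walking a socket's klists.  A klist is bound
 * to these ops with klist_init(); an illustrative sketch (the actual
 * attach point for sockets may differ):
 *
 *	klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
 *	klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
 */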
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_qe: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
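/*
 * so_print() and sobuf_print() serve the kernel debugger; a socket can
 * be inspected from ddb(4) with "show socket <addr>".  A typical session
 * might look like the following (address and values illustrative,
 * output abridged):
 *
 *	ddb> show socket 0xfffffd8030a1c000
 *	socket 0xfffffd8030a1c000
 *	so_type: 1
 *	so_options: 0x0000
 *	so_state: 0x0002
 *	...
 */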