1 /* $OpenBSD: uipc_socket.c,v 1.289 2022/09/05 14:56:08 bluhm Exp $ */ 2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 #include <sys/file.h> 39 #include <sys/filedesc.h> 40 #include <sys/malloc.h> 41 #include <sys/mbuf.h> 42 #include <sys/domain.h> 43 #include <sys/event.h> 44 #include <sys/protosw.h> 45 #include <sys/socket.h> 46 #include <sys/unpcb.h> 47 #include <sys/socketvar.h> 48 #include <sys/signalvar.h> 49 #include <sys/pool.h> 50 #include <sys/atomic.h> 51 #include <sys/rwlock.h> 52 #include <sys/time.h> 53 #include <sys/refcnt.h> 54 55 #ifdef DDB 56 #include <machine/db_machdep.h> 57 #endif 58 59 void sbsync(struct sockbuf *, struct mbuf *); 60 61 int sosplice(struct socket *, int, off_t, struct timeval *); 62 void sounsplice(struct socket *, struct socket *, int); 63 void soidle(void *); 64 void sotask(void *); 65 void soreaper(void *); 66 void soput(void *); 67 int somove(struct socket *, int); 68 void sorflush(struct socket *); 69 70 void filt_sordetach(struct knote *kn); 71 int filt_soread(struct knote *kn, long hint); 72 void filt_sowdetach(struct knote *kn); 73 int filt_sowrite(struct knote *kn, long hint); 74 int filt_soexcept(struct knote *kn, long hint); 75 int filt_solisten(struct knote *kn, long hint); 76 int filt_somodify(struct kevent *kev, struct knote *kn); 77 int filt_soprocess(struct knote *kn, struct kevent *kev); 78 79 const struct filterops solisten_filtops = { 80 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 81 .f_attach = NULL, 82 .f_detach = filt_sordetach, 83 .f_event = filt_solisten, 84 .f_modify = filt_somodify, 85 .f_process = filt_soprocess, 86 }; 87 88 const struct filterops soread_filtops = { 89 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 90 .f_attach = NULL, 91 .f_detach = filt_sordetach, 92 .f_event = filt_soread, 93 .f_modify = filt_somodify, 94 .f_process = filt_soprocess, 95 }; 96 97 const struct filterops sowrite_filtops = { 98 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 99 .f_attach = NULL, 100 .f_detach = filt_sowdetach, 101 .f_event = filt_sowrite, 102 .f_modify = filt_somodify, 103 .f_process = filt_soprocess, 104 }; 105 106 const struct filterops soexcept_filtops = { 107 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, 108 .f_attach = NULL, 109 .f_detach = filt_sordetach, 110 .f_event = filt_soexcept, 111 .f_modify = filt_somodify, 112 .f_process = filt_soprocess, 113 }; 114 115 #ifndef SOMINCONN 116 #define SOMINCONN 80 117 #endif /* SOMINCONN */ 118 119 int somaxconn = SOMAXCONN; 120 int sominconn = SOMINCONN; 121 122 struct pool socket_pool; 123 #ifdef SOCKET_SPLICE 124 struct pool sosplice_pool; 125 struct taskq *sosplice_taskq; 126 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk"); 127 #endif 128 129 void 130 soinit(void) 131 { 132 pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0, 133 "sockpl", NULL); 134 #ifdef SOCKET_SPLICE 135 pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0, 136 "sosppl", NULL); 137 #endif 138 } 139 140 struct socket * 141 soalloc(int prflags) 142 { 143 struct socket *so; 144 145 so = pool_get(&socket_pool, prflags); 146 if (so == NULL) 147 return (NULL); 148 rw_init_flags(&so->so_lock, "solock", RWL_DUPOK); 149 refcnt_init(&so->so_refcnt); 150 151 return (so); 152 } 153 154 /* 155 * Socket operation routines. 156 * These routines are called by the routines in 157 * sys_socket.c or from a system process, and 158 * implement the semantics of socket operations by 159 * switching out to the protocol specific routines. 160 */ 161 int 162 socreate(int dom, struct socket **aso, int type, int proto) 163 { 164 struct proc *p = curproc; /* XXX */ 165 const struct protosw *prp; 166 struct socket *so; 167 int error; 168 169 if (proto) 170 prp = pffindproto(dom, proto, type); 171 else 172 prp = pffindtype(dom, type); 173 if (prp == NULL || prp->pr_usrreqs == NULL) 174 return (EPROTONOSUPPORT); 175 if (prp->pr_type != type) 176 return (EPROTOTYPE); 177 so = soalloc(PR_WAITOK | PR_ZERO); 178 klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so); 179 klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so); 180 sigio_init(&so->so_sigio); 181 TAILQ_INIT(&so->so_q0); 182 TAILQ_INIT(&so->so_q); 183 so->so_type = type; 184 if (suser(p) == 0) 185 so->so_state = SS_PRIV; 186 so->so_ruid = p->p_ucred->cr_ruid; 187 so->so_euid = p->p_ucred->cr_uid; 188 so->so_rgid = p->p_ucred->cr_rgid; 189 so->so_egid = p->p_ucred->cr_gid; 190 so->so_cpid = p->p_p->ps_pid; 191 so->so_proto = prp; 192 so->so_snd.sb_timeo_nsecs = INFSLP; 193 so->so_rcv.sb_timeo_nsecs = INFSLP; 194 195 solock(so); 196 error = pru_attach(so, proto); 197 if (error) { 198 so->so_state |= SS_NOFDREF; 199 /* sofree() calls sounlock(). */ 200 sofree(so, 0); 201 return (error); 202 } 203 sounlock(so); 204 *aso = so; 205 return (0); 206 } 207 208 int 209 sobind(struct socket *so, struct mbuf *nam, struct proc *p) 210 { 211 soassertlocked(so); 212 return pru_bind(so, nam, p); 213 } 214 215 int 216 solisten(struct socket *so, int backlog) 217 { 218 int error; 219 220 soassertlocked(so); 221 222 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 223 return (EINVAL); 224 #ifdef SOCKET_SPLICE 225 if (isspliced(so) || issplicedback(so)) 226 return (EOPNOTSUPP); 227 #endif /* SOCKET_SPLICE */ 228 error = pru_listen(so); 229 if (error) 230 return (error); 231 if (TAILQ_FIRST(&so->so_q) == NULL) 232 so->so_options |= SO_ACCEPTCONN; 233 if (backlog < 0 || backlog > somaxconn) 234 backlog = somaxconn; 235 if (backlog < sominconn) 236 backlog = sominconn; 237 so->so_qlimit = backlog; 238 return (0); 239 } 240 241 #define SOSP_FREEING_READ 1 242 #define SOSP_FREEING_WRITE 2 243 void 244 sofree(struct socket *so, int keep_lock) 245 { 246 int persocket = solock_persocket(so); 247 248 soassertlocked(so); 249 250 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { 251 if (!keep_lock) 252 sounlock(so); 253 return; 254 } 255 if (so->so_head) { 256 struct socket *head = so->so_head; 257 258 /* 259 * We must not decommission a socket that's on the accept(2) 260 * queue. If we do, then accept(2) may hang after select(2) 261 * indicated that the listening socket was ready. 262 */ 263 if (so->so_onq == &head->so_q) { 264 if (!keep_lock) 265 sounlock(so); 266 return; 267 } 268 269 if (persocket) { 270 /* 271 * Concurrent close of `head' could 272 * abort `so' due to re-lock. 273 */ 274 soref(so); 275 soref(head); 276 sounlock(so); 277 solock(head); 278 solock(so); 279 280 if (so->so_onq != &head->so_q0) { 281 sounlock(head); 282 sounlock(so); 283 sorele(head); 284 sorele(so); 285 return; 286 } 287 288 sorele(head); 289 sorele(so); 290 } 291 292 soqremque(so, 0); 293 294 if (persocket) 295 sounlock(head); 296 } 297 298 if (persocket) { 299 sounlock(so); 300 refcnt_finalize(&so->so_refcnt, "sofinal"); 301 solock(so); 302 } 303 304 sigio_free(&so->so_sigio); 305 klist_free(&so->so_rcv.sb_sel.si_note); 306 klist_free(&so->so_snd.sb_sel.si_note); 307 #ifdef SOCKET_SPLICE 308 if (so->so_sp) { 309 if (issplicedback(so)) { 310 int freeing = SOSP_FREEING_WRITE; 311 312 if (so->so_sp->ssp_soback == so) 313 freeing |= SOSP_FREEING_READ; 314 sounsplice(so->so_sp->ssp_soback, so, freeing); 315 } 316 if (isspliced(so)) { 317 int freeing = SOSP_FREEING_READ; 318 319 if (so == so->so_sp->ssp_socket) 320 freeing |= SOSP_FREEING_WRITE; 321 sounsplice(so, so->so_sp->ssp_socket, freeing); 322 } 323 } 324 #endif /* SOCKET_SPLICE */ 325 sbrelease(so, &so->so_snd); 326 sorflush(so); 327 if (!keep_lock) 328 sounlock(so); 329 #ifdef SOCKET_SPLICE 330 if (so->so_sp) { 331 /* Reuse splice idle, sounsplice() has been called before. */ 332 timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so); 333 timeout_add(&so->so_sp->ssp_idleto, 0); 334 } else 335 #endif /* SOCKET_SPLICE */ 336 { 337 pool_put(&socket_pool, so); 338 } 339 } 340 341 static inline uint64_t 342 solinger_nsec(struct socket *so) 343 { 344 if (so->so_linger == 0) 345 return INFSLP; 346 347 return SEC_TO_NSEC(so->so_linger); 348 } 349 350 /* 351 * Close a socket on last file table reference removal. 352 * Initiate disconnect if connected. 353 * Free socket when disconnect complete. 354 */ 355 int 356 soclose(struct socket *so, int flags) 357 { 358 struct socket *so2; 359 int error = 0; 360 361 solock(so); 362 /* Revoke async IO early. There is a final revocation in sofree(). */ 363 sigio_free(&so->so_sigio); 364 if (so->so_state & SS_ISCONNECTED) { 365 if (so->so_pcb == NULL) 366 goto discard; 367 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 368 error = sodisconnect(so); 369 if (error) 370 goto drop; 371 } 372 if (so->so_options & SO_LINGER) { 373 if ((so->so_state & SS_ISDISCONNECTING) && 374 (flags & MSG_DONTWAIT)) 375 goto drop; 376 while (so->so_state & SS_ISCONNECTED) { 377 error = sosleep_nsec(so, &so->so_timeo, 378 PSOCK | PCATCH, "netcls", 379 solinger_nsec(so)); 380 if (error) 381 break; 382 } 383 } 384 } 385 drop: 386 if (so->so_pcb) { 387 int error2; 388 error2 = pru_detach(so); 389 if (error == 0) 390 error = error2; 391 } 392 if (so->so_options & SO_ACCEPTCONN) { 393 int persocket = solock_persocket(so); 394 395 if (persocket) { 396 /* Wait concurrent sonewconn() threads. */ 397 while (so->so_newconn > 0) { 398 so->so_state |= SS_NEWCONN_WAIT; 399 sosleep_nsec(so, &so->so_newconn, PSOCK, 400 "netlck", INFSLP); 401 } 402 } 403 404 while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { 405 if (persocket) 406 solock(so2); 407 (void) soqremque(so2, 0); 408 if (persocket) 409 sounlock(so); 410 soabort(so2); 411 if (persocket) 412 solock(so); 413 } 414 while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { 415 if (persocket) 416 solock(so2); 417 (void) soqremque(so2, 1); 418 if (persocket) 419 sounlock(so); 420 soabort(so2); 421 if (persocket) 422 solock(so); 423 } 424 } 425 discard: 426 if (so->so_state & SS_NOFDREF) 427 panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type); 428 so->so_state |= SS_NOFDREF; 429 /* sofree() calls sounlock(). */ 430 sofree(so, 0); 431 return (error); 432 } 433 434 void 435 soabort(struct socket *so) 436 { 437 soassertlocked(so); 438 pru_abort(so); 439 } 440 441 int 442 soaccept(struct socket *so, struct mbuf *nam) 443 { 444 int error = 0; 445 446 soassertlocked(so); 447 448 if ((so->so_state & SS_NOFDREF) == 0) 449 panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type); 450 so->so_state &= ~SS_NOFDREF; 451 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 452 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 453 error = pru_accept(so, nam); 454 else 455 error = ECONNABORTED; 456 return (error); 457 } 458 459 int 460 soconnect(struct socket *so, struct mbuf *nam) 461 { 462 int error; 463 464 soassertlocked(so); 465 466 if (so->so_options & SO_ACCEPTCONN) 467 return (EOPNOTSUPP); 468 /* 469 * If protocol is connection-based, can only connect once. 470 * Otherwise, if connected, try to disconnect first. 471 * This allows user to disconnect by connecting to, e.g., 472 * a null address. 473 */ 474 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 475 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 476 (error = sodisconnect(so)))) 477 error = EISCONN; 478 else 479 error = pru_connect(so, nam); 480 return (error); 481 } 482 483 int 484 soconnect2(struct socket *so1, struct socket *so2) 485 { 486 int persocket, error; 487 488 if ((persocket = solock_persocket(so1))) 489 solock_pair(so1, so2); 490 else 491 solock(so1); 492 493 error = pru_connect2(so1, so2); 494 495 if (persocket) 496 sounlock(so2); 497 sounlock(so1); 498 return (error); 499 } 500 501 int 502 sodisconnect(struct socket *so) 503 { 504 int error; 505 506 soassertlocked(so); 507 508 if ((so->so_state & SS_ISCONNECTED) == 0) 509 return (ENOTCONN); 510 if (so->so_state & SS_ISDISCONNECTING) 511 return (EALREADY); 512 error = pru_disconnect(so); 513 return (error); 514 } 515 516 int m_getuio(struct mbuf **, int, long, struct uio *); 517 518 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 519 /* 520 * Send on a socket. 521 * If send must go all at once and message is larger than 522 * send buffering, then hard error. 523 * Lock against other senders. 524 * If must go all at once and not enough room now, then 525 * inform user that this would block and do nothing. 526 * Otherwise, if nonblocking, send as much as possible. 527 * The data to be sent is described by "uio" if nonzero, 528 * otherwise by the mbuf chain "top" (which must be null 529 * if uio is not). Data provided in mbuf chain must be small 530 * enough to send all at once. 531 * 532 * Returns nonzero on error, timeout or signal; callers 533 * must check for short counts if EINTR/ERESTART are returned. 534 * Data and control buffers are freed on return. 535 */ 536 int 537 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 538 struct mbuf *control, int flags) 539 { 540 long space, clen = 0; 541 size_t resid; 542 int error; 543 int atomic = sosendallatonce(so) || top; 544 545 if (uio) 546 resid = uio->uio_resid; 547 else 548 resid = top->m_pkthdr.len; 549 /* MSG_EOR on a SOCK_STREAM socket is invalid. */ 550 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { 551 m_freem(top); 552 m_freem(control); 553 return (EINVAL); 554 } 555 if (uio && uio->uio_procp) 556 uio->uio_procp->p_ru.ru_msgsnd++; 557 if (control) { 558 /* 559 * In theory clen should be unsigned (since control->m_len is). 560 * However, space must be signed, as it might be less than 0 561 * if we over-committed, and we must use a signed comparison 562 * of space and clen. 563 */ 564 clen = control->m_len; 565 /* reserve extra space for AF_UNIX's internalize */ 566 if (so->so_proto->pr_domain->dom_family == AF_UNIX && 567 clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) && 568 mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) 569 clen = CMSG_SPACE( 570 (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) * 571 (sizeof(struct fdpass) / sizeof(int))); 572 } 573 574 #define snderr(errno) { error = errno; goto release; } 575 576 solock(so); 577 restart: 578 if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0) 579 goto out; 580 so->so_state |= SS_ISSENDING; 581 do { 582 if (so->so_state & SS_CANTSENDMORE) 583 snderr(EPIPE); 584 if (so->so_error) { 585 error = so->so_error; 586 so->so_error = 0; 587 snderr(error); 588 } 589 if ((so->so_state & SS_ISCONNECTED) == 0) { 590 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 591 if (!(resid == 0 && clen != 0)) 592 snderr(ENOTCONN); 593 } else if (addr == NULL) 594 snderr(EDESTADDRREQ); 595 } 596 space = sbspace(so, &so->so_snd); 597 if (flags & MSG_OOB) 598 space += 1024; 599 if (so->so_proto->pr_domain->dom_family == AF_UNIX) { 600 if (atomic && resid > so->so_snd.sb_hiwat) 601 snderr(EMSGSIZE); 602 } else { 603 if (clen > so->so_snd.sb_hiwat || 604 (atomic && resid > so->so_snd.sb_hiwat - clen)) 605 snderr(EMSGSIZE); 606 } 607 if (space < clen || 608 (space - clen < resid && 609 (atomic || space < so->so_snd.sb_lowat))) { 610 if (flags & MSG_DONTWAIT) 611 snderr(EWOULDBLOCK); 612 sbunlock(so, &so->so_snd); 613 error = sbwait(so, &so->so_snd); 614 so->so_state &= ~SS_ISSENDING; 615 if (error) 616 goto out; 617 goto restart; 618 } 619 space -= clen; 620 do { 621 if (uio == NULL) { 622 /* 623 * Data is prepackaged in "top". 624 */ 625 resid = 0; 626 if (flags & MSG_EOR) 627 top->m_flags |= M_EOR; 628 } else { 629 sounlock(so); 630 error = m_getuio(&top, atomic, space, uio); 631 solock(so); 632 if (error) 633 goto release; 634 space -= top->m_pkthdr.len; 635 resid = uio->uio_resid; 636 if (flags & MSG_EOR) 637 top->m_flags |= M_EOR; 638 } 639 if (resid == 0) 640 so->so_state &= ~SS_ISSENDING; 641 if (top && so->so_options & SO_ZEROIZE) 642 top->m_flags |= M_ZEROIZE; 643 if (flags & MSG_OOB) 644 error = pru_sendoob(so, top, addr, control); 645 else 646 error = pru_send(so, top, addr, control); 647 clen = 0; 648 control = NULL; 649 top = NULL; 650 if (error) 651 goto release; 652 } while (resid && space > 0); 653 } while (resid); 654 655 release: 656 so->so_state &= ~SS_ISSENDING; 657 sbunlock(so, &so->so_snd); 658 out: 659 sounlock(so); 660 m_freem(top); 661 m_freem(control); 662 return (error); 663 } 664 665 int 666 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio) 667 { 668 struct mbuf *m, *top = NULL; 669 struct mbuf **nextp = ⊤ 670 u_long len, mlen; 671 size_t resid = uio->uio_resid; 672 int error; 673 674 do { 675 if (top == NULL) { 676 MGETHDR(m, M_WAIT, MT_DATA); 677 mlen = MHLEN; 678 m->m_pkthdr.len = 0; 679 m->m_pkthdr.ph_ifidx = 0; 680 } else { 681 MGET(m, M_WAIT, MT_DATA); 682 mlen = MLEN; 683 } 684 /* chain mbuf together */ 685 *nextp = m; 686 nextp = &m->m_next; 687 688 resid = ulmin(resid, space); 689 if (resid >= MINCLSIZE) { 690 MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES)); 691 if ((m->m_flags & M_EXT) == 0) 692 MCLGETL(m, M_NOWAIT, MCLBYTES); 693 if ((m->m_flags & M_EXT) == 0) 694 goto nopages; 695 mlen = m->m_ext.ext_size; 696 len = ulmin(mlen, resid); 697 /* 698 * For datagram protocols, leave room 699 * for protocol headers in first mbuf. 700 */ 701 if (atomic && m == top && len < mlen - max_hdr) 702 m->m_data += max_hdr; 703 } else { 704 nopages: 705 len = ulmin(mlen, resid); 706 /* 707 * For datagram protocols, leave room 708 * for protocol headers in first mbuf. 709 */ 710 if (atomic && m == top && len < mlen - max_hdr) 711 m_align(m, len); 712 } 713 714 error = uiomove(mtod(m, caddr_t), len, uio); 715 if (error) { 716 m_freem(top); 717 return (error); 718 } 719 720 /* adjust counters */ 721 resid = uio->uio_resid; 722 space -= len; 723 m->m_len = len; 724 top->m_pkthdr.len += len; 725 726 /* Is there more space and more data? */ 727 } while (space > 0 && resid > 0); 728 729 *mp = top; 730 return 0; 731 } 732 733 /* 734 * Following replacement or removal of the first mbuf on the first 735 * mbuf chain of a socket buffer, push necessary state changes back 736 * into the socket buffer so that other consumers see the values 737 * consistently. 'nextrecord' is the callers locally stored value of 738 * the original value of sb->sb_mb->m_nextpkt which must be restored 739 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. 740 */ 741 void 742 sbsync(struct sockbuf *sb, struct mbuf *nextrecord) 743 { 744 745 /* 746 * First, update for the new value of nextrecord. If necessary, 747 * make it the first record. 748 */ 749 if (sb->sb_mb != NULL) 750 sb->sb_mb->m_nextpkt = nextrecord; 751 else 752 sb->sb_mb = nextrecord; 753 754 /* 755 * Now update any dependent socket buffer fields to reflect 756 * the new state. This is an inline of SB_EMPTY_FIXUP, with 757 * the addition of a second clause that takes care of the 758 * case where sb_mb has been updated, but remains the last 759 * record. 760 */ 761 if (sb->sb_mb == NULL) { 762 sb->sb_mbtail = NULL; 763 sb->sb_lastrecord = NULL; 764 } else if (sb->sb_mb->m_nextpkt == NULL) 765 sb->sb_lastrecord = sb->sb_mb; 766 } 767 768 /* 769 * Implement receive operations on a socket. 770 * We depend on the way that records are added to the sockbuf 771 * by sbappend*. In particular, each record (mbufs linked through m_next) 772 * must begin with an address if the protocol so specifies, 773 * followed by an optional mbuf or mbufs containing ancillary data, 774 * and then zero or more mbufs of data. 775 * In order to avoid blocking network for the entire time here, we release 776 * the solock() while doing the actual copy to user space. 777 * Although the sockbuf is locked, new data may still be appended, 778 * and thus we must maintain consistency of the sockbuf during that time. 779 * 780 * The caller may receive the data as a single mbuf chain by supplying 781 * an mbuf **mp0 for use in returning the chain. The uio is then used 782 * only for the count in uio_resid. 783 */ 784 int 785 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 786 struct mbuf **mp0, struct mbuf **controlp, int *flagsp, 787 socklen_t controllen) 788 { 789 struct mbuf *m, **mp; 790 struct mbuf *cm; 791 u_long len, offset, moff; 792 int flags, error, type, uio_error = 0; 793 const struct protosw *pr = so->so_proto; 794 struct mbuf *nextrecord; 795 size_t resid, orig_resid = uio->uio_resid; 796 797 mp = mp0; 798 if (paddr) 799 *paddr = NULL; 800 if (controlp) 801 *controlp = NULL; 802 if (flagsp) 803 flags = *flagsp &~ MSG_EOR; 804 else 805 flags = 0; 806 if (flags & MSG_OOB) { 807 m = m_get(M_WAIT, MT_DATA); 808 solock(so); 809 error = pru_rcvoob(so, m, flags & MSG_PEEK); 810 sounlock(so); 811 if (error) 812 goto bad; 813 do { 814 error = uiomove(mtod(m, caddr_t), 815 ulmin(uio->uio_resid, m->m_len), uio); 816 m = m_free(m); 817 } while (uio->uio_resid && error == 0 && m); 818 bad: 819 m_freem(m); 820 return (error); 821 } 822 if (mp) 823 *mp = NULL; 824 825 solock_shared(so); 826 restart: 827 if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { 828 sounlock_shared(so); 829 return (error); 830 } 831 832 m = so->so_rcv.sb_mb; 833 #ifdef SOCKET_SPLICE 834 if (isspliced(so)) 835 m = NULL; 836 #endif /* SOCKET_SPLICE */ 837 /* 838 * If we have less data than requested, block awaiting more 839 * (subject to any timeout) if: 840 * 1. the current count is less than the low water mark, 841 * 2. MSG_WAITALL is set, and it is possible to do the entire 842 * receive operation at once if we block (resid <= hiwat), or 843 * 3. MSG_DONTWAIT is not set. 844 * If MSG_WAITALL is set but resid is larger than the receive buffer, 845 * we have to do the receive in sections, and thus risk returning 846 * a short count if a timeout or signal occurs after we start. 847 */ 848 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 849 so->so_rcv.sb_cc < uio->uio_resid) && 850 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 851 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 852 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 853 #ifdef DIAGNOSTIC 854 if (m == NULL && so->so_rcv.sb_cc) 855 #ifdef SOCKET_SPLICE 856 if (!isspliced(so)) 857 #endif /* SOCKET_SPLICE */ 858 panic("receive 1: so %p, so_type %d, sb_cc %lu", 859 so, so->so_type, so->so_rcv.sb_cc); 860 #endif 861 if (so->so_error) { 862 if (m) 863 goto dontblock; 864 error = so->so_error; 865 if ((flags & MSG_PEEK) == 0) 866 so->so_error = 0; 867 goto release; 868 } 869 if (so->so_state & SS_CANTRCVMORE) { 870 if (m) 871 goto dontblock; 872 else if (so->so_rcv.sb_cc == 0) 873 goto release; 874 } 875 for (; m; m = m->m_next) 876 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 877 m = so->so_rcv.sb_mb; 878 goto dontblock; 879 } 880 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 881 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 882 error = ENOTCONN; 883 goto release; 884 } 885 if (uio->uio_resid == 0 && controlp == NULL) 886 goto release; 887 if (flags & MSG_DONTWAIT) { 888 error = EWOULDBLOCK; 889 goto release; 890 } 891 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 892 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 893 sbunlock(so, &so->so_rcv); 894 error = sbwait(so, &so->so_rcv); 895 if (error) { 896 sounlock_shared(so); 897 return (error); 898 } 899 goto restart; 900 } 901 dontblock: 902 /* 903 * On entry here, m points to the first record of the socket buffer. 904 * From this point onward, we maintain 'nextrecord' as a cache of the 905 * pointer to the next record in the socket buffer. We must keep the 906 * various socket buffer pointers and local stack versions of the 907 * pointers in sync, pushing out modifications before operations that 908 * may sleep, and re-reading them afterwards. 909 * 910 * Otherwise, we will race with the network stack appending new data 911 * or records onto the socket buffer by using inconsistent/stale 912 * versions of the field, possibly resulting in socket buffer 913 * corruption. 914 */ 915 if (uio->uio_procp) 916 uio->uio_procp->p_ru.ru_msgrcv++; 917 KASSERT(m == so->so_rcv.sb_mb); 918 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 919 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 920 nextrecord = m->m_nextpkt; 921 if (pr->pr_flags & PR_ADDR) { 922 #ifdef DIAGNOSTIC 923 if (m->m_type != MT_SONAME) 924 panic("receive 1a: so %p, so_type %d, m %p, m_type %d", 925 so, so->so_type, m, m->m_type); 926 #endif 927 orig_resid = 0; 928 if (flags & MSG_PEEK) { 929 if (paddr) 930 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT); 931 m = m->m_next; 932 } else { 933 sbfree(so, &so->so_rcv, m); 934 if (paddr) { 935 *paddr = m; 936 so->so_rcv.sb_mb = m->m_next; 937 m->m_next = NULL; 938 m = so->so_rcv.sb_mb; 939 } else { 940 so->so_rcv.sb_mb = m_free(m); 941 m = so->so_rcv.sb_mb; 942 } 943 sbsync(&so->so_rcv, nextrecord); 944 } 945 } 946 while (m && m->m_type == MT_CONTROL && error == 0) { 947 int skip = 0; 948 if (flags & MSG_PEEK) { 949 if (mtod(m, struct cmsghdr *)->cmsg_type == 950 SCM_RIGHTS) { 951 /* don't leak internalized SCM_RIGHTS msgs */ 952 skip = 1; 953 } else if (controlp) 954 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); 955 m = m->m_next; 956 } else { 957 sbfree(so, &so->so_rcv, m); 958 so->so_rcv.sb_mb = m->m_next; 959 m->m_nextpkt = m->m_next = NULL; 960 cm = m; 961 m = so->so_rcv.sb_mb; 962 sbsync(&so->so_rcv, nextrecord); 963 if (controlp) { 964 if (pr->pr_domain->dom_externalize) { 965 sounlock_shared(so); 966 error = 967 (*pr->pr_domain->dom_externalize) 968 (cm, controllen, flags); 969 solock_shared(so); 970 } 971 *controlp = cm; 972 } else { 973 /* 974 * Dispose of any SCM_RIGHTS message that went 975 * through the read path rather than recv. 976 */ 977 if (pr->pr_domain->dom_dispose) 978 pr->pr_domain->dom_dispose(cm); 979 m_free(cm); 980 } 981 } 982 if (m != NULL) 983 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 984 else 985 nextrecord = so->so_rcv.sb_mb; 986 if (controlp && !skip) 987 controlp = &(*controlp)->m_next; 988 orig_resid = 0; 989 } 990 991 /* If m is non-NULL, we have some data to read. */ 992 if (m) { 993 type = m->m_type; 994 if (type == MT_OOBDATA) 995 flags |= MSG_OOB; 996 if (m->m_flags & M_BCAST) 997 flags |= MSG_BCAST; 998 if (m->m_flags & M_MCAST) 999 flags |= MSG_MCAST; 1000 } 1001 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 1002 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 1003 1004 moff = 0; 1005 offset = 0; 1006 while (m && uio->uio_resid > 0 && error == 0) { 1007 if (m->m_type == MT_OOBDATA) { 1008 if (type != MT_OOBDATA) 1009 break; 1010 } else if (type == MT_OOBDATA) { 1011 break; 1012 } else if (m->m_type == MT_CONTROL) { 1013 /* 1014 * If there is more than one control message in the 1015 * stream, we do a short read. Next can be received 1016 * or disposed by another system call. 1017 */ 1018 break; 1019 #ifdef DIAGNOSTIC 1020 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) { 1021 panic("receive 3: so %p, so_type %d, m %p, m_type %d", 1022 so, so->so_type, m, m->m_type); 1023 #endif 1024 } 1025 so->so_state &= ~SS_RCVATMARK; 1026 len = uio->uio_resid; 1027 if (so->so_oobmark && len > so->so_oobmark - offset) 1028 len = so->so_oobmark - offset; 1029 if (len > m->m_len - moff) 1030 len = m->m_len - moff; 1031 /* 1032 * If mp is set, just pass back the mbufs. 1033 * Otherwise copy them out via the uio, then free. 1034 * Sockbuf must be consistent here (points to current mbuf, 1035 * it points to next record) when we drop priority; 1036 * we must note any additions to the sockbuf when we 1037 * block interrupts again. 1038 */ 1039 if (mp == NULL && uio_error == 0) { 1040 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 1041 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 1042 resid = uio->uio_resid; 1043 sounlock_shared(so); 1044 uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); 1045 solock_shared(so); 1046 if (uio_error) 1047 uio->uio_resid = resid - len; 1048 } else 1049 uio->uio_resid -= len; 1050 if (len == m->m_len - moff) { 1051 if (m->m_flags & M_EOR) 1052 flags |= MSG_EOR; 1053 if (flags & MSG_PEEK) { 1054 m = m->m_next; 1055 moff = 0; 1056 orig_resid = 0; 1057 } else { 1058 nextrecord = m->m_nextpkt; 1059 sbfree(so, &so->so_rcv, m); 1060 if (mp) { 1061 *mp = m; 1062 mp = &m->m_next; 1063 so->so_rcv.sb_mb = m = m->m_next; 1064 *mp = NULL; 1065 } else { 1066 so->so_rcv.sb_mb = m_free(m); 1067 m = so->so_rcv.sb_mb; 1068 } 1069 /* 1070 * If m != NULL, we also know that 1071 * so->so_rcv.sb_mb != NULL. 1072 */ 1073 KASSERT(so->so_rcv.sb_mb == m); 1074 if (m) { 1075 m->m_nextpkt = nextrecord; 1076 if (nextrecord == NULL) 1077 so->so_rcv.sb_lastrecord = m; 1078 } else { 1079 so->so_rcv.sb_mb = nextrecord; 1080 SB_EMPTY_FIXUP(&so->so_rcv); 1081 } 1082 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 1083 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 1084 } 1085 } else { 1086 if (flags & MSG_PEEK) { 1087 moff += len; 1088 orig_resid = 0; 1089 } else { 1090 if (mp) 1091 *mp = m_copym(m, 0, len, M_WAIT); 1092 m->m_data += len; 1093 m->m_len -= len; 1094 so->so_rcv.sb_cc -= len; 1095 so->so_rcv.sb_datacc -= len; 1096 } 1097 } 1098 if (so->so_oobmark) { 1099 if ((flags & MSG_PEEK) == 0) { 1100 so->so_oobmark -= len; 1101 if (so->so_oobmark == 0) { 1102 so->so_state |= SS_RCVATMARK; 1103 break; 1104 } 1105 } else { 1106 offset += len; 1107 if (offset == so->so_oobmark) 1108 break; 1109 } 1110 } 1111 if (flags & MSG_EOR) 1112 break; 1113 /* 1114 * If the MSG_WAITALL flag is set (for non-atomic socket), 1115 * we must not quit until "uio->uio_resid == 0" or an error 1116 * termination. If a signal/timeout occurs, return 1117 * with a short count but without error. 1118 * Keep sockbuf locked against other readers. 1119 */ 1120 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1121 !sosendallatonce(so) && !nextrecord) { 1122 if (so->so_error || so->so_state & SS_CANTRCVMORE) 1123 break; 1124 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 1125 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 1126 error = sbwait(so, &so->so_rcv); 1127 if (error) { 1128 sbunlock(so, &so->so_rcv); 1129 sounlock_shared(so); 1130 return (0); 1131 } 1132 if ((m = so->so_rcv.sb_mb) != NULL) 1133 nextrecord = m->m_nextpkt; 1134 } 1135 } 1136 1137 if (m && pr->pr_flags & PR_ATOMIC) { 1138 flags |= MSG_TRUNC; 1139 if ((flags & MSG_PEEK) == 0) 1140 (void) sbdroprecord(so, &so->so_rcv); 1141 } 1142 if ((flags & MSG_PEEK) == 0) { 1143 if (m == NULL) { 1144 /* 1145 * First part is an inline SB_EMPTY_FIXUP(). Second 1146 * part makes sure sb_lastrecord is up-to-date if 1147 * there is still data in the socket buffer. 1148 */ 1149 so->so_rcv.sb_mb = nextrecord; 1150 if (so->so_rcv.sb_mb == NULL) { 1151 so->so_rcv.sb_mbtail = NULL; 1152 so->so_rcv.sb_lastrecord = NULL; 1153 } else if (nextrecord->m_nextpkt == NULL) 1154 so->so_rcv.sb_lastrecord = nextrecord; 1155 } 1156 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 1157 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 1158 if (pr->pr_flags & PR_WANTRCVD) 1159 pru_rcvd(so); 1160 } 1161 if (orig_resid == uio->uio_resid && orig_resid && 1162 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 1163 sbunlock(so, &so->so_rcv); 1164 goto restart; 1165 } 1166 1167 if (uio_error) 1168 error = uio_error; 1169 1170 if (flagsp) 1171 *flagsp |= flags; 1172 release: 1173 sbunlock(so, &so->so_rcv); 1174 sounlock_shared(so); 1175 return (error); 1176 } 1177 1178 int 1179 soshutdown(struct socket *so, int how) 1180 { 1181 int error = 0; 1182 1183 solock(so); 1184 switch (how) { 1185 case SHUT_RD: 1186 sorflush(so); 1187 break; 1188 case SHUT_RDWR: 1189 sorflush(so); 1190 /* FALLTHROUGH */ 1191 case SHUT_WR: 1192 error = pru_shutdown(so); 1193 break; 1194 default: 1195 error = EINVAL; 1196 break; 1197 } 1198 sounlock(so); 1199 1200 return (error); 1201 } 1202 1203 void 1204 sorflush(struct socket *so) 1205 { 1206 struct sockbuf *sb = &so->so_rcv; 1207 struct mbuf *m; 1208 const struct protosw *pr = so->so_proto; 1209 int error; 1210 1211 sb->sb_flags |= SB_NOINTR; 1212 error = sblock(so, sb, M_WAITOK); 1213 /* with SB_NOINTR and M_WAITOK sblock() must not fail */ 1214 KASSERT(error == 0); 1215 socantrcvmore(so); 1216 m = sb->sb_mb; 1217 memset(&sb->sb_startzero, 0, 1218 (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); 1219 sb->sb_timeo_nsecs = INFSLP; 1220 sbunlock(so, sb); 1221 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 1222 (*pr->pr_domain->dom_dispose)(m); 1223 m_purge(m); 1224 } 1225 1226 #ifdef SOCKET_SPLICE 1227 1228 #define so_splicelen so_sp->ssp_len 1229 #define so_splicemax so_sp->ssp_max 1230 #define so_idletv so_sp->ssp_idletv 1231 #define so_idleto so_sp->ssp_idleto 1232 #define so_splicetask so_sp->ssp_task 1233 1234 int 1235 sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) 1236 { 1237 struct file *fp; 1238 struct socket *sosp; 1239 struct sosplice *sp; 1240 struct taskq *tq; 1241 int error = 0; 1242 1243 soassertlocked(so); 1244 1245 if (sosplice_taskq == NULL) { 1246 rw_enter_write(&sosplice_lock); 1247 if (sosplice_taskq == NULL) { 1248 tq = taskq_create("sosplice", 1, IPL_SOFTNET, 1249 TASKQ_MPSAFE); 1250 /* Ensure the taskq is fully visible to other CPUs. */ 1251 membar_producer(); 1252 sosplice_taskq = tq; 1253 } 1254 rw_exit_write(&sosplice_lock); 1255 } 1256 if (sosplice_taskq == NULL) 1257 return (ENOMEM); 1258 1259 if ((so->so_proto->pr_flags & PR_SPLICE) == 0) 1260 return (EPROTONOSUPPORT); 1261 if (so->so_options & SO_ACCEPTCONN) 1262 return (EOPNOTSUPP); 1263 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1264 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1265 return (ENOTCONN); 1266 if (so->so_sp == NULL) { 1267 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1268 if (so->so_sp == NULL) 1269 so->so_sp = sp; 1270 else 1271 pool_put(&sosplice_pool, sp); 1272 } 1273 1274 /* If no fd is given, unsplice by removing existing link. */ 1275 if (fd < 0) { 1276 /* Lock receive buffer. */ 1277 if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) { 1278 return (error); 1279 } 1280 if (so->so_sp->ssp_socket) 1281 sounsplice(so, so->so_sp->ssp_socket, 0); 1282 sbunlock(so, &so->so_rcv); 1283 return (0); 1284 } 1285 1286 if (max && max < 0) 1287 return (EINVAL); 1288 1289 if (tv && (tv->tv_sec < 0 || !timerisvalid(tv))) 1290 return (EINVAL); 1291 1292 /* Find sosp, the drain socket where data will be spliced into. */ 1293 if ((error = getsock(curproc, fd, &fp)) != 0) 1294 return (error); 1295 sosp = fp->f_data; 1296 if (sosp->so_proto->pr_usrreqs->pru_send != 1297 so->so_proto->pr_usrreqs->pru_send) { 1298 error = EPROTONOSUPPORT; 1299 goto frele; 1300 } 1301 if (sosp->so_sp == NULL) { 1302 sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); 1303 if (sosp->so_sp == NULL) 1304 sosp->so_sp = sp; 1305 else 1306 pool_put(&sosplice_pool, sp); 1307 } 1308 1309 /* Lock both receive and send buffer. */ 1310 if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) { 1311 goto frele; 1312 } 1313 if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) { 1314 sbunlock(so, &so->so_rcv); 1315 goto frele; 1316 } 1317 1318 if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { 1319 error = EBUSY; 1320 goto release; 1321 } 1322 if (sosp->so_options & SO_ACCEPTCONN) { 1323 error = EOPNOTSUPP; 1324 goto release; 1325 } 1326 if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) { 1327 error = ENOTCONN; 1328 goto release; 1329 } 1330 1331 /* Splice so and sosp together. */ 1332 so->so_sp->ssp_socket = sosp; 1333 sosp->so_sp->ssp_soback = so; 1334 so->so_splicelen = 0; 1335 so->so_splicemax = max; 1336 if (tv) 1337 so->so_idletv = *tv; 1338 else 1339 timerclear(&so->so_idletv); 1340 timeout_set_proc(&so->so_idleto, soidle, so); 1341 task_set(&so->so_splicetask, sotask, so); 1342 1343 /* 1344 * To prevent softnet interrupt from calling somove() while 1345 * we sleep, the socket buffers are not marked as spliced yet. 1346 */ 1347 if (somove(so, M_WAIT)) { 1348 so->so_rcv.sb_flags |= SB_SPLICE; 1349 sosp->so_snd.sb_flags |= SB_SPLICE; 1350 } 1351 1352 release: 1353 sbunlock(sosp, &sosp->so_snd); 1354 sbunlock(so, &so->so_rcv); 1355 frele: 1356 /* 1357 * FRELE() must not be called with the socket lock held. It is safe to 1358 * release the lock here as long as no other operation happen on the 1359 * socket when sosplice() returns. The dance could be avoided by 1360 * grabbing the socket lock inside this function. 1361 */ 1362 sounlock(so); 1363 FRELE(fp, curproc); 1364 solock(so); 1365 return (error); 1366 } 1367 1368 void 1369 sounsplice(struct socket *so, struct socket *sosp, int freeing) 1370 { 1371 soassertlocked(so); 1372 1373 task_del(sosplice_taskq, &so->so_splicetask); 1374 timeout_del(&so->so_idleto); 1375 sosp->so_snd.sb_flags &= ~SB_SPLICE; 1376 so->so_rcv.sb_flags &= ~SB_SPLICE; 1377 so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; 1378 /* Do not wakeup a socket that is about to be freed. */ 1379 if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so)) 1380 sorwakeup(so); 1381 if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp)) 1382 sowwakeup(sosp); 1383 } 1384 1385 void 1386 soidle(void *arg) 1387 { 1388 struct socket *so = arg; 1389 1390 solock(so); 1391 if (so->so_rcv.sb_flags & SB_SPLICE) { 1392 so->so_error = ETIMEDOUT; 1393 sounsplice(so, so->so_sp->ssp_socket, 0); 1394 } 1395 sounlock(so); 1396 } 1397 1398 void 1399 sotask(void *arg) 1400 { 1401 struct socket *so = arg; 1402 1403 solock(so); 1404 if (so->so_rcv.sb_flags & SB_SPLICE) { 1405 /* 1406 * We may not sleep here as sofree() and unsplice() may be 1407 * called from softnet interrupt context. This would remove 1408 * the socket during somove(). 1409 */ 1410 somove(so, M_DONTWAIT); 1411 } 1412 sounlock(so); 1413 1414 /* Avoid user land starvation. */ 1415 yield(); 1416 } 1417 1418 /* 1419 * The socket splicing task or idle timeout may sleep while grabbing the net 1420 * lock. As sofree() can be called anytime, sotask() or soidle() could access 1421 * the socket memory of a freed socket after wakeup. So delay the pool_put() 1422 * after all pending socket splicing tasks or timeouts have finished. Do this 1423 * by scheduling it on the same threads. 1424 */ 1425 void 1426 soreaper(void *arg) 1427 { 1428 struct socket *so = arg; 1429 1430 /* Reuse splice task, sounsplice() has been called before. */ 1431 task_set(&so->so_sp->ssp_task, soput, so); 1432 task_add(sosplice_taskq, &so->so_sp->ssp_task); 1433 } 1434 1435 void 1436 soput(void *arg) 1437 { 1438 struct socket *so = arg; 1439 1440 pool_put(&sosplice_pool, so->so_sp); 1441 pool_put(&socket_pool, so); 1442 } 1443 1444 /* 1445 * Move data from receive buffer of spliced source socket to send 1446 * buffer of drain socket. Try to move as much as possible in one 1447 * big chunk. It is a TCP only implementation. 1448 * Return value 0 means splicing has been finished, 1 continue. 1449 */ 1450 int 1451 somove(struct socket *so, int wait) 1452 { 1453 struct socket *sosp = so->so_sp->ssp_socket; 1454 struct mbuf *m, **mp, *nextrecord; 1455 u_long len, off, oobmark; 1456 long space; 1457 int error = 0, maxreached = 0; 1458 unsigned int state; 1459 1460 soassertlocked(so); 1461 1462 nextpkt: 1463 if (so->so_error) { 1464 error = so->so_error; 1465 goto release; 1466 } 1467 if (sosp->so_state & SS_CANTSENDMORE) { 1468 error = EPIPE; 1469 goto release; 1470 } 1471 if (sosp->so_error && sosp->so_error != ETIMEDOUT && 1472 sosp->so_error != EFBIG && sosp->so_error != ELOOP) { 1473 error = sosp->so_error; 1474 goto release; 1475 } 1476 if ((sosp->so_state & SS_ISCONNECTED) == 0) 1477 goto release; 1478 1479 /* Calculate how many bytes can be copied now. */ 1480 len = so->so_rcv.sb_datacc; 1481 if (so->so_splicemax) { 1482 KASSERT(so->so_splicelen < so->so_splicemax); 1483 if (so->so_splicemax <= so->so_splicelen + len) { 1484 len = so->so_splicemax - so->so_splicelen; 1485 maxreached = 1; 1486 } 1487 } 1488 space = sbspace(sosp, &sosp->so_snd); 1489 if (so->so_oobmark && so->so_oobmark < len && 1490 so->so_oobmark < space + 1024) 1491 space += 1024; 1492 if (space <= 0) { 1493 maxreached = 0; 1494 goto release; 1495 } 1496 if (space < len) { 1497 maxreached = 0; 1498 if (space < sosp->so_snd.sb_lowat) 1499 goto release; 1500 len = space; 1501 } 1502 sosp->so_state |= SS_ISSENDING; 1503 1504 SBLASTRECORDCHK(&so->so_rcv, "somove 1"); 1505 SBLASTMBUFCHK(&so->so_rcv, "somove 1"); 1506 m = so->so_rcv.sb_mb; 1507 if (m == NULL) 1508 goto release; 1509 nextrecord = m->m_nextpkt; 1510 1511 /* Drop address and control information not used with splicing. */ 1512 if (so->so_proto->pr_flags & PR_ADDR) { 1513 #ifdef DIAGNOSTIC 1514 if (m->m_type != MT_SONAME) 1515 panic("somove soname: so %p, so_type %d, m %p, " 1516 "m_type %d", so, so->so_type, m, m->m_type); 1517 #endif 1518 m = m->m_next; 1519 } 1520 while (m && m->m_type == MT_CONTROL) 1521 m = m->m_next; 1522 if (m == NULL) { 1523 sbdroprecord(so, &so->so_rcv); 1524 if (so->so_proto->pr_flags & PR_WANTRCVD) 1525 pru_rcvd(so); 1526 goto nextpkt; 1527 } 1528 1529 /* 1530 * By splicing sockets connected to localhost, userland might create a 1531 * loop. Dissolve splicing with error if loop is detected by counter. 1532 * 1533 * If we deal with looped broadcast/multicast packet we bail out with 1534 * no error to suppress splice termination. 1535 */ 1536 if ((m->m_flags & M_PKTHDR) && 1537 ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) || 1538 ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) { 1539 error = ELOOP; 1540 goto release; 1541 } 1542 1543 if (so->so_proto->pr_flags & PR_ATOMIC) { 1544 if ((m->m_flags & M_PKTHDR) == 0) 1545 panic("somove !PKTHDR: so %p, so_type %d, m %p, " 1546 "m_type %d", so, so->so_type, m, m->m_type); 1547 if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) { 1548 error = EMSGSIZE; 1549 goto release; 1550 } 1551 if (len < m->m_pkthdr.len) 1552 goto release; 1553 if (m->m_pkthdr.len < len) { 1554 maxreached = 0; 1555 len = m->m_pkthdr.len; 1556 } 1557 /* 1558 * Throw away the name mbuf after it has been assured 1559 * that the whole first record can be processed. 1560 */ 1561 m = so->so_rcv.sb_mb; 1562 sbfree(so, &so->so_rcv, m); 1563 so->so_rcv.sb_mb = m_free(m); 1564 sbsync(&so->so_rcv, nextrecord); 1565 } 1566 /* 1567 * Throw away the control mbufs after it has been assured 1568 * that the whole first record can be processed. 1569 */ 1570 m = so->so_rcv.sb_mb; 1571 while (m && m->m_type == MT_CONTROL) { 1572 sbfree(so, &so->so_rcv, m); 1573 so->so_rcv.sb_mb = m_free(m); 1574 m = so->so_rcv.sb_mb; 1575 sbsync(&so->so_rcv, nextrecord); 1576 } 1577 1578 SBLASTRECORDCHK(&so->so_rcv, "somove 2"); 1579 SBLASTMBUFCHK(&so->so_rcv, "somove 2"); 1580 1581 /* Take at most len mbufs out of receive buffer. */ 1582 for (off = 0, mp = &m; off <= len && *mp; 1583 off += (*mp)->m_len, mp = &(*mp)->m_next) { 1584 u_long size = len - off; 1585 1586 #ifdef DIAGNOSTIC 1587 if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER) 1588 panic("somove type: so %p, so_type %d, m %p, " 1589 "m_type %d", so, so->so_type, *mp, (*mp)->m_type); 1590 #endif 1591 if ((*mp)->m_len > size) { 1592 /* 1593 * Move only a partial mbuf at maximum splice length or 1594 * if the drain buffer is too small for this large mbuf. 1595 */ 1596 if (!maxreached && so->so_snd.sb_datacc > 0) { 1597 len -= size; 1598 break; 1599 } 1600 *mp = m_copym(so->so_rcv.sb_mb, 0, size, wait); 1601 if (*mp == NULL) { 1602 len -= size; 1603 break; 1604 } 1605 so->so_rcv.sb_mb->m_data += size; 1606 so->so_rcv.sb_mb->m_len -= size; 1607 so->so_rcv.sb_cc -= size; 1608 so->so_rcv.sb_datacc -= size; 1609 } else { 1610 *mp = so->so_rcv.sb_mb; 1611 sbfree(so, &so->so_rcv, *mp); 1612 so->so_rcv.sb_mb = (*mp)->m_next; 1613 sbsync(&so->so_rcv, nextrecord); 1614 } 1615 } 1616 *mp = NULL; 1617 1618 SBLASTRECORDCHK(&so->so_rcv, "somove 3"); 1619 SBLASTMBUFCHK(&so->so_rcv, "somove 3"); 1620 SBCHECK(so, &so->so_rcv); 1621 if (m == NULL) 1622 goto release; 1623 m->m_nextpkt = NULL; 1624 if (m->m_flags & M_PKTHDR) { 1625 m_resethdr(m); 1626 m->m_pkthdr.len = len; 1627 } 1628 1629 /* Send window update to source peer as receive buffer has changed. */ 1630 if (so->so_proto->pr_flags & PR_WANTRCVD) 1631 pru_rcvd(so); 1632 1633 /* Receive buffer did shrink by len bytes, adjust oob. */ 1634 state = so->so_state; 1635 so->so_state &= ~SS_RCVATMARK; 1636 oobmark = so->so_oobmark; 1637 so->so_oobmark = oobmark > len ? oobmark - len : 0; 1638 if (oobmark) { 1639 if (oobmark == len) 1640 so->so_state |= SS_RCVATMARK; 1641 if (oobmark >= len) 1642 oobmark = 0; 1643 } 1644 1645 /* 1646 * Handle oob data. If any malloc fails, ignore error. 1647 * TCP urgent data is not very reliable anyway. 1648 */ 1649 while (((state & SS_RCVATMARK) || oobmark) && 1650 (so->so_options & SO_OOBINLINE)) { 1651 struct mbuf *o = NULL; 1652 1653 if (state & SS_RCVATMARK) { 1654 o = m_get(wait, MT_DATA); 1655 state &= ~SS_RCVATMARK; 1656 } else if (oobmark) { 1657 o = m_split(m, oobmark, wait); 1658 if (o) { 1659 error = pru_send(sosp, m, NULL, NULL); 1660 if (error) { 1661 if (sosp->so_state & SS_CANTSENDMORE) 1662 error = EPIPE; 1663 m_freem(o); 1664 goto release; 1665 } 1666 len -= oobmark; 1667 so->so_splicelen += oobmark; 1668 m = o; 1669 o = m_get(wait, MT_DATA); 1670 } 1671 oobmark = 0; 1672 } 1673 if (o) { 1674 o->m_len = 1; 1675 *mtod(o, caddr_t) = *mtod(m, caddr_t); 1676 error = pru_sendoob(sosp, o, NULL, NULL); 1677 if (error) { 1678 if (sosp->so_state & SS_CANTSENDMORE) 1679 error = EPIPE; 1680 m_freem(m); 1681 goto release; 1682 } 1683 len -= 1; 1684 so->so_splicelen += 1; 1685 if (oobmark) { 1686 oobmark -= 1; 1687 if (oobmark == 0) 1688 state |= SS_RCVATMARK; 1689 } 1690 m_adj(m, 1); 1691 } 1692 } 1693 1694 /* Append all remaining data to drain socket. */ 1695 if (so->so_rcv.sb_cc == 0 || maxreached) 1696 sosp->so_state &= ~SS_ISSENDING; 1697 error = pru_send(sosp, m, NULL, NULL); 1698 if (error) { 1699 if (sosp->so_state & SS_CANTSENDMORE) 1700 error = EPIPE; 1701 goto release; 1702 } 1703 so->so_splicelen += len; 1704 1705 /* Move several packets if possible. */ 1706 if (!maxreached && nextrecord) 1707 goto nextpkt; 1708 1709 release: 1710 sosp->so_state &= ~SS_ISSENDING; 1711 if (!error && maxreached && so->so_splicemax == so->so_splicelen) 1712 error = EFBIG; 1713 if (error) 1714 so->so_error = error; 1715 if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) || 1716 (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) { 1717 sounsplice(so, sosp, 0); 1718 return (0); 1719 } 1720 if (timerisset(&so->so_idletv)) 1721 timeout_add_tv(&so->so_idleto, &so->so_idletv); 1722 return (1); 1723 } 1724 1725 #endif /* SOCKET_SPLICE */ 1726 1727 void 1728 sorwakeup(struct socket *so) 1729 { 1730 soassertlocked(so); 1731 1732 #ifdef SOCKET_SPLICE 1733 if (so->so_rcv.sb_flags & SB_SPLICE) { 1734 /* 1735 * TCP has a sendbuffer that can handle multiple packets 1736 * at once. So queue the stream a bit to accumulate data. 1737 * The sosplice thread will call somove() later and send 1738 * the packets calling tcp_output() only once. 1739 * In the UDP case, send out the packets immediately. 1740 * Using a thread would make things slower. 1741 */ 1742 if (so->so_proto->pr_flags & PR_WANTRCVD) 1743 task_add(sosplice_taskq, &so->so_splicetask); 1744 else 1745 somove(so, M_DONTWAIT); 1746 } 1747 if (isspliced(so)) 1748 return; 1749 #endif 1750 sowakeup(so, &so->so_rcv); 1751 if (so->so_upcall) 1752 (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT); 1753 } 1754 1755 void 1756 sowwakeup(struct socket *so) 1757 { 1758 soassertlocked(so); 1759 1760 #ifdef SOCKET_SPLICE 1761 if (so->so_snd.sb_flags & SB_SPLICE) 1762 task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask); 1763 if (issplicedback(so)) 1764 return; 1765 #endif 1766 sowakeup(so, &so->so_snd); 1767 } 1768 1769 int 1770 sosetopt(struct socket *so, int level, int optname, struct mbuf *m) 1771 { 1772 int error = 0; 1773 1774 soassertlocked(so); 1775 1776 if (level != SOL_SOCKET) { 1777 if (so->so_proto->pr_ctloutput) { 1778 error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, 1779 level, optname, m); 1780 return (error); 1781 } 1782 error = ENOPROTOOPT; 1783 } else { 1784 switch (optname) { 1785 case SO_BINDANY: 1786 if ((error = suser(curproc)) != 0) /* XXX */ 1787 return (error); 1788 break; 1789 } 1790 1791 switch (optname) { 1792 1793 case SO_LINGER: 1794 if (m == NULL || m->m_len != sizeof (struct linger) || 1795 mtod(m, struct linger *)->l_linger < 0 || 1796 mtod(m, struct linger *)->l_linger > SHRT_MAX) 1797 return (EINVAL); 1798 so->so_linger = mtod(m, struct linger *)->l_linger; 1799 /* FALLTHROUGH */ 1800 1801 case SO_BINDANY: 1802 case SO_DEBUG: 1803 case SO_KEEPALIVE: 1804 case SO_USELOOPBACK: 1805 case SO_BROADCAST: 1806 case SO_REUSEADDR: 1807 case SO_REUSEPORT: 1808 case SO_OOBINLINE: 1809 case SO_TIMESTAMP: 1810 case SO_ZEROIZE: 1811 if (m == NULL || m->m_len < sizeof (int)) 1812 return (EINVAL); 1813 if (*mtod(m, int *)) 1814 so->so_options |= optname; 1815 else 1816 so->so_options &= ~optname; 1817 break; 1818 1819 case SO_DONTROUTE: 1820 if (m == NULL || m->m_len < sizeof (int)) 1821 return (EINVAL); 1822 if (*mtod(m, int *)) 1823 error = EOPNOTSUPP; 1824 break; 1825 1826 case SO_SNDBUF: 1827 case SO_RCVBUF: 1828 case SO_SNDLOWAT: 1829 case SO_RCVLOWAT: 1830 { 1831 u_long cnt; 1832 1833 if (m == NULL || m->m_len < sizeof (int)) 1834 return (EINVAL); 1835 cnt = *mtod(m, int *); 1836 if ((long)cnt <= 0) 1837 cnt = 1; 1838 switch (optname) { 1839 1840 case SO_SNDBUF: 1841 if (so->so_state & SS_CANTSENDMORE) 1842 return (EINVAL); 1843 if (sbcheckreserve(cnt, so->so_snd.sb_wat) || 1844 sbreserve(so, &so->so_snd, cnt)) 1845 return (ENOBUFS); 1846 so->so_snd.sb_wat = cnt; 1847 break; 1848 1849 case SO_RCVBUF: 1850 if (so->so_state & SS_CANTRCVMORE) 1851 return (EINVAL); 1852 if (sbcheckreserve(cnt, so->so_rcv.sb_wat) || 1853 sbreserve(so, &so->so_rcv, cnt)) 1854 return (ENOBUFS); 1855 so->so_rcv.sb_wat = cnt; 1856 break; 1857 1858 case SO_SNDLOWAT: 1859 so->so_snd.sb_lowat = 1860 (cnt > so->so_snd.sb_hiwat) ? 1861 so->so_snd.sb_hiwat : cnt; 1862 break; 1863 case SO_RCVLOWAT: 1864 so->so_rcv.sb_lowat = 1865 (cnt > so->so_rcv.sb_hiwat) ? 1866 so->so_rcv.sb_hiwat : cnt; 1867 break; 1868 } 1869 break; 1870 } 1871 1872 case SO_SNDTIMEO: 1873 case SO_RCVTIMEO: 1874 { 1875 struct timeval tv; 1876 uint64_t nsecs; 1877 1878 if (m == NULL || m->m_len < sizeof (tv)) 1879 return (EINVAL); 1880 memcpy(&tv, mtod(m, struct timeval *), sizeof tv); 1881 if (!timerisvalid(&tv)) 1882 return (EINVAL); 1883 nsecs = TIMEVAL_TO_NSEC(&tv); 1884 if (nsecs == UINT64_MAX) 1885 return (EDOM); 1886 if (nsecs == 0) 1887 nsecs = INFSLP; 1888 switch (optname) { 1889 1890 case SO_SNDTIMEO: 1891 so->so_snd.sb_timeo_nsecs = nsecs; 1892 break; 1893 case SO_RCVTIMEO: 1894 so->so_rcv.sb_timeo_nsecs = nsecs; 1895 break; 1896 } 1897 break; 1898 } 1899 1900 case SO_RTABLE: 1901 if (so->so_proto->pr_domain && 1902 so->so_proto->pr_domain->dom_protosw && 1903 so->so_proto->pr_ctloutput) { 1904 const struct domain *dom = 1905 so->so_proto->pr_domain; 1906 1907 level = dom->dom_protosw->pr_protocol; 1908 error = (*so->so_proto->pr_ctloutput) 1909 (PRCO_SETOPT, so, level, optname, m); 1910 return (error); 1911 } 1912 error = ENOPROTOOPT; 1913 break; 1914 1915 #ifdef SOCKET_SPLICE 1916 case SO_SPLICE: 1917 if (m == NULL) { 1918 error = sosplice(so, -1, 0, NULL); 1919 } else if (m->m_len < sizeof(int)) { 1920 return (EINVAL); 1921 } else if (m->m_len < sizeof(struct splice)) { 1922 error = sosplice(so, *mtod(m, int *), 0, NULL); 1923 } else { 1924 error = sosplice(so, 1925 mtod(m, struct splice *)->sp_fd, 1926 mtod(m, struct splice *)->sp_max, 1927 &mtod(m, struct splice *)->sp_idle); 1928 } 1929 break; 1930 #endif /* SOCKET_SPLICE */ 1931 1932 default: 1933 error = ENOPROTOOPT; 1934 break; 1935 } 1936 if (error == 0 && so->so_proto->pr_ctloutput) { 1937 (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, 1938 level, optname, m); 1939 } 1940 } 1941 1942 return (error); 1943 } 1944 1945 int 1946 sogetopt(struct socket *so, int level, int optname, struct mbuf *m) 1947 { 1948 int error = 0; 1949 1950 soassertlocked(so); 1951 1952 if (level != SOL_SOCKET) { 1953 if (so->so_proto->pr_ctloutput) { 1954 m->m_len = 0; 1955 1956 error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so, 1957 level, optname, m); 1958 if (error) 1959 return (error); 1960 return (0); 1961 } else 1962 return (ENOPROTOOPT); 1963 } else { 1964 m->m_len = sizeof (int); 1965 1966 switch (optname) { 1967 1968 case SO_LINGER: 1969 m->m_len = sizeof (struct linger); 1970 mtod(m, struct linger *)->l_onoff = 1971 so->so_options & SO_LINGER; 1972 mtod(m, struct linger *)->l_linger = so->so_linger; 1973 break; 1974 1975 case SO_BINDANY: 1976 case SO_USELOOPBACK: 1977 case SO_DEBUG: 1978 case SO_KEEPALIVE: 1979 case SO_REUSEADDR: 1980 case SO_REUSEPORT: 1981 case SO_BROADCAST: 1982 case SO_OOBINLINE: 1983 case SO_TIMESTAMP: 1984 case SO_ZEROIZE: 1985 *mtod(m, int *) = so->so_options & optname; 1986 break; 1987 1988 case SO_DONTROUTE: 1989 *mtod(m, int *) = 0; 1990 break; 1991 1992 case SO_TYPE: 1993 *mtod(m, int *) = so->so_type; 1994 break; 1995 1996 case SO_ERROR: 1997 *mtod(m, int *) = so->so_error; 1998 so->so_error = 0; 1999 break; 2000 2001 case SO_DOMAIN: 2002 *mtod(m, int *) = so->so_proto->pr_domain->dom_family; 2003 break; 2004 2005 case SO_PROTOCOL: 2006 *mtod(m, int *) = so->so_proto->pr_protocol; 2007 break; 2008 2009 case SO_SNDBUF: 2010 *mtod(m, int *) = so->so_snd.sb_hiwat; 2011 break; 2012 2013 case SO_RCVBUF: 2014 *mtod(m, int *) = so->so_rcv.sb_hiwat; 2015 break; 2016 2017 case SO_SNDLOWAT: 2018 *mtod(m, int *) = so->so_snd.sb_lowat; 2019 break; 2020 2021 case SO_RCVLOWAT: 2022 *mtod(m, int *) = so->so_rcv.sb_lowat; 2023 break; 2024 2025 case SO_SNDTIMEO: 2026 case SO_RCVTIMEO: 2027 { 2028 struct timeval tv; 2029 uint64_t nsecs = (optname == SO_SNDTIMEO ? 2030 so->so_snd.sb_timeo_nsecs : 2031 so->so_rcv.sb_timeo_nsecs); 2032 2033 m->m_len = sizeof(struct timeval); 2034 memset(&tv, 0, sizeof(tv)); 2035 if (nsecs != INFSLP) 2036 NSEC_TO_TIMEVAL(nsecs, &tv); 2037 memcpy(mtod(m, struct timeval *), &tv, sizeof tv); 2038 break; 2039 } 2040 2041 case SO_RTABLE: 2042 if (so->so_proto->pr_domain && 2043 so->so_proto->pr_domain->dom_protosw && 2044 so->so_proto->pr_ctloutput) { 2045 const struct domain *dom = 2046 so->so_proto->pr_domain; 2047 2048 level = dom->dom_protosw->pr_protocol; 2049 error = (*so->so_proto->pr_ctloutput) 2050 (PRCO_GETOPT, so, level, optname, m); 2051 if (error) 2052 return (error); 2053 break; 2054 } 2055 return (ENOPROTOOPT); 2056 2057 #ifdef SOCKET_SPLICE 2058 case SO_SPLICE: 2059 { 2060 off_t len; 2061 2062 m->m_len = sizeof(off_t); 2063 len = so->so_sp ? so->so_sp->ssp_len : 0; 2064 memcpy(mtod(m, off_t *), &len, sizeof(off_t)); 2065 break; 2066 } 2067 #endif /* SOCKET_SPLICE */ 2068 2069 case SO_PEERCRED: 2070 if (so->so_proto->pr_protocol == AF_UNIX) { 2071 struct unpcb *unp = sotounpcb(so); 2072 2073 if (unp->unp_flags & UNP_FEIDS) { 2074 m->m_len = sizeof(unp->unp_connid); 2075 memcpy(mtod(m, caddr_t), 2076 &(unp->unp_connid), m->m_len); 2077 break; 2078 } 2079 return (ENOTCONN); 2080 } 2081 return (EOPNOTSUPP); 2082 2083 default: 2084 return (ENOPROTOOPT); 2085 } 2086 return (0); 2087 } 2088 } 2089 2090 void 2091 sohasoutofband(struct socket *so) 2092 { 2093 pgsigio(&so->so_sigio, SIGURG, 0); 2094 KNOTE(&so->so_rcv.sb_sel.si_note, 0); 2095 } 2096 2097 int 2098 soo_kqfilter(struct file *fp, struct knote *kn) 2099 { 2100 struct socket *so = kn->kn_fp->f_data; 2101 struct sockbuf *sb; 2102 2103 solock(so); 2104 switch (kn->kn_filter) { 2105 case EVFILT_READ: 2106 if (so->so_options & SO_ACCEPTCONN) 2107 kn->kn_fop = &solisten_filtops; 2108 else 2109 kn->kn_fop = &soread_filtops; 2110 sb = &so->so_rcv; 2111 break; 2112 case EVFILT_WRITE: 2113 kn->kn_fop = &sowrite_filtops; 2114 sb = &so->so_snd; 2115 break; 2116 case EVFILT_EXCEPT: 2117 kn->kn_fop = &soexcept_filtops; 2118 sb = &so->so_rcv; 2119 break; 2120 default: 2121 sounlock(so); 2122 return (EINVAL); 2123 } 2124 2125 klist_insert_locked(&sb->sb_sel.si_note, kn); 2126 sounlock(so); 2127 2128 return (0); 2129 } 2130 2131 void 2132 filt_sordetach(struct knote *kn) 2133 { 2134 struct socket *so = kn->kn_fp->f_data; 2135 2136 klist_remove(&so->so_rcv.sb_sel.si_note, kn); 2137 } 2138 2139 int 2140 filt_soread(struct knote *kn, long hint) 2141 { 2142 struct socket *so = kn->kn_fp->f_data; 2143 int rv = 0; 2144 2145 soassertlocked(so); 2146 2147 kn->kn_data = so->so_rcv.sb_cc; 2148 #ifdef SOCKET_SPLICE 2149 if (isspliced(so)) { 2150 rv = 0; 2151 } else 2152 #endif /* SOCKET_SPLICE */ 2153 if (so->so_state & SS_CANTRCVMORE) { 2154 kn->kn_flags |= EV_EOF; 2155 if (kn->kn_flags & __EV_POLL) { 2156 if (so->so_state & SS_ISDISCONNECTED) 2157 kn->kn_flags |= __EV_HUP; 2158 } 2159 kn->kn_fflags = so->so_error; 2160 rv = 1; 2161 } else if (so->so_error) { /* temporary udp error */ 2162 rv = 1; 2163 } else if (kn->kn_sfflags & NOTE_LOWAT) { 2164 rv = (kn->kn_data >= kn->kn_sdata); 2165 } else { 2166 rv = (kn->kn_data >= so->so_rcv.sb_lowat); 2167 } 2168 2169 return rv; 2170 } 2171 2172 void 2173 filt_sowdetach(struct knote *kn) 2174 { 2175 struct socket *so = kn->kn_fp->f_data; 2176 2177 klist_remove(&so->so_snd.sb_sel.si_note, kn); 2178 } 2179 2180 int 2181 filt_sowrite(struct knote *kn, long hint) 2182 { 2183 struct socket *so = kn->kn_fp->f_data; 2184 int rv; 2185 2186 soassertlocked(so); 2187 2188 kn->kn_data = sbspace(so, &so->so_snd); 2189 if (so->so_state & SS_CANTSENDMORE) { 2190 kn->kn_flags |= EV_EOF; 2191 if (kn->kn_flags & __EV_POLL) { 2192 if (so->so_state & SS_ISDISCONNECTED) 2193 kn->kn_flags |= __EV_HUP; 2194 } 2195 kn->kn_fflags = so->so_error; 2196 rv = 1; 2197 } else if (so->so_error) { /* temporary udp error */ 2198 rv = 1; 2199 } else if (((so->so_state & SS_ISCONNECTED) == 0) && 2200 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 2201 rv = 0; 2202 } else if (kn->kn_sfflags & NOTE_LOWAT) { 2203 rv = (kn->kn_data >= kn->kn_sdata); 2204 } else { 2205 rv = (kn->kn_data >= so->so_snd.sb_lowat); 2206 } 2207 2208 return (rv); 2209 } 2210 2211 int 2212 filt_soexcept(struct knote *kn, long hint) 2213 { 2214 struct socket *so = kn->kn_fp->f_data; 2215 int rv = 0; 2216 2217 soassertlocked(so); 2218 2219 #ifdef SOCKET_SPLICE 2220 if (isspliced(so)) { 2221 rv = 0; 2222 } else 2223 #endif /* SOCKET_SPLICE */ 2224 if (kn->kn_sfflags & NOTE_OOB) { 2225 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { 2226 kn->kn_fflags |= NOTE_OOB; 2227 kn->kn_data -= so->so_oobmark; 2228 rv = 1; 2229 } 2230 } 2231 2232 if (kn->kn_flags & __EV_POLL) { 2233 if (so->so_state & SS_ISDISCONNECTED) { 2234 kn->kn_flags |= __EV_HUP; 2235 rv = 1; 2236 } 2237 } 2238 2239 return rv; 2240 } 2241 2242 int 2243 filt_solisten(struct knote *kn, long hint) 2244 { 2245 struct socket *so = kn->kn_fp->f_data; 2246 int active; 2247 2248 soassertlocked(so); 2249 2250 kn->kn_data = so->so_qlen; 2251 active = (kn->kn_data != 0); 2252 2253 if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) { 2254 if (so->so_state & SS_ISDISCONNECTED) { 2255 kn->kn_flags |= __EV_HUP; 2256 active = 1; 2257 } else { 2258 active = soreadable(so); 2259 } 2260 } 2261 2262 return (active); 2263 } 2264 2265 int 2266 filt_somodify(struct kevent *kev, struct knote *kn) 2267 { 2268 struct socket *so = kn->kn_fp->f_data; 2269 int rv; 2270 2271 solock(so); 2272 rv = knote_modify(kev, kn); 2273 sounlock(so); 2274 2275 return (rv); 2276 } 2277 2278 int 2279 filt_soprocess(struct knote *kn, struct kevent *kev) 2280 { 2281 struct socket *so = kn->kn_fp->f_data; 2282 int rv; 2283 2284 solock(so); 2285 rv = knote_process(kn, kev); 2286 sounlock(so); 2287 2288 return (rv); 2289 } 2290 2291 void 2292 klist_soassertlk(void *arg) 2293 { 2294 struct socket *so = arg; 2295 2296 soassertlocked(so); 2297 } 2298 2299 int 2300 klist_solock(void *arg) 2301 { 2302 struct socket *so = arg; 2303 2304 solock(so); 2305 return (1); 2306 } 2307 2308 void 2309 klist_sounlock(void *arg, int ls) 2310 { 2311 struct socket *so = arg; 2312 2313 sounlock(so); 2314 } 2315 2316 const struct klistops socket_klistops = { 2317 .klo_assertlk = klist_soassertlk, 2318 .klo_lock = klist_solock, 2319 .klo_unlock = klist_sounlock, 2320 }; 2321 2322 #ifdef DDB 2323 void 2324 sobuf_print(struct sockbuf *, 2325 int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))); 2326 2327 void 2328 sobuf_print(struct sockbuf *sb, 2329 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) 2330 { 2331 (*pr)("\tsb_cc: %lu\n", sb->sb_cc); 2332 (*pr)("\tsb_datacc: %lu\n", sb->sb_datacc); 2333 (*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat); 2334 (*pr)("\tsb_wat: %lu\n", sb->sb_wat); 2335 (*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt); 2336 (*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax); 2337 (*pr)("\tsb_lowat: %ld\n", sb->sb_lowat); 2338 (*pr)("\tsb_mb: %p\n", sb->sb_mb); 2339 (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail); 2340 (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord); 2341 (*pr)("\tsb_sel: ...\n"); 2342 (*pr)("\tsb_flags: %i\n", sb->sb_flags); 2343 (*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs); 2344 } 2345 2346 void 2347 so_print(void *v, 2348 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) 2349 { 2350 struct socket *so = v; 2351 2352 (*pr)("socket %p\n", so); 2353 (*pr)("so_type: %i\n", so->so_type); 2354 (*pr)("so_options: 0x%04x\n", so->so_options); /* %b */ 2355 (*pr)("so_linger: %i\n", so->so_linger); 2356 (*pr)("so_state: 0x%04x\n", so->so_state); 2357 (*pr)("so_pcb: %p\n", so->so_pcb); 2358 (*pr)("so_proto: %p\n", so->so_proto); 2359 (*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio); 2360 2361 (*pr)("so_head: %p\n", so->so_head); 2362 (*pr)("so_onq: %p\n", so->so_onq); 2363 (*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0)); 2364 (*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q)); 2365 (*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe)); 2366 (*pr)("so_q0len: %i\n", so->so_q0len); 2367 (*pr)("so_qlen: %i\n", so->so_qlen); 2368 (*pr)("so_qlimit: %i\n", so->so_qlimit); 2369 (*pr)("so_timeo: %i\n", so->so_timeo); 2370 (*pr)("so_obmark: %lu\n", so->so_oobmark); 2371 2372 (*pr)("so_sp: %p\n", so->so_sp); 2373 if (so->so_sp != NULL) { 2374 (*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket); 2375 (*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback); 2376 (*pr)("\tssp_len: %lld\n", 2377 (unsigned long long)so->so_sp->ssp_len); 2378 (*pr)("\tssp_max: %lld\n", 2379 (unsigned long long)so->so_sp->ssp_max); 2380 (*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec, 2381 so->so_sp->ssp_idletv.tv_usec); 2382 (*pr)("\tssp_idleto: %spending (@%i)\n", 2383 timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ", 2384 so->so_sp->ssp_idleto.to_time); 2385 } 2386 2387 (*pr)("so_rcv:\n"); 2388 sobuf_print(&so->so_rcv, pr); 2389 (*pr)("so_snd:\n"); 2390 sobuf_print(&so->so_snd, pr); 2391 2392 (*pr)("so_upcall: %p so_upcallarg: %p\n", 2393 so->so_upcall, so->so_upcallarg); 2394 2395 (*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid); 2396 (*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid); 2397 (*pr)("so_cpid: %d\n", so->so_cpid); 2398 } 2399 #endif 2400