/*	$OpenBSD: uipc_socket.c,v 1.293 2022/12/12 08:30:22 tb Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>
#include <sys/refcnt.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);
int	filt_somodify(struct kevent *kev, struct knote *kn);
int	filt_soprocess(struct knote *kn, struct kevent *kev);

const struct filterops solisten_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_solisten,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(int wait)
{
	struct socket *so;

	so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
	    PR_ZERO);
	if (so == NULL)
		return (NULL);
	rw_init_flags(&so->so_lock, "solock", RWL_DUPOK);
	refcnt_init(&so->so_refcnt);

	return (so);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
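/*
 * A userland call such as socket(AF_INET, SOCK_STREAM, 0) reaches
 * socreate() below via sys_socket() with dom AF_INET, type SOCK_STREAM
 * and proto 0; with proto 0 the protocol is chosen by pffindtype().
 */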
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_usrreqs == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAIT);
	klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
	klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	solock(so);
	error = pru_attach(so, proto, M_WAIT);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, 0);
		return (error);
	}
	sounlock(so);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	soassertlocked(so);
	return pru_bind(so, nam, p);
}

int
solisten(struct socket *so, int backlog)
{
	int error;

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = pru_listen(so);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	return (0);
}
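
/*
 * Worked example of the backlog clamp above, assuming the compiled-in
 * defaults SOMAXCONN 128 and SOMINCONN 80: listen(s, 5) yields a
 * so_qlimit of 80, and listen(s, 1000) is clamped down to 128.
 */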

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int keep_lock)
{
	int persocket = solock_persocket(so);

	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		if (!keep_lock)
			sounlock(so);
		return;
	}
	if (so->so_head) {
		struct socket *head = so->so_head;

		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (so->so_onq == &head->so_q) {
			if (!keep_lock)
				sounlock(so);
			return;
		}

		if (persocket) {
			/*
			 * Concurrent close of `head' could
			 * abort `so' due to re-lock.
			 */
			soref(so);
			soref(head);
			sounlock(so);
			solock(head);
			solock(so);

			if (so->so_onq != &head->so_q0) {
				sounlock(head);
				sounlock(so);
				sorele(head);
				sorele(so);
				return;
			}

			sorele(head);
			sorele(so);
		}

		soqremque(so, 0);

		if (persocket)
			sounlock(head);
	}

	if (persocket) {
		sounlock(so);
		refcnt_finalize(&so->so_refcnt, "sofinal");
		solock(so);
	}

	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_sel.si_note);
	klist_free(&so->so_snd.sb_sel.si_note);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sounsplice(so, so->so_sp->ssp_socket, freeing);
		}
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	if (!keep_lock)
		sounlock(so);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}
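
/*
 * Example (userland sketch, not kernel code): enabling SO_LINGER so
 * that close(2) blocks for up to 5 seconds while the disconnect
 * completes:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *
 * With this set, soclose() below sleeps in "netcls" for at most
 * solinger_nsec(so) before dropping the connection.
 */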

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		error2 = pru_detach(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		int persocket = solock_persocket(so);

		if (persocket) {
			/* Wait for concurrent sonewconn() threads to finish. */
			while (so->so_newconn > 0) {
				so->so_state |= SS_NEWCONN_WAIT;
				sosleep_nsec(so, &so->so_newconn, PSOCK,
				    "netlck", INFSLP);
			}
		}

		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 0);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			if (persocket)
				solock(so2);
			(void) soqremque(so2, 1);
			if (persocket)
				sounlock(so);
			soabort(so2);
			if (persocket)
				solock(so);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, 0);
	return (error);
}

void
soabort(struct socket *so)
{
	soassertlocked(so);
	pru_abort(so);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = pru_accept(so, nam);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = pru_connect(so, nam);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int persocket, error;

	if ((persocket = solock_persocket(so1)))
		solock_pair(so1, so2);
	else
		solock(so1);

	error = pru_connect2(so1, so2);

	if (persocket)
		sounlock(so2);
	sounlock(so1);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = pru_disconnect(so);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
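/*
 * Example, assuming a datagram socket whose send buffer high water
 * mark is 9216 bytes (sosendallatonce() is true): a 10000 byte
 * sendto(2) can never fit and fails with EMSGSIZE, while a 9000 byte
 * datagram that merely finds the buffer full blocks in sbwait(), or
 * fails with EWOULDBLOCK if MSG_DONTWAIT is set.
 */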
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	solock(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock(so);
				error = m_getuio(&top, atomic, space, uio);
				solock(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			if (flags & MSG_OOB)
				error = pru_sendoob(so, top, addr, control);
			else
				error = pru_send(so, top, addr, control);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock(so);
	m_freem(top);
	m_freem(control);
	return (error);
}

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}
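
/*
 * A short walk-through of m_getuio(), with sizes that are only
 * illustrative: asked to copy a 9000 byte datagram with ample space,
 * it first tries one large cluster via MCLGETL(); if that fails it
 * falls back to a single 2k MCLBYTES cluster, and failing that to the
 * mbuf's internal storage, looping until either the data or the
 * buffer space is exhausted.
 */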

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = pru_rcvoob(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	solock_shared(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock_shared(so);
		return (error);
	}

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
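	/*
	 * Worked example, assuming sb_lowat 1 and sb_hiwat 16384 on a
	 * stream socket: a 4096 byte read with one byte queued returns
	 * that byte (count >= lowat); the same read with MSG_WAITALL
	 * blocks until all 4096 bytes have arrived (resid <= hiwat);
	 * a 65536 byte MSG_WAITALL read exceeds hiwat, is filled in
	 * sections and may therefore return short.
	 */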
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(so, &so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		if (error) {
			sounlock_shared(so);
			return (error);
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sounlock_shared(so);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					solock_shared(so);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sounlock_shared(so);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			solock_shared(so);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock_shared(so);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	sounlock_shared(so);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	int error = 0;

	solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = pru_shutdown(so);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so);

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task
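
/*
 * Example (userland sketch, not kernel code): splicing all data
 * arriving on socket s into socket t until 1 MB has been moved or the
 * connection has been idle for 5 seconds:
 *
 *	struct splice sp = {
 *		.sp_fd = t,
 *		.sp_max = 1024 * 1024,
 *		.sp_idle = { .tv_sec = 5, .tv_usec = 0 },
 *	};
 *
 *	setsockopt(s, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *
 * Passing a plain int file descriptor instead of a struct splice
 * splices without limits, and fd -1 dissolves an existing splice;
 * see the SO_SPLICE cases in sosetopt() below.
 */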

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct sosplice *sp;
	struct taskq *tq;
	int error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	}
	if (sosplice_taskq == NULL)
		return (ENOMEM);

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

 release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
 frele:
	/*
	 * FRELE() must not be called with the socket lock held.  It is safe to
	 * release the lock here as long as no other operation happens on the
	 * socket when sosplice() returns.  The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so);
}

void
sotask(void *arg)
{
	struct socket *so = arg;

	solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * after all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * A return value of 0 means splicing has finished, 1 that it should
 * continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int state;

	soassertlocked(so);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			pru_rcvd(so);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		pru_rcvd(so);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = pru_send(sosp, m, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = pru_sendoob(sosp, o, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = pru_send(sosp, m, NULL, NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

 release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}
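
/*
 * Example (userland sketch, not kernel code): a 2.5 second receive
 * timeout set with
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * is converted by sosetopt() below to nanoseconds and stored in
 * sb_timeo_nsecs; an all-zero timeval maps to INFSLP, i.e. no timeout.
 */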

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;
			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo_nsecs = nsecs;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo_nsecs = nsecs;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto->pr_ctloutput) {
			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
		}
	}

	return (error);
}
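
/*
 * Example (userland sketch, not kernel code): the byte count moved by
 * a splice can be read back through sogetopt() below:
 *
 *	off_t moved;
 *	socklen_t optlen = sizeof(moved);
 *
 *	getsockopt(s, SOL_SOCKET, SO_SPLICE, &moved, &optlen);
 *
 * The answer comes from so_sp->ssp_len, i.e. so_splicelen.
 */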

int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo_nsecs :
			    so->so_rcv.sb_timeo_nsecs);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					break;
				}
				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.sb_sel.si_note, 0);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	solock(so);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		sounlock(so);
		return (EINVAL);
	}

	klist_insert_locked(&sb->sb_sel.si_note, kn);
	sounlock(so);

	return (0);
}
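
/*
 * Example (userland sketch, not kernel code): a registration like
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * arrives at soo_kqfilter() above, which selects solisten_filtops for
 * listening sockets and soread_filtops for all others.
 */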
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_sel.si_note, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	soassertlocked(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int active;

	soassertlocked(so);

	kn->kn_data = so->so_qlen;
	active = (kn->kn_data != 0);

	if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		} else {
			active = soreadable(so);
		}
	}

	return (active);
}

int
filt_somodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	solock(so);
	rv = knote_modify(kev, kn);
	sounlock(so);

	return (rv);
}

int
filt_soprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	solock(so);
	rv = knote_process(kn, kev);
	sounlock(so);

	return (rv);
}

void
klist_soassertlk(void *arg)
{
	struct socket *so = arg;

	soassertlocked(so);
}

int
klist_solock(void *arg)
{
	struct socket *so = arg;

	solock(so);
	return (1);
}

void
klist_sounlock(void *arg, int ls)
{
	struct socket *so = arg;

	sounlock(so);
}

const struct klistops socket_klistops = {
	.klo_assertlk	= klist_soassertlk,
	.klo_lock	= klist_solock,
	.klo_unlock	= klist_sounlock,
};

#ifdef DDB
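/*
 * ddb(4) helpers: pretty-print the state of a socket and its send and
 * receive buffers from the kernel debugger.
 */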
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif