/*	$OpenBSD: uipc_socket.c,v 1.276 2022/05/06 13:09:41 visa Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <net/if.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_soexcept(struct knote *kn, long hint);
int	filt_solisten(struct knote *kn, long hint);
int	filt_somodify(struct kevent *kev, struct knote *kn);
int	filt_soprocess(struct knote *kn, struct kevent *kev);

const struct filterops solisten_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_solisten,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_somodify,
	.f_process	= filt_soprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif

void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

struct socket *
soalloc(int prflags)
{
	struct socket *so;

	so = pool_get(&socket_pool, prflags);
	if (so == NULL)
		return (NULL);
	rw_init(&so->so_lock, "solock");
	return (so);
}
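
/*
 * The filterops defined above back the kqueue(2) interface for sockets;
 * soo_kqfilter() below picks soread_filtops or solisten_filtops depending
 * on the socket state.  As an illustration only (userland sketch, not
 * kernel code), a read filter is typically registered like this:
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */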

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_attach == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(PR_WAITOK | PR_ZERO);
	klist_init(&so->so_rcv.sb_sel.si_note, &socket_klistops, so);
	klist_init(&so->so_snd.sb_sel.si_note, &socket_klistops, so);
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	s = solock(so);
	error = (*prp->pr_attach)(so, proto);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, s);
		return (error);
	}
	sounlock(so, s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int error;

	soassertlocked(so);

	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int error;

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
	    curproc);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	return (0);
}
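
/*
 * In-kernel consumers drive the routines above directly.  A minimal
 * sketch, for illustration only and assuming the caller checks errors;
 * note that solisten() expects the socket lock to be held while
 * socreate() takes and releases it internally:
 *
 *	struct socket *so;
 *	int error, s;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0);
 *	if (error == 0) {
 *		s = solock(so);
 *		error = solisten(so, 5);
 *		sounlock(so, s);
 *	}
 */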

#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int s)
{
	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		sounlock(so, s);
		return;
	}
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0)) {
			sounlock(so, s);
			return;
		}
	}
	sigio_free(&so->so_sigio);
	klist_free(&so->so_rcv.sb_sel.si_note);
	klist_free(&so->so_snd.sb_sel.si_note);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sounsplice(so, so->so_sp->ssp_socket, freeing);
		}
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	sounlock(so, s);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int s, error = 0;

	s = solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if (so->so_pcb == NULL)
			goto discard;
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		KASSERT(so->so_proto->pr_detach);
		error2 = (*so->so_proto->pr_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, s);
	return (error);
}
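
/*
 * The SO_LINGER handling above bounds the "netcls" sleep by so_linger
 * seconds (see solinger_nsec(); a value of 0 means wait forever).  A
 * userland sketch of arming it, for illustration only:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(sock_fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(sock_fd);		(blocks up to 5 seconds while draining)
 */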

int
soabort(struct socket *so)
{
	soassertlocked(so);

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
	    curproc);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
		    nam, NULL, curproc);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, curproc);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s, error;

	s = solock(so1);
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
	    (struct mbuf *)so2, NULL, curproc);
	sounlock(so1, s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
	    NULL, curproc);
	return (error);
}

int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error, s;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock(so, s);
				error = m_getuio(&top, atomic, space, uio);
				s = solock(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curproc);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock(so, s);
	m_freem(top);
	m_freem(control);
	return (error);
}
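
/*
 * The AF_UNIX clause above grows clen because unp_internalize() expands
 * each descriptor in an SCM_RIGHTS message into a larger struct fdpass.
 * For reference, a userland sketch of sending one descriptor (standard
 * CMSG usage, illustration only):
 *
 *	struct msghdr msg;
 *	struct cmsghdr *cmsg;
 *	union {
 *		struct cmsghdr hdr;
 *		unsigned char buf[CMSG_SPACE(sizeof(int))];
 *	} cmsgbuf;
 *	char c = 0;
 *	struct iovec iov = { &c, 1 };
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cmsgbuf.buf;
 *	msg.msg_controllen = sizeof(cmsgbuf.buf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	*(int *)CMSG_DATA(cmsg) = fd_to_pass;
 *	sendmsg(sock_fd, &msg, 0);
 */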

int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network for the entire time here, we release
 * the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, s, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		s = solock(so);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
		sounlock(so, s);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so, s);
		return (error);
	}

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(so, &so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		if (error) {
			sounlock(so, s);
			return (error);
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sounlock(so, s);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					s = solock(so);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  Next can be received
			 * or disposed by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sounlock(so, s);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			s = solock(so);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock(so, s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, curproc);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	sounlock(so, s);
	return (error);
}
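
/*
 * From userland, the MSG_PEEK and MSG_WAITALL paths above are exercised
 * through recv(2).  A sketch, for illustration only: the first call
 * leaves the data queued, the second blocks until the full count has
 * arrived unless EOF, a signal or a timeout causes the short-count
 * return seen above.
 *
 *	char buf[512];
 *
 *	(void)recv(sock_fd, buf, sizeof(buf), MSG_PEEK);
 *	(void)recv(sock_fd, buf, sizeof(buf), MSG_WAITALL);
 */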

int
soshutdown(struct socket *so, int how)
{
	const struct protosw *pr = so->so_proto;
	int s, error = 0;

	s = solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
		    curproc);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so, s);

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}

#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct sosplice *sp;
	struct taskq *tq;
	int error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	}
	if (sosplice_taskq == NULL)
		return (ENOMEM);

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket into which data will be spliced. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
frele:
	/*
	 * FRELE() must not be called with the socket lock held.  It is safe to
	 * release the lock here as long as no other operation happens on the
	 * socket when sosplice() returns.  The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so, SL_LOCKED);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}
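
/*
 * Userland arms socket splicing via setsockopt(2) with SO_SPLICE (see
 * sosetopt() below).  A sketch, for illustration only: splice at most
 * 1 MB from source_fd to drain_fd, with a 30 second idle timeout.
 *
 *	struct splice sp = {
 *		.sp_fd = drain_fd,
 *		.sp_max = 1024 * 1024,
 *		.sp_idle = { .tv_sec = 30, .tv_usec = 0 },
 *	};
 *
 *	setsockopt(source_fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 */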

void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so, s);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so, s);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * until all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}

/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int state;

	soassertlocked(so);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
			(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
			    NULL, NULL, NULL);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    NULL, NULL, NULL);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL, NULL,
	    NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}

#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;
			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo_nsecs = nsecs;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo_nsecs = nsecs;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto->pr_ctloutput) {
			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
		}
	}

	return (error);
}
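
/*
 * The SO_SNDTIMEO/SO_RCVTIMEO conversion above turns a struct timeval
 * into nanoseconds (0 meaning "no timeout", stored as INFSLP).  A
 * userland sketch, for illustration only: make reads fail with
 * EWOULDBLOCK after 2.5 seconds without data.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(sock_fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */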

int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			if (error)
				return (error);
			return (0);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo_nsecs :
			    so->so_rcv.sb_timeo_nsecs);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					break;
				}
				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.sb_sel.si_note, 0);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;
	int s;

	s = solock(so);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		sounlock(so, s);
		return (EINVAL);
	}

	klist_insert_locked(&sb->sb_sel.si_note, kn);
	sounlock(so, s);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	klist_remove(&so->so_snd.sb_sel.si_note, kn);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	soassertlocked(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv = 0;

	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	}

	if (kn->kn_flags & __EV_POLL) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			rv = 1;
		}
	}

	return rv;
}

int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int active;

	soassertlocked(so);

	kn->kn_data = so->so_qlen;
	active = (kn->kn_data != 0);

	if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
		if (so->so_state & SS_ISDISCONNECTED) {
			kn->kn_flags |= __EV_HUP;
			active = 1;
		} else {
			active = soreadable(so);
		}
	}

	return (active);
}

int
filt_somodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	rv = knote_modify(kev, kn);
	sounlock(so, s);

	return (rv);
}

int
filt_soprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	rv = knote_process(kn, kev);
	sounlock(so, s);

	return (rv);
}

void
klist_soassertlk(void *arg)
{
	struct socket *so = arg;

	soassertlocked(so);
}

int
klist_solock(void *arg)
{
	struct socket *so = arg;

	return (solock(so));
}

void
klist_sounlock(void *arg, int ls)
{
	struct socket *so = arg;

	sounlock(so, ls);
}

const struct klistops socket_klistops = {
	.klo_assertlk	= klist_soassertlk,
	.klo_lock	= klist_solock,
	.klo_unlock	= klist_sounlock,
};
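
/*
 * Glue between the socket lock and the generic knote list code:
 * kern_event.c calls back through these operations to assert, take
 * and release the lock of the socket that owns a klist.  A socket
 * buffer's klist is bound to its socket with, e.g. (illustrative):
 *
 *	klist_init(&sb->sb_sel.si_note, &socket_klistops, so);
 */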

#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
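
/*
 * so_print() backs ddb(4)'s "show socket" command, so the state of a
 * wedged socket can be inspected from the kernel debugger, e.g.:
 *
 *	ddb> show socket <addr>
 */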