/*	$OpenBSD: uipc_socket.c,v 1.266 2021/10/22 15:11:32 mpi Exp $	*/
/*	$NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/unpcb.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <net/if.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <sys/rwlock.h>
#include <sys/time.h>

#ifdef DDB
#include <machine/db_machdep.h>
#endif

void	sbsync(struct sockbuf *, struct mbuf *);

int	sosplice(struct socket *, int, off_t, struct timeval *);
void	sounsplice(struct socket *, struct socket *, int);
void	soidle(void *);
void	sotask(void *);
void	soreaper(void *);
void	soput(void *);
int	somove(struct socket *, int);
void	sorflush(struct socket *);

void	filt_sordetach(struct knote *kn);
int	filt_soread(struct knote *kn, long hint);
int	filt_soreadmodify(struct kevent *kev, struct knote *kn);
int	filt_soreadprocess(struct knote *kn, struct kevent *kev);
int	filt_soread_common(struct knote *kn, struct socket *so);
void	filt_sowdetach(struct knote *kn);
int	filt_sowrite(struct knote *kn, long hint);
int	filt_sowritemodify(struct kevent *kev, struct knote *kn);
int	filt_sowriteprocess(struct knote *kn, struct kevent *kev);
int	filt_sowrite_common(struct knote *kn, struct socket *so);
int	filt_soexcept(struct knote *kn, long hint);
int	filt_soexceptmodify(struct kevent *kev, struct knote *kn);
int	filt_soexceptprocess(struct knote *kn, struct kevent *kev);
int	filt_soexcept_common(struct knote *kn, struct socket *so);
int	filt_solisten(struct knote *kn, long hint);
int	filt_solistenmodify(struct kevent *kev, struct knote *kn);
int	filt_solistenprocess(struct knote *kn, struct kevent *kev);
int	filt_solisten_common(struct knote *kn, struct socket *so);

const struct filterops solisten_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_solisten,
	.f_modify	= filt_solistenmodify,
	.f_process	= filt_solistenprocess,
};

const struct filterops soread_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soread,
	.f_modify	= filt_soreadmodify,
	.f_process	= filt_soreadprocess,
};

const struct filterops sowrite_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sowdetach,
	.f_event	= filt_sowrite,
	.f_modify	= filt_sowritemodify,
	.f_process	= filt_sowriteprocess,
};

const struct filterops soexcept_filtops = {
	.f_flags	= FILTEROP_ISFD,
	.f_attach	= NULL,
	.f_detach	= filt_sordetach,
	.f_event	= filt_soexcept,
	.f_modify	= filt_soexceptmodify,
	.f_process	= filt_soexceptprocess,
};

#ifndef SOMINCONN
#define SOMINCONN 80
#endif /* SOMINCONN */

int	somaxconn = SOMAXCONN;
int	sominconn = SOMINCONN;

struct pool socket_pool;
#ifdef SOCKET_SPLICE
struct pool sosplice_pool;
struct taskq *sosplice_taskq;
struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
#endif
void
soinit(void)
{
	pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
	    "sockpl", NULL);
#ifdef SOCKET_SPLICE
	pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
	    "sosppl", NULL);
#endif
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = curproc;		/* XXX */
	const struct protosw *prp;
	struct socket *so;
	int error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL || prp->pr_attach == NULL)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO);
	rw_init(&so->so_lock, "solock");
	sigio_init(&so->so_sigio);
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	if (suser(p) == 0)
		so->so_state = SS_PRIV;
	so->so_ruid = p->p_ucred->cr_ruid;
	so->so_euid = p->p_ucred->cr_uid;
	so->so_rgid = p->p_ucred->cr_rgid;
	so->so_egid = p->p_ucred->cr_gid;
	so->so_cpid = p->p_p->ps_pid;
	so->so_proto = prp;
	so->so_snd.sb_timeo_nsecs = INFSLP;
	so->so_rcv.sb_timeo_nsecs = INFSLP;

	s = solock(so);
	error = (*prp->pr_attach)(so, proto);
	if (error) {
		so->so_state |= SS_NOFDREF;
		/* sofree() calls sounlock(). */
		sofree(so, s);
		return (error);
	}
	sounlock(so, s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int error;

	soassertlocked(so);

	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int error;

	soassertlocked(so);

	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
		return (EINVAL);
#ifdef SOCKET_SPLICE
	if (isspliced(so) || issplicedback(so))
		return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
	    curproc);
	if (error)
		return (error);
	if (TAILQ_FIRST(&so->so_q) == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	if (backlog < sominconn)
		backlog = sominconn;
	so->so_qlimit = backlog;
	return (0);
}
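/*
 * Illustrative effect of the backlog clamping in solisten() above;
 * a sketch, not part of the original sources, assuming the compile-time
 * defaults SOMAXCONN (128) and SOMINCONN (80):
 *
 *	listen(s, -1);		backlog < 0	-> so_qlimit = somaxconn
 *	listen(s, 5);		5 < sominconn	-> so_qlimit = sominconn
 *	listen(s, 10000);	> somaxconn	-> so_qlimit = somaxconn
 */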
#define SOSP_FREEING_READ	1
#define SOSP_FREEING_WRITE	2
void
sofree(struct socket *so, int s)
{
	soassertlocked(so);

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		sounlock(so, s);
		return;
	}
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0)) {
			sounlock(so, s);
			return;
		}
	}
	sigio_free(&so->so_sigio);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		if (issplicedback(so)) {
			int freeing = SOSP_FREEING_WRITE;

			if (so->so_sp->ssp_soback == so)
				freeing |= SOSP_FREEING_READ;
			sounsplice(so->so_sp->ssp_soback, so, freeing);
		}
		if (isspliced(so)) {
			int freeing = SOSP_FREEING_READ;

			if (so == so->so_sp->ssp_socket)
				freeing |= SOSP_FREEING_WRITE;
			sounsplice(so, so->so_sp->ssp_socket, freeing);
		}
	}
#endif /* SOCKET_SPLICE */
	sbrelease(so, &so->so_snd);
	sorflush(so);
	sounlock(so, s);
#ifdef SOCKET_SPLICE
	if (so->so_sp) {
		/* Reuse splice idle, sounsplice() has been called before. */
		timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
		timeout_add(&so->so_sp->ssp_idleto, 0);
	} else
#endif /* SOCKET_SPLICE */
	{
		pool_put(&socket_pool, so);
	}
}

static inline uint64_t
solinger_nsec(struct socket *so)
{
	if (so->so_linger == 0)
		return INFSLP;

	return SEC_TO_NSEC(so->so_linger);
}
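/*
 * Userland sketch (hypothetical descriptor "s") of how solinger_nsec()
 * and the linger loop in soclose() below come into play; a close(2) on
 * a lingering socket may sleep in "netcls" for up to l_linger seconds
 * while the disconnect completes:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(s);
 */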
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int flags)
{
	struct socket *so2;
	int s, error = 0;

	s = solock(so);
	/* Revoke async IO early. There is a final revocation in sofree(). */
	sigio_free(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (flags & MSG_DONTWAIT))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sosleep_nsec(so, &so->so_timeo,
				    PSOCK | PCATCH, "netcls",
				    solinger_nsec(so));
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;
		KASSERT(so->so_proto->pr_detach);
		error2 = (*so->so_proto->pr_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state |= SS_NOFDREF;
	/* sofree() calls sounlock(). */
	sofree(so, s);
	return (error);
}

int
soabort(struct socket *so)
{
	soassertlocked(so);

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
	    curproc);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error = 0;

	soassertlocked(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
		    nam, NULL, curproc);
	else
		error = ECONNABORTED;
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	int error;

	soassertlocked(so);

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, curproc);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s, error;

	s = solock(so1);
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
	    (struct mbuf *)so2, NULL, curproc);
	sounlock(so1, s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	soassertlocked(so);

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
	    NULL, curproc);
	return (error);
}
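/*
 * Userland sketch of the disconnect-by-reconnect path described in
 * soconnect() above: on a connected datagram socket, connecting to an
 * invalid (null) address takes the sodisconnect() branch and dissolves
 * the association instead of failing with EISCONN.  Exact behaviour
 * depends on the protocol; this is only an illustration:
 *
 *	struct sockaddr sa;
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_family = AF_UNSPEC;
 *	connect(s, &sa, sizeof(sa));
 */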
int m_getuio(struct mbuf **, int, long, struct uio *);

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	long space, clen = 0;
	size_t resid;
	int error, s;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* MSG_EOR on a SOCK_STREAM socket is invalid. */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		m_freem(top);
		m_freem(control);
		return (EINVAL);
	}
	if (uio && uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgsnd++;
	if (control) {
		/*
		 * In theory clen should be unsigned (since control->m_len is).
		 * However, space must be signed, as it might be less than 0
		 * if we over-committed, and we must use a signed comparison
		 * of space and clen.
		 */
		clen = control->m_len;
		/* reserve extra space for AF_UNIX's internalize */
		if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
		    clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
		    mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
			clen = CMSG_SPACE(
			    (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
			    (sizeof(struct fdpass) / sizeof(int)));
	}

#define	snderr(errno)	{ error = errno; goto release; }

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	so->so_state |= SS_ISSENDING;
	do {
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			snderr(error);
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (!(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(so, &so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
			if (atomic && resid > so->so_snd.sb_hiwat)
				snderr(EMSGSIZE);
		} else {
			if (clen > so->so_snd.sb_hiwat ||
			    (atomic && resid > so->so_snd.sb_hiwat - clen))
				snderr(EMSGSIZE);
		}
		if (space < clen ||
		    (space - clen < resid &&
		    (atomic || space < so->so_snd.sb_lowat))) {
			if (flags & MSG_DONTWAIT)
				snderr(EWOULDBLOCK);
			sbunlock(so, &so->so_snd);
			error = sbwait(so, &so->so_snd);
			so->so_state &= ~SS_ISSENDING;
			if (error)
				goto out;
			goto restart;
		}
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				sounlock(so, s);
				error = m_getuio(&top, atomic, space, uio);
				s = solock(so);
				if (error)
					goto release;
				space -= top->m_pkthdr.len;
				resid = uio->uio_resid;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			}
			if (resid == 0)
				so->so_state &= ~SS_ISSENDING;
			if (top && so->so_options & SO_ZEROIZE)
				top->m_flags |= M_ZEROIZE;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curproc);
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	so->so_state &= ~SS_ISSENDING;
	sbunlock(so, &so->so_snd);
out:
	sounlock(so, s);
	m_freem(top);
	m_freem(control);
	return (error);
}
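/*
 * Illustrative consequence of the atomic-send checks in sosend() above:
 * on a datagram socket the whole message must fit into the send buffer
 * high-water mark, so an oversized datagram fails outright rather than
 * blocking (a sketch, assuming the typical udp_sendspace of 9216 bytes):
 *
 *	sendto(s, buf, 65000, 0, (struct sockaddr *)&sin, sizeof(sin));
 *
 * returns -1 with errno set to EMSGSIZE.
 */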
int
m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
{
	struct mbuf *m, *top = NULL;
	struct mbuf **nextp = &top;
	u_long len, mlen;
	size_t resid = uio->uio_resid;
	int error;

	do {
		if (top == NULL) {
			MGETHDR(m, M_WAIT, MT_DATA);
			mlen = MHLEN;
			m->m_pkthdr.len = 0;
			m->m_pkthdr.ph_ifidx = 0;
		} else {
			MGET(m, M_WAIT, MT_DATA);
			mlen = MLEN;
		}
		/* chain mbuf together */
		*nextp = m;
		nextp = &m->m_next;

		resid = ulmin(resid, space);
		if (resid >= MINCLSIZE) {
			MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
			if ((m->m_flags & M_EXT) == 0)
				MCLGETL(m, M_NOWAIT, MCLBYTES);
			if ((m->m_flags & M_EXT) == 0)
				goto nopages;
			mlen = m->m_ext.ext_size;
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m->m_data += max_hdr;
		} else {
nopages:
			len = ulmin(mlen, resid);
			/*
			 * For datagram protocols, leave room
			 * for protocol headers in first mbuf.
			 */
			if (atomic && m == top && len < mlen - max_hdr)
				m_align(m, len);
		}

		error = uiomove(mtod(m, caddr_t), len, uio);
		if (error) {
			m_freem(top);
			return (error);
		}

		/* adjust counters */
		resid = uio->uio_resid;
		space -= len;
		m->m_len = len;
		top->m_pkthdr.len += len;

		/* Is there more space and more data? */
	} while (space > 0 && resid > 0);

	*mp = top;
	return 0;
}

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
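/*
 * Illustrative sockbuf layout assumed by soreceive() below for a
 * PR_ADDR protocol; m_next links run within a record while m_nextpkt
 * links chain the records (a sketch, field names as in struct sockbuf):
 *
 *	sb_mb -> MT_SONAME -> MT_CONTROL -> MT_DATA -> MT_DATA
 *	            |
 *	            v m_nextpkt
 *	         MT_SONAME -> MT_DATA		(next record)
 */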
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking the network for the entire time here, we
 * release the solock() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
    socklen_t controllen)
{
	struct mbuf *m, **mp;
	struct mbuf *cm;
	u_long len, offset, moff;
	int flags, error, s, type, uio_error = 0;
	const struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	size_t resid, orig_resid = uio->uio_resid;

	mp = mp0;
	if (paddr)
		*paddr = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		s = solock(so);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
		sounlock(so, s);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    ulmin(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		m_freem(m);
		return (error);
	}
	if (mp)
		*mp = NULL;

	s = solock(so);
restart:
	if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so, s);
		return (error);
	}

	m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
	if (isspliced(so))
		m = NULL;
#endif /* SOCKET_SPLICE */
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
		    if (!isspliced(so))
#endif /* SOCKET_SPLICE */
			panic("receive 1: so %p, so_type %d, sb_cc %lu",
			    so, so->so_type, so->so_rcv.sb_cc);
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else if (so->so_rcv.sb_cc == 0)
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0 && controlp == NULL)
			goto release;
		if (flags & MSG_DONTWAIT) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(so, &so->so_rcv);
		error = sbwait(so, &so->so_rcv);
		if (error) {
			sounlock(so, s);
			return (error);
		}
		goto restart;
	}
dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before operations that
	 * may sleep, and re-reading them afterwards.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 */
	if (uio->uio_procp)
		uio->uio_procp->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				so->so_rcv.sb_mb = m_free(m);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		int skip = 0;
		if (flags & MSG_PEEK) {
			if (mtod(m, struct cmsghdr *)->cmsg_type ==
			    SCM_RIGHTS) {
				/* don't leak internalized SCM_RIGHTS msgs */
				skip = 1;
			} else if (controlp)
				*controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
			m = m->m_next;
		} else {
			sbfree(so, &so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_nextpkt = m->m_next = NULL;
			cm = m;
			m = so->so_rcv.sb_mb;
			sbsync(&so->so_rcv, nextrecord);
			if (controlp) {
				if (pr->pr_domain->dom_externalize) {
					sounlock(so, s);
					error =
					    (*pr->pr_domain->dom_externalize)
					    (cm, controllen, flags);
					s = solock(so);
				}
				*controlp = cm;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose)
					pr->pr_domain->dom_dispose(cm);
				m_free(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		if (controlp && !skip)
			controlp = &(*controlp)->m_next;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
		if (m->m_flags & M_BCAST)
			flags |= MSG_BCAST;
		if (m->m_flags & M_MCAST)
			flags |= MSG_MCAST;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			/*
			 * If there is more than one control message in the
			 * stream, we do a short read.  The next one can be
			 * received or disposed of by another system call.
			 */
			break;
#ifdef DIAGNOSTIC
		} else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("receive 3: so %p, so_type %d, m %p, m_type %d",
			    so, so->so_type, m, m->m_type);
#endif
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL && uio_error == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			resid = uio->uio_resid;
			sounlock(so, s);
			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			s = solock(so);
			if (uio_error)
				uio->uio_resid = resid - len;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
				orig_resid = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(so, &so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
				orig_resid = 0;
			} else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
				so->so_rcv.sb_datacc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(so, &so->so_rcv);
			if (error) {
				sbunlock(so, &so->so_rcv);
				sounlock(so, s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(so, &so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, curproc);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(so, &so->so_rcv);
		goto restart;
	}

	if (uio_error)
		error = uio_error;

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(so, &so->so_rcv);
	sounlock(so, s);
	return (error);
}
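/*
 * Userland sketch of the MSG_WAITALL handling in soreceive() above: on
 * a stream socket the call keeps blocking in sbwait() until the full
 * amount has arrived:
 *
 *	n = recv(s, buf, 512, MSG_WAITALL);
 *
 * Here n is 512 unless EOF, an error or a signal intervened, in which
 * case a short count is returned as the comment in the loop describes.
 */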
int
soshutdown(struct socket *so, int how)
{
	const struct protosw *pr = so->so_proto;
	int s, error = 0;

	s = solock(so);
	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
		    curproc);
		break;
	default:
		error = EINVAL;
		break;
	}
	sounlock(so, s);

	return (error);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct mbuf *m;
	const struct protosw *pr = so->so_proto;
	int error;

	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	m = sb->sb_mb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	sb->sb_timeo_nsecs = INFSLP;
	sbunlock(so, sb);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(m);
	m_purge(m);
}
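/*
 * Userland sketch of the SHUT_RD path through soshutdown()/sorflush()
 * above: pending receive data is discarded and the socket is marked so
 * that subsequent reads return end of file:
 *
 *	shutdown(s, SHUT_RD);
 *	n = read(s, buf, sizeof(buf));
 *
 * Here n is 0 even if data was queued before the shutdown(2) call.
 */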
#ifdef SOCKET_SPLICE

#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task

int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct sosplice *sp;
	struct taskq *tq;
	int error = 0;

	soassertlocked(so);

	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	}
	if (sosplice_taskq == NULL)
		return (ENOMEM);

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (ENOTCONN);
	if (so->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (so->so_sp == NULL)
			so->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		/* Lock receive buffer. */
		if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
			return (error);
		}
		if (so->so_sp->ssp_socket)
			sounsplice(so, so->so_sp->ssp_socket, 0);
		sbunlock(so, &so->so_rcv);
		return (0);
	}

	if (max && max < 0)
		return (EINVAL);

	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;
	if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
		error = EPROTONOSUPPORT;
		goto frele;
	}
	if (sosp->so_sp == NULL) {
		sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		if (sosp->so_sp == NULL)
			sosp->so_sp = sp;
		else
			pool_put(&sosplice_pool, sp);
	}

	/* Lock both receive and send buffer. */
	if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
		goto frele;
	}
	if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
		sbunlock(so, &so->so_rcv);
		goto frele;
	}

	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}
	if (sosp->so_options & SO_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto release;
	}
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}

	/* Splice so and sosp together. */
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);
	timeout_set_proc(&so->so_idleto, soidle, so);
	task_set(&so->so_splicetask, sotask, so);

	/*
	 * To prevent softnet interrupt from calling somove() while
	 * we sleep, the socket buffers are not marked as spliced yet.
	 */
	if (somove(so, M_WAIT)) {
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
	}

release:
	sbunlock(sosp, &sosp->so_snd);
	sbunlock(so, &so->so_rcv);
frele:
	/*
	 * FRELE() must not be called with the socket lock held.  It is safe to
	 * release the lock here as long as no other operation happens on the
	 * socket when sosplice() returns.  The dance could be avoided by
	 * grabbing the socket lock inside this function.
	 */
	sounlock(so, SL_LOCKED);
	FRELE(fp, curproc);
	solock(so);
	return (error);
}
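/*
 * Userland sketch (hypothetical descriptors "from" and "to") of how
 * sosplice() above is reached via the SO_SPLICE case in sosetopt():
 * splice at most 1 MB from "from" into "to" and dissolve the splice
 * after 5 idle seconds:
 *
 *	struct splice sp;
 *	memset(&sp, 0, sizeof(sp));
 *	sp.sp_fd = to;
 *	sp.sp_max = 1024 * 1024;
 *	sp.sp_idle.tv_sec = 5;
 *	setsockopt(from, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *
 * Passing a negative descriptor unsplices, as handled by the fd < 0
 * branch above.
 */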
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
	soassertlocked(so);

	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	/* Do not wakeup a socket that is about to be freed. */
	if ((freeing & SOSP_FREEING_READ) == 0 && soreadable(so))
		sorwakeup(so);
	if ((freeing & SOSP_FREEING_WRITE) == 0 && sowriteable(sosp))
		sowwakeup(sosp);
}

void
soidle(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		so->so_error = ETIMEDOUT;
		sounsplice(so, so->so_sp->ssp_socket, 0);
	}
	sounlock(so, s);
}

void
sotask(void *arg)
{
	struct socket *so = arg;
	int s;

	s = solock(so);
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * We may not sleep here as sofree() and unsplice() may be
		 * called from softnet interrupt context.  This would remove
		 * the socket during somove().
		 */
		somove(so, M_DONTWAIT);
	}
	sounlock(so, s);

	/* Avoid user land starvation. */
	yield();
}

/*
 * The socket splicing task or idle timeout may sleep while grabbing the net
 * lock.  As sofree() can be called anytime, sotask() or soidle() could access
 * the socket memory of a freed socket after wakeup.  So delay the pool_put()
 * after all pending socket splicing tasks or timeouts have finished.  Do this
 * by scheduling it on the same threads.
 */
void
soreaper(void *arg)
{
	struct socket *so = arg;

	/* Reuse splice task, sounsplice() has been called before. */
	task_set(&so->so_sp->ssp_task, soput, so);
	task_add(sosplice_taskq, &so->so_sp->ssp_task);
}

void
soput(void *arg)
{
	struct socket *so = arg;

	pool_put(&sosplice_pool, so->so_sp);
	pool_put(&socket_pool, so);
}
/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * A return value of 0 means splicing has finished, 1 means continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int state;

	soassertlocked(so);

nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		sbdroprecord(so, &so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
			(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
			    NULL, NULL, NULL);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(so, &so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");
	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(so, &so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    NULL, NULL, NULL);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL, NULL,
	    NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 0);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
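/*
 * Userland sketch: the byte count that somove() above accumulates in
 * so_splicelen can be read back as an off_t with getsockopt(2), served
 * by the SO_SPLICE case in sogetopt() below:
 *
 *	off_t moved;
 *	socklen_t olen = sizeof(moved);
 *	getsockopt(from, SOL_SOCKET, SO_SPLICE, &moved, &olen);
 */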
#endif /* SOCKET_SPLICE */

void
sorwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}

void
sowwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_snd.sb_flags & SB_SPLICE)
		task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
	if (issplicedback(so))
		return;
#endif
	sowakeup(so, &so->so_snd);
}
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			if (nsecs == 0)
				nsecs = INFSLP;
			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo_nsecs = nsecs;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo_nsecs = nsecs;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto->pr_ctloutput) {
			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
		}
	}

	return (error);
}
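/*
 * Userland sketch of the timeout conversion in sosetopt() above: the
 * struct timeval is converted to nanoseconds and stored in
 * sb_timeo_nsecs, with zero meaning "no timeout" (INFSLP):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * This stores 2500000000 in so->so_rcv.sb_timeo_nsecs.
 */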
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			if (error)
				return (error);
			return (0);
		} else
			return (ENOPROTOOPT);
	} else {
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			uint64_t nsecs = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo_nsecs :
			    so->so_rcv.sb_timeo_nsecs);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			m->m_len = sizeof(off_t);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					break;
				}
				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeup(&so->so_rcv.sb_sel);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	KERNEL_ASSERT_LOCKED();

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert_locked(&sb->sb_sel.si_note, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	KERNEL_ASSERT_LOCKED();

	klist_remove_locked(&so->so_rcv.sb_sel.si_note, kn);
}
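/*
 * Userland sketch of a read filter with a low-water mark, evaluated by
 * filt_soread_common() below via the NOTE_LOWAT branch (hypothetical
 * descriptors "kq" and "s"):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * The event fires once at least 128 bytes are queued (kn_data >= 128),
 * or earlier on EOF or a socket error.
 */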
void
sohasoutofband(struct socket *so)
{
	pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeup(&so->so_rcv.sb_sel);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	KERNEL_ASSERT_LOCKED();

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		sb = &so->so_rcv;
		break;
	default:
		return (EINVAL);
	}

	klist_insert_locked(&sb->sb_sel.si_note, kn);

	return (0);
}

void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	KERNEL_ASSERT_LOCKED();

	klist_remove_locked(&so->so_rcv.sb_sel.si_note, kn);
}

int
filt_soread_common(struct knote *kn, struct socket *so)
{
	int rv = 0;

	soassertlocked(so);

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}

int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_soread_common(kn, so));
}

int
filt_soreadmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_soread_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_soreadprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_soread_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}
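
/*
 * Example (illustrative sketch, not upstream code): the read-side
 * filter above is reached from a kevent(2) registration such as
 *
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * with "s" and "kq" assumed descriptors from socket(2) and
 * kqueue(2).  soo_kqfilter() selects soread_filtops (or
 * solisten_filtops when SO_ACCEPTCONN is set) and inserts the knote
 * into the so_rcv klist; filt_soread_common() then reports the
 * socket readable once sb_cc reaches sb_lowat, or kn_sdata when
 * NOTE_LOWAT was requested.
 */
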
void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	KERNEL_ASSERT_LOCKED();

	klist_remove_locked(&so->so_snd.sb_sel.si_note, kn);
}

int
filt_sowrite_common(struct knote *kn, struct socket *so)
{
	int rv;

	soassertlocked(so);

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}

int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_sowrite_common(kn, so));
}

int
filt_sowritemodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_sowrite_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_sowriteprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_sowrite_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}

int
filt_soexcept_common(struct knote *kn, struct socket *so)
{
	int rv = 0;

	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			rv = 1;
		}
	} else if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		if (kn->kn_flags & __EV_POLL) {
			if (so->so_state & SS_ISDISCONNECTED)
				kn->kn_flags |= __EV_HUP;
		}
		kn->kn_fflags = so->so_error;
		rv = 1;
	}

	return rv;
}

int
filt_soexcept(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_soexcept_common(kn, so));
}

int
filt_soexceptmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_soexcept_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_soexceptprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_soexcept_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}

int
filt_solisten_common(struct knote *kn, struct socket *so)
{
	soassertlocked(so);

	kn->kn_data = so->so_qlen;

	return (kn->kn_data != 0);
}

int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	return (filt_solisten_common(kn, so));
}

int
filt_solistenmodify(struct kevent *kev, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	knote_modify(kev, kn);
	rv = filt_solisten_common(kn, so);
	sounlock(so, s);

	return (rv);
}

int
filt_solistenprocess(struct knote *kn, struct kevent *kev)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv, s;

	s = solock(so);
	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
		rv = 1;
	else
		rv = filt_solisten_common(kn, so);
	if (rv != 0)
		knote_submit(kn, kev);
	sounlock(so, s);

	return (rv);
}
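
/*
 * Note: each filter above comes in three flavors.  f_event
 * (filt_soread() and friends) runs with the socket lock already
 * held by the caller, while the f_modify and f_process hooks take
 * solock() themselves before evaluating the shared *_common()
 * predicate.  f_process additionally reports EV_ONESHOT knotes as
 * active without re-evaluating them, so a one-shot event that has
 * already fired is still delivered.
 */
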
#ifdef DDB
void
sobuf_print(struct sockbuf *,
    int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));

void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
	(*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
}

void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options);	/* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);
	(*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);

	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_oobmark: %lu\n", so->so_oobmark);

	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
#endif
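
/*
 * Example (illustrative, assuming the ddb(4) "show socket" command
 * is wired to so_print()): from the kernel debugger,
 *
 *	ddb> show socket 0xffffff0012345678
 *
 * dumps the socket and both sockbufs in the format printed above;
 * the address operand is hypothetical.
 */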