/*	$NetBSD: uipc_socket.c,v 1.302 2022/04/09 23:52:22 riastradh Exp $	*/

/*
 * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

/*
 * Socket operation routines.
 *
 * These routines are called by the routines in sys_socket.c or from a
 * system process, and implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.302 2022/04/09 23:52:22 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"
#include "opt_multiprocessor.h"	/* XXX */
#include "opt_sctp.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/uidinfo.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/kthread.h>
#include <sys/compat_stub.h>

#include <compat/sys/time.h>
#include <compat/sys/socket.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_loan.h>
#include <uvm/uvm_page.h>

#ifdef SCTP
#include <netinet/sctp_route.h>
#endif

MALLOC_DEFINE(M_SONAME, "soname", "socket name");

extern const struct fileops socketops;

static int	sooptions;
extern int	somaxconn;	/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;
kmutex_t	*softnet_lock;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)	(ev)->ev_count++

EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else

#define	SOSEND_COUNTER_INCR(ev)	/* nothing */

#endif /* SOSEND_COUNTERS */

#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif

static kmutex_t so_pendfree_lock;
static struct mbuf *so_pendfree = NULL;

#ifndef SOMAXKVA
#define	SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static kcondvar_t socurkva_cv;

#ifndef SOFIXEDBUF
#define	SOFIXEDBUF true
#endif
bool sofixedbuf = SOFIXEDBUF;

static kauth_listener_t socket_listener;

#define	SOCK_LOAN_CHUNK	65536

static void sopendfree_thread(void *);
static kcondvar_t pendfree_thread_cv;
static lwp_t *sopendfree_lwp;

static void sysctl_kern_socket_setup(void);
static struct sysctllog *socket_sysctllog;

static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
	int error;

	mutex_enter(&so_pendfree_lock);
	while (socurkva + len > somaxkva) {
		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
		if (error) {
			len = 0;
			break;
		}
	}
	socurkva += len;
	mutex_exit(&so_pendfree_lock);
	return len;
}

static void
sokvaunreserve(vsize_t len)
{

	mutex_enter(&so_pendfree_lock);
	socurkva -= len;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);
}

/*
 * sokvaalloc: allocate kva for loan.
 */
vaddr_t
sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so)
{
	vaddr_t lva;

	if (sokvareserve(so, len) == 0)
		return 0;

	lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask,
	    UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (lva == 0) {
		sokvaunreserve(len);
		return 0;
	}

	return lva;
}

/*
 * sokvafree: free kva for loan.
 */
void
sokvafree(vaddr_t sva, vsize_t len)
{

	uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);
	sokvafree == NULL;	/* XXX placeholder removed below */
}
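/*
 * Illustrative sketch (the error value here is an assumption, not from
 * this file): sokvaalloc() and sokvafree() must be paired, since both
 * the KVA accounting above and the kernel_map allocation are adjusted
 * by each.  A hypothetical caller mapping one loan chunk looks roughly
 * like:
 *
 *	vaddr_t lva = sokvaalloc(sva, len, so);
 *	if (lva == 0)
 *		return ENOMEM;	reservation or map allocation failed
 *	... enter pages and use the mapping ...
 *	sokvafree(lva, len);	also wakes kva waiters via sokvaunreserve()
 *
 * sosend_loan() below is the real caller of this pair.
 */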
static void
sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
{
	vaddr_t sva, eva;
	vsize_t len;
	int npgs;

	KASSERT(pgs != NULL);

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	sokvafree(sva, len);
}

/*
 * sopendfree_thread: free mbufs on "pendfree" list.  Unlock and relock
 * so_pendfree_lock when freeing mbufs.
 */
static void
sopendfree_thread(void *v)
{
	struct mbuf *m, *next;
	size_t rv;

	mutex_enter(&so_pendfree_lock);

	for (;;) {
		rv = 0;
		while (so_pendfree != NULL) {
			m = so_pendfree;
			so_pendfree = NULL;
			mutex_exit(&so_pendfree_lock);

			for (; m != NULL; m = next) {
				next = m->m_next;
				KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) ==
				    0);
				KASSERT(m->m_ext.ext_refcnt == 0);

				rv += m->m_ext.ext_size;
				sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
				    m->m_ext.ext_size);
				pool_cache_put(mb_cache, m);
			}

			mutex_enter(&so_pendfree_lock);
		}
		if (rv)
			cv_broadcast(&socurkva_cv);
		cv_wait(&pendfree_thread_cv, &so_pendfree_lock);
	}
	panic("sopendfree_thread");
	/* NOTREACHED */
}

void
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
{

	KASSERT(m != NULL);

	/*
	 * postpone freeing mbuf.
	 *
	 * we can't do it in interrupt context
	 * because we need to put kva back to kernel_map.
	 */

	mutex_enter(&so_pendfree_lock);
	m->m_next = so_pendfree;
	so_pendfree = m;
	cv_signal(&pendfree_thread_cv);
	mutex_exit(&so_pendfree_lock);
}
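/*
 * Illustrative note (a sketch; not a new code path): soloanfree() is the
 * M_EXT free callback that sosend_loan() installs with MEXTADD(), so the
 * life of a loaned buffer is roughly:
 *
 *	sokvaalloc() -> uvm_loan() -> pmap_kenter_pa() ->
 *	    MEXTADD(m, buf, size, M_MBUF, soloanfree, so);
 *	... protocol transmits, last reference on the mbuf goes away ...
 *	soloanfree() queues it on so_pendfree; sopendfree_thread() then
 *	calls sodoloanfree() to unmap the pages and return the KVA.
 *
 * The deferral exists because the final free may occur in interrupt
 * context, where kernel_map operations are not permitted.
 */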
static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	vaddr_t lva;
	int npgs, error;
	vaddr_t va;
	int i;

	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
		return 0;

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	KASSERT(npgs <= M_EXT_MAXPAGES);

	lva = sokvaalloc(sva, len, so);
	if (lva == 0)
		return 0;

	error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
	if (error) {
		sokvafree(lva, len);
		return 0;
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
		    VM_PROT_READ, 0);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return space;
}

static int
socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	int result;
	enum kauth_network_req req;

	result = KAUTH_RESULT_DEFER;
	req = (enum kauth_network_req)(uintptr_t)arg0;

	if ((action != KAUTH_NETWORK_SOCKET) &&
	    (action != KAUTH_NETWORK_BIND))
		return result;

	switch (req) {
	case KAUTH_REQ_NETWORK_BIND_PORT:
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_REQ_NETWORK_SOCKET_DROP: {
		/* Normal users can only drop their own connections. */
		struct socket *so = (struct socket *)arg1;

		if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0)
			result = KAUTH_RESULT_ALLOW;

		break;
	}

	case KAUTH_REQ_NETWORK_SOCKET_OPEN:
		/* We allow "raw" routing/bluetooth sockets to anyone. */
		switch ((u_long)arg1) {
		case PF_ROUTE:
		case PF_OROUTE:
		case PF_BLUETOOTH:
		case PF_CAN:
			result = KAUTH_RESULT_ALLOW;
			break;
		default:
			/* Privileged, let secmodel handle this. */
			if ((u_long)arg2 == SOCK_RAW)
				break;
			result = KAUTH_RESULT_ALLOW;
			break;
		}
		break;

	case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
		result = KAUTH_RESULT_ALLOW;

		break;

	default:
		break;
	}

	return result;
}
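/*
 * Hedged sketch (the exact argument layout is an assumption; see the
 * switch above for what arg1/arg2 carry per request): a check answered
 * by this listener is issued through kauth_authorize_network(9), e.g.
 * dropping a connection might be authorized along these lines:
 *
 *	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
 *	    KAUTH_REQ_NETWORK_SOCKET_DROP, so, NULL, NULL);
 *
 * The listener defers to the secmodel unless the credential owns the
 * socket (so_cred), in which case it allows the drop outright.
 */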
void
soinit(void)
{

	sysctl_kern_socket_setup();

#ifdef SCTP
	/* Update the SCTP function hooks if necessary */

	vec_sctp_add_ip_address = sctp_add_ip_address;
	vec_sctp_delete_ip_address = sctp_delete_ip_address;
#endif

	mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
	softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	cv_init(&socurkva_cv, "sokva");
	cv_init(&pendfree_thread_cv, "sopendfr");
	soinit2();

	/* Set the initial adjusted socket buffer size. */
	if (sb_max_set(sb_max))
		panic("bad initial sb_max value: %lu", sb_max);

	socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
	    socket_listener_cb, NULL);
}

void
soinit1(void)
{
	int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
	    sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree");
	if (error)
		panic("soinit1 %d", error);
}

/*
 * socreate: create a new socket of the specified type and the protocol.
 *
 * => Caller may specify another socket for lock sharing (must not be held).
 * => Returns the new socket without lock held.
 */
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
    struct socket *lockso)
{
	const struct protosw *prp;
	struct socket *so;
	uid_t uid;
	int error;
	kmutex_t *lock;

	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
	    KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
	    KAUTH_ARG(proto));
	if (error != 0)
		return error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL) {
		/* no support for domain */
		if (pffinddomain(dom) == 0)
			return EAFNOSUPPORT;
		/* no support for socket type */
		if (proto == 0 && type != 0)
			return EPROTOTYPE;
		return EPROTONOSUPPORT;
	}
	if (prp->pr_usrreqs == NULL)
		return EPROTONOSUPPORT;
	if (prp->pr_type != type)
		return EPROTOTYPE;

	so = soget(true);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
	so->so_options = sooptions;
#ifdef MBUFTRACE
	so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
	uid = kauth_cred_geteuid(l->l_cred);
	so->so_uidinfo = uid_find(uid);
	so->so_egid = kauth_cred_getegid(l->l_cred);
	so->so_cpid = l->l_proc->p_pid;

	/*
	 * Lock assigned and taken during PCB attach, unless we share
	 * the lock with another socket, e.g. socketpair(2) case.
	 */
	if (lockso) {
		/*
		 * lockso->so_lock should be stable at this point, so
		 * no need for atomic_load_*.
		 */
		lock = lockso->so_lock;
		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	}

	/* Attach the PCB (returns with the socket lock held). */
	error = (*prp->pr_usrreqs->pr_attach)(so, proto);
	KASSERT(solocked(so));

	if (error) {
		KASSERT(so->so_pcb == NULL);
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return error;
	}
	so->so_cred = kauth_cred_dup(l->l_cred);
	sounlock(so);

	*aso = so;
	return 0;
}
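/*
 * Illustrative sketch (error handling elided, "sin" is a hypothetical
 * sockaddr_in): an in-kernel consumer creating and binding a UDP socket
 * with the routines above would do roughly:
 *
 *	struct socket *so;
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    curlwp, NULL);
 *	if (error == 0)
 *		error = sobind(so, (struct sockaddr *)&sin, curlwp);
 *
 * Note that socreate() returns with the socket unlocked, while sobind()
 * takes and drops the socket lock itself.
 */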
/*
 * fsocreate: create a socket and a file descriptor associated with it.
 *
 * => On success, write file descriptor to fdout and return zero.
 * => On failure, return non-zero; *fdout will be undefined.
 */
int
fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout)
{
	lwp_t *l = curlwp;
	int error, fd, flags;
	struct socket *so;
	struct file *fp;

	if ((error = fd_allocfile(&fp, &fd)) != 0) {
		return error;
	}
	flags = type & SOCK_FLAGS_MASK;
	fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
	fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)|
	    ((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0);
	fp->f_type = DTYPE_SOCKET;
	fp->f_ops = &socketops;

	type &= ~SOCK_FLAGS_MASK;
	error = socreate(domain, &so, type, proto, l, NULL);
	if (error) {
		fd_abort(curproc, fp, fd);
		return error;
	}
	if (flags & SOCK_NONBLOCK) {
		so->so_state |= SS_NBIO;
	}
	fp->f_socket = so;
	fd_affix(curproc, fp, fd);

	if (sop != NULL) {
		*sop = so;
	}
	*fdout = fd;
	return error;
}

int
sofamily(const struct socket *so)
{
	const struct protosw *pr;
	const struct domain *dom;

	if ((pr = so->so_proto) == NULL)
		return AF_UNSPEC;
	if ((dom = pr->pr_domain) == NULL)
		return AF_UNSPEC;
	return dom->dom_family;
}

int
sobind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
	int error;

	solock(so);
	if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
		sounlock(so);
		return EAFNOSUPPORT;
	}
	error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l);
	sounlock(so);
	return error;
}

int
solisten(struct socket *so, int backlog, struct lwp *l)
{
	int error;
	short oldopt, oldqlimit;

	solock(so);
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) != 0) {
		sounlock(so);
		return EINVAL;
	}
	oldopt = so->so_options;
	oldqlimit = so->so_qlimit;
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = uimin(backlog, somaxconn);

	error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l);
	if (error != 0) {
		so->so_options = oldopt;
		so->so_qlimit = oldqlimit;
		sounlock(so);
		return error;
	}
	sounlock(so);
	return 0;
}
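/*
 * Sketch (the figure 128 assumes the stock SOMAXCONN definition;
 * somaxconn is patchable at run time): the backlog passed to solisten()
 * is clamped to the global somaxconn, so for example
 *
 *	solisten(so, 1000, l);	leaves so->so_qlimit == 128
 *	solisten(so, -5, l);	negative backlogs are raised to 0 first
 */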
void
sofree(struct socket *so)
{
	u_int refs;

	KASSERT(solocked(so));

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		sounlock(so);
		return;
	}
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0)) {
			sounlock(so);
			return;
		}
	}
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
		    RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
		    RLIM_INFINITY);
	sbrelease(&so->so_snd, so);
	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	sorflush(so);
	refs = so->so_aborting;	/* XXX */
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		(void)accept_filt_clear(so);
	sounlock(so);
	if (refs == 0)		/* XXX */
		soput(so);
}

/*
 * soclose: close a socket on last file table reference removal.
 * Initiate disconnect if connected.  Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket *so2;
	int error = 0;

	solock(so);
	if (so->so_options & SO_ACCEPTCONN) {
		for (;;) {
			if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
				KASSERT(solocked2(so, so2));
				(void) soqremque(so2, 0);
				/* soabort drops the lock. */
				(void) soabort(so2);
				solock(so);
				continue;
			}
			if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
				KASSERT(solocked2(so, so2));
				(void) soqremque(so2, 1);
				/* soabort drops the lock. */
				(void) soabort(so2);
				solock(so);
				continue;
			}
			break;
		}
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) ==
			    (SS_ISDISCONNECTING|SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sowait(so, true, so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		KASSERT(solocked(so));
		(*so->so_proto->pr_usrreqs->pr_detach)(so);
	}
 discard:
	KASSERT((so->so_state & SS_NOFDREF) == 0);
	kauth_cred_free(so->so_cred);
	so->so_cred = NULL;
	so->so_state |= SS_NOFDREF;
	sofree(so);
	return error;
}

/*
 * Must be called with the socket locked.  Will return with it unlocked.
 */
int
soabort(struct socket *so)
{
	u_int refs;
	int error;

	KASSERT(solocked(so));
	KASSERT(so->so_head == NULL);

	so->so_aborting++;		/* XXX */
	error = (*so->so_proto->pr_usrreqs->pr_abort)(so);
	refs = --so->so_aborting;	/* XXX */
	if (error || (refs == 0)) {
		sofree(so);
	} else {
		sounlock(so);
	}
	return error;
}
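/*
 * Sketch (values are illustrative): with SO_LINGER set, soclose() above
 * waits up to so_linger seconds, converted to ticks, for the disconnect
 * to complete:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };
 *	so_setsockopt(l, so, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	soclose(so);	blocks in sowait(so, true, 5 * hz) at most
 *
 * A nonblocking socket that is already disconnecting skips the wait and
 * drops straight through to the PCB detach.
 */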
int
soaccept(struct socket *so, struct sockaddr *nam)
{
	int error;

	KASSERT(solocked(so));
	KASSERT((so->so_state & SS_NOFDREF) != 0);

	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam);
	else
		error = ECONNABORTED;

	return error;
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
	int error;

	KASSERT(solocked(so));

	if (so->so_options & SO_ACCEPTCONN)
		return EOPNOTSUPP;
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
			return EAFNOSUPPORT;
		}
		error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
	}

	return error;
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	KASSERT(solocked2(so1, so2));

	return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2);
}

int
sodisconnect(struct socket *so)
{
	int error;

	KASSERT(solocked(so));

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
	} else if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
	} else {
		error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so);
	}
	return error;
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct lwp *l)
{
	struct mbuf **mp, *m;
	long space, len, resid, clen, mlen;
	int error, s, dontroute, atomic;
	short wakeup_state = 0;

	clen = 0;

	/*
	 * solock() provides atomicity of access.  splsoftnet() prevents
	 * protocol processing soft interrupts from interrupting us and
	 * blocking (expensive).
	 */
	s = splsoftnet();
	solock(so);
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	l->l_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		if (so->so_state & SS_CANTSENDMORE) {
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if (resid || clen == 0) {
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				error = EWOULDBLOCK;
				goto release;
			}
			sbunlock(&so->so_snd);
			if (wakeup_state & SS_RESTARTSYS) {
				error = ERESTART;
				goto out;
			}
			error = sbwait(&so->so_snd);
			if (error)
				goto out;
			wakeup_state = so->so_state;
			goto restart;
		}
		wakeup_state = 0;
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				sounlock(so);
				splx(s);
				if (top == NULL) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m_reset_rcvif(m);
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				MCLAIM(m, so->so_snd.sb_mowner);
				if (sock_loan_thresh >= 0 &&
				    uio->uio_iov->iov_len >= sock_loan_thresh &&
				    space >= sock_loan_thresh &&
				    (len = sosend_loan(so, uio, m,
				    space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					m_clget(m, M_DONTWAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						m_align(m, len);
				}
				error = uiomove(mtod(m, void *), (int)len, uio);
 have_data:
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				s = splsoftnet();
				solock(so);
				if (error != 0)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				goto release;
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			if (flags & MSG_OOB) {
				error = (*so->so_proto->pr_usrreqs->pr_sendoob)(
				    so, top, control);
			} else {
				error = (*so->so_proto->pr_usrreqs->pr_send)(so,
				    top, addr, control, l);
			}
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error != 0)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	sounlock(so);
	splx(s);
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return error;
}
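/*
 * Illustrative sketch: the zero-copy path above is taken only when page
 * loaning is enabled and the request is large enough, i.e. roughly when
 *
 *	sock_loan_thresh >= 0 &&
 *	uio->uio_iov->iov_len >= sock_loan_thresh &&
 *	space >= sock_loan_thresh
 *
 * With the default threshold of 4096 (see sock_loan_thresh above), a
 * 64 KB write from userland is loaned in pieces of at most
 * SOCK_LOAN_CHUNK bytes, while an 80-byte write is simply copied and
 * accounted to the "copy small" counter.
 */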
/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
static void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	KASSERT(solocked(sb->sb_so));

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
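/*
 * Sketch of the calling contract: a caller that unlinks the leading mbuf
 * of the first record must re-establish the record linkage with sbsync(),
 * as soreceive() does below:
 *
 *	nextrecord = m->m_nextpkt;	(cache before unlinking)
 *	sbfree(&so->so_rcv, m);
 *	m = so->so_rcv.sb_mb = m_free(m);
 *	sbsync(&so->so_rcv, nextrecord);
 *
 * Without the final call, sb_mb, sb_mbtail and sb_lastrecord could
 * disagree and later sbappend*() operations would corrupt the buffer.
 */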
/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the sockbuf by sbappend*.  In
 * particular, each record (mbufs linked through m_next) must begin with an
 * address if the protocol so specifies, followed by an optional mbuf or mbufs
 * containing ancillary data, and then zero or more mbufs of data.
 *
 * In order to avoid blocking network interrupts for the entire time here, we
 * splx() while doing the actual copy to user space.  Although the sockbuf is
 * locked, new data may still be appended, and thus we must maintain
 * consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an mbuf
 * **mp0 for use in returning the chain.  The uio is then used only for the
 * count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct lwp *l = curlwp;
	struct mbuf *m, **mp, *mt;
	size_t len, offset, moff, orig_resid;
	int atomic, flags, error, s, type;
	const struct protosw *pr;
	struct mbuf *nextrecord;
	int mbuf_removed = 0;
	const struct domain *dom;
	short wakeup_state = 0;

	pr = so->so_proto;
	atomic = pr->pr_flags & PR_ATOMIC;
	dom = pr->pr_domain;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;

	if (paddr != NULL)
		*paddr = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = (*pr->pr_usrreqs->pr_recvoob)(so, m, flags & MSG_PEEK);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, void *),
			    MIN(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid > 0 && error == 0 && m);
 bad:
		if (m != NULL)
			m_freem(m);
		return error;
	}
	if (mp != NULL)
		*mp = NULL;

	/*
	 * solock() provides atomicity of access.  splsoftnet() prevents
	 * protocol processing soft interrupts from interrupting us and
	 * blocking (expensive).
	 */
	s = splsoftnet();
	solock(so);
 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so);
		splx(s);
		return error;
	}
	m = so->so_rcv.sb_mb;

	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL ||
	    ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) &&
	    uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && !atomic)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error || so->so_rerror) {
			u_short *e;
			if (m != NULL)
				goto dontblock;
			e = so->so_error ?
			    &so->so_error : &so->so_rerror;
			error = *e;
			if ((flags & MSG_PEEK) == 0)
				*e = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		if (wakeup_state & SS_RESTARTSYS)
			error = ERESTART;
		else
			error = sbwait(&so->so_rcv);
		if (error != 0) {
			sounlock(so);
			splx(s);
			return error;
		}
		wakeup_state = so->so_state;
		goto restart;
	}

 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket lock, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	if (l != NULL)
		l->l_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME);
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr != NULL) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				m = so->so_rcv.sb_mb = m_free(m);
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}

	if (pr->pr_flags & PR_ADDR_OPT) {
		/*
		 * For SCTP we may be getting a whole message OR a partial
		 * delivery.
		 */
		if (m->m_type == MT_SONAME) {
			orig_resid = 0;
			if (flags & MSG_PEEK) {
				if (paddr)
					*paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				mbuf_removed = 1;
				if (paddr) {
					*paddr = m;
					so->so_rcv.sb_mb = m->m_next;
					m->m_next = 0;
					m = so->so_rcv.sb_mb;
				} else {
					m = so->so_rcv.sb_mb = m_free(m);
				}
				sbsync(&so->so_rcv, nextrecord);
			}
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.
	 * If MSG_PEEK, we just copy the data; if !MSG_PEEK, we call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL).
	 */
	if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copym(m, 0, m->m_len, M_DONTWAIT);
					controlp = (*controlp == NULL ? NULL :
					    &(*controlp)->m_next);
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sbsync(&so->so_rcv, nextrecord);

		for (; cm != NULL; cm = cmn) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			type = mtod(cm, struct cmsghdr *)->cmsg_type;
			if (controlp != NULL) {
				if (dom->dom_externalize != NULL &&
				    type == SCM_RIGHTS) {
					sounlock(so);
					splx(s);
					error = (*dom->dom_externalize)(cm, l,
					    (flags & MSG_CMSG_CLOEXEC) ?
					    O_CLOEXEC : 0);
					s = splsoftnet();
					solock(so);
				}
				*controlp = cm;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (dom->dom_dispose != NULL &&
				    type == SCM_RIGHTS) {
					sounlock(so);
					(*dom->dom_dispose)(cm);
					solock(so);
				}
				m_freem(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (__predict_true(m != NULL)) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed, end the receive
		 * operation and do a short read.
		 */
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else if (m->m_type == MT_CONTROL) {
			break;
		}
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
			panic("%s: m_type=%d", __func__, m->m_type);
		}
#endif

		so->so_state &= ~SS_RCVATMARK;
		wakeup_state = 0;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			sounlock(so);
			splx(s);
			error = uiomove(mtod(m, char *) + moff, len, uio);
			s = splsoftnet();
			solock(so);
			if (error != 0) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed && atomic)
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else {
			uio->uio_resid -= len;
		}

		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
#ifdef SCTP
			if (m->m_flags & M_NOTIFICATION)
				flags |= MSG_NOTIFICATION;
#endif
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					m = so->so_rcv.sb_mb = m_free(m);
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else if (flags & MSG_PEEK) {
			moff += len;
		} else {
			if (mp != NULL) {
				mt = m_copym(m, 0, len, M_NOWAIT);
				if (__predict_false(mt == NULL)) {
					sounlock(so);
					mt = m_copym(m, 0, len, M_WAIT);
					solock(so);
				}
				*mp = mt;
			}
			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
		}

		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		} else {
			so->so_state &= ~SS_POLLRDBAND;
		}
		if (flags & MSG_EOR)
			break;

		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_rerror ||
			    so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			if (wakeup_state & SS_RESTARTSYS)
				error = ERESTART;
			else
				error = sbwait(&so->so_rcv);
			if (error != 0) {
				sbunlock(&so->so_rcv);
				sounlock(so);
				splx(s);
				return 0;
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
			wakeup_state = so->so_state;
		}
	}

	if (m && atomic) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	sounlock(so);
	splx(s);
	return error;
}
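/*
 * Illustrative sketch ("uio" here is a hypothetical, already prepared
 * uio describing the destination): a kernel caller doing the equivalent
 * of a blocking recvmsg(2) that insists on a full buffer would invoke
 * the routine above via the so_receive hook set up in socreate():
 *
 *	int flags = MSG_WAITALL;
 *	error = (*so->so_receive)(so, NULL, &uio, NULL, NULL, &flags);
 *
 * On return, flags may have MSG_EOR, MSG_TRUNC or MSG_OOB added.  As the
 * comments above note, a signal or timeout during a sectioned
 * MSG_WAITALL receive yields a short count rather than an error.
 */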
int
soshutdown(struct socket *so, int how)
{
	const struct protosw *pr;
	int error;

	KASSERT(solocked(so));

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return EINVAL;

	if (how == SHUT_RD || how == SHUT_RDWR) {
		sorflush(so);
		error = 0;
	}
	if (how == SHUT_WR || how == SHUT_RDWR)
		error = (*pr->pr_usrreqs->pr_shutdown)(so);

	return error;
}

void
sorestart(struct socket *so)
{
	/*
	 * An application has called close() on an fd on which another
	 * of its threads has called a socket system call.
	 * Mark this and wake everyone up, and code that would block again
	 * instead returns ERESTART.
	 * On system call re-entry the fd is validated and EBADF returned.
	 * Any other fd will block again on the 2nd syscall.
	 */
	solock(so);
	so->so_state |= SS_RESTARTSYS;
	cv_broadcast(&so->so_cv);
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	sounlock(so);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb, asb;
	const struct protosw *pr;

	KASSERT(solocked(so));

	sb = &so->so_rcv;
	pr = so->so_proto;
	socantrcvmore(so);
	sb->sb_flags |= SB_NOINTR;
	(void)sblock(sb, M_WAITOK);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
		sounlock(so);
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
		solock(so);
	}
	sbrelease(&asb, so);
}
/*
 * internal set SOL_SOCKET options
 */
static int
sosetopt1(struct socket *so, const struct sockopt *sopt)
{
	int error, opt;
	int optval = 0; /* XXX: gcc */
	struct linger l;
	struct timeval tv;

	opt = sopt->sopt_name;

	switch (opt) {

	case SO_ACCEPTFILTER:
		error = accept_filt_setopt(so, sopt);
		KASSERT(solocked(so));
		break;

	case SO_LINGER:
		error = sockopt_get(sopt, &l, sizeof(l));
		solock(so);
		if (error)
			break;
		if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
		    l.l_linger > (INT_MAX / hz)) {
			error = EDOM;
			break;
		}
		so->so_linger = l.l_linger;
		if (l.l_onoff)
			so->so_options |= SO_LINGER;
		else
			so->so_options &= ~SO_LINGER;
		break;

	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_USELOOPBACK:
	case SO_BROADCAST:
	case SO_REUSEADDR:
	case SO_REUSEPORT:
	case SO_OOBINLINE:
	case SO_TIMESTAMP:
	case SO_NOSIGPIPE:
	case SO_RERROR:
		error = sockopt_getint(sopt, &optval);
		solock(so);
		if (error)
			break;
		if (optval)
			so->so_options |= opt;
		else
			so->so_options &= ~opt;
		break;

	case SO_SNDBUF:
	case SO_RCVBUF:
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
		error = sockopt_getint(sopt, &optval);
		solock(so);
		if (error)
			break;

		/*
		 * Values < 1 make no sense for any of these
		 * options, so disallow them.
		 */
		if (optval < 1) {
			error = EINVAL;
			break;
		}

		switch (opt) {
		case SO_SNDBUF:
			if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
				error = ENOBUFS;
				break;
			}
			if (sofixedbuf)
				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
			break;

		case SO_RCVBUF:
			if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
				error = ENOBUFS;
				break;
			}
			if (sofixedbuf)
				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
			break;

		/*
		 * Make sure the low-water is never greater than
		 * the high-water.
		 */
		case SO_SNDLOWAT:
			if (optval > so->so_snd.sb_hiwat)
				optval = so->so_snd.sb_hiwat;

			so->so_snd.sb_lowat = optval;
			break;

		case SO_RCVLOWAT:
			if (optval > so->so_rcv.sb_hiwat)
				optval = so->so_rcv.sb_hiwat;

			so->so_rcv.sb_lowat = optval;
			break;
		}
		break;

	case SO_SNDTIMEO:
	case SO_RCVTIMEO:
		solock(so);
		error = sockopt_get(sopt, &tv, sizeof(tv));
		if (error)
			break;

		if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
			error = EDOM;
			break;
		}
		if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
			error = EDOM;
			break;
		}

		optval = tv.tv_sec * hz + tv.tv_usec / tick;
		if (optval == 0 && tv.tv_usec != 0)
			optval = 1;

		switch (opt) {
		case SO_SNDTIMEO:
			so->so_snd.sb_timeo = optval;
			break;
		case SO_RCVTIMEO:
			so->so_rcv.sb_timeo = optval;
			break;
		}
		break;

	default:
		MODULE_HOOK_CALL(uipc_socket_50_setopt1_hook,
		    (opt, so, sopt), enosys(), error);
		if (error == ENOSYS || error == EPASSTHROUGH) {
			solock(so);
			error = ENOPROTOOPT;
		}
		break;
	}
	KASSERT(solocked(so));
	return error;
}

int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, prerr;

	if (sopt->sopt_level == SOL_SOCKET) {
		error = sosetopt1(so, sopt);
		KASSERT(solocked(so));
	} else {
		error = ENOPROTOOPT;
		solock(so);
	}

	if ((error == 0 || error == ENOPROTOOPT) &&
	    so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
		/* give the protocol stack a shot */
		prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
		if (prerr == 0)
			error = 0;
		else if (prerr != ENOPROTOOPT)
			error = prerr;
	}
	sounlock(so);
	return error;
}

/*
 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
 */
int
so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
    const void *val, size_t valsize)
{
	struct sockopt sopt;
	int error;

	KASSERT(valsize == 0 || val != NULL);

	sockopt_init(&sopt, level, name, valsize);
	sockopt_set(&sopt, val, valsize);

	error = sosetopt(so, &sopt);

	sockopt_destroy(&sopt);

	return error;
}
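/*
 * Sketch (values are illustrative): so_setsockopt() packages a plain
 * buffer into a struct sockopt so in-kernel code need not build one by
 * hand, e.g. to set a 5 second receive timeout:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	error = so_setsockopt(curlwp, so, SOL_SOCKET, SO_RCVTIMEO,
 *	    &tv, sizeof(tv));
 *
 * sosetopt1() above then converts the timeval to ticks and stores the
 * result in so->so_rcv.sb_timeo.
 */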
/*
 * internal get SOL_SOCKET options
 */
static int
sogetopt1(struct socket *so, struct sockopt *sopt)
{
	int error, optval, opt;
	struct linger l;
	struct timeval tv;

	switch ((opt = sopt->sopt_name)) {

	case SO_ACCEPTFILTER:
		error = accept_filt_getopt(so, sopt);
		break;

	case SO_LINGER:
		l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
		l.l_linger = so->so_linger;

		error = sockopt_set(sopt, &l, sizeof(l));
		break;

	case SO_USELOOPBACK:
	case SO_DONTROUTE:
	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_REUSEADDR:
	case SO_REUSEPORT:
	case SO_BROADCAST:
	case SO_OOBINLINE:
	case SO_TIMESTAMP:
	case SO_NOSIGPIPE:
	case SO_RERROR:
	case SO_ACCEPTCONN:
		error = sockopt_setint(sopt, (so->so_options & opt) ?
		    1 : 0);
		break;

	case SO_TYPE:
		error = sockopt_setint(sopt, so->so_type);
		break;

	case SO_ERROR:
		if (so->so_error == 0) {
			so->so_error = so->so_rerror;
			so->so_rerror = 0;
		}
		error = sockopt_setint(sopt, so->so_error);
		so->so_error = 0;
		break;

	case SO_SNDBUF:
		error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
		break;

	case SO_RCVBUF:
		error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
		break;

	case SO_SNDLOWAT:
		error = sockopt_setint(sopt, so->so_snd.sb_lowat);
		break;

	case SO_RCVLOWAT:
		error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
		break;

	case SO_SNDTIMEO:
	case SO_RCVTIMEO:
		optval = (opt == SO_SNDTIMEO ?
		    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

		memset(&tv, 0, sizeof(tv));
		tv.tv_sec = optval / hz;
		tv.tv_usec = (optval % hz) * tick;

		error = sockopt_set(sopt, &tv, sizeof(tv));
		break;

	case SO_OVERFLOWED:
		error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
		break;

	default:
		MODULE_HOOK_CALL(uipc_socket_50_getopt1_hook,
		    (opt, so, sopt), enosys(), error);
		if (error)
			error = ENOPROTOOPT;
		break;
	}

	return error;
}

int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int error;

	solock(so);
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, sopt));
		} else
			error = (ENOPROTOOPT);
	} else {
		error = sogetopt1(so, sopt);
	}
	sounlock(so);
	return error;
}

/*
 * alloc sockopt data buffer
 *	- will be released at destroy
 */
static int
sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
{
	void *data;

	KASSERT(sopt->sopt_size == 0);

	if (len > sizeof(sopt->sopt_buf)) {
		data = kmem_zalloc(len, kmflag);
		if (data == NULL)
			return ENOMEM;
		sopt->sopt_data = data;
	} else
		sopt->sopt_data = sopt->sopt_buf;

	sopt->sopt_size = len;
	return 0;
}

/*
 * initialise sockopt storage
 *	- MAY sleep during allocation
 */
void
sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
{

	memset(sopt, 0, sizeof(*sopt));

	sopt->sopt_level = level;
	sopt->sopt_name = name;
	(void)sockopt_alloc(sopt, size, KM_SLEEP);
}

/*
 * destroy sockopt storage
 *	- will release any held memory references
 */
void
sockopt_destroy(struct sockopt *sopt)
{

	if (sopt->sopt_data != sopt->sopt_buf)
		kmem_free(sopt->sopt_data, sopt->sopt_size);

	memset(sopt, 0, sizeof(*sopt));
}

/*
 * set sockopt value
 *	- value is copied into sockopt
 *	- memory is allocated when necessary, will not sleep
 */
int
sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;

	if (sopt->sopt_size == 0) {
		error = sockopt_alloc(sopt, len, KM_NOSLEEP);
		if (error)
			return error;
	}

	sopt->sopt_retsize = MIN(sopt->sopt_size, len);
	if (sopt->sopt_retsize > 0) {
		memcpy(sopt->sopt_data, buf, sopt->sopt_retsize);
	}

	return 0;
}

/*
 * common case of set sockopt integer value
 */
int
sockopt_setint(struct sockopt *sopt, int val)
{

	return sockopt_set(sopt, &val, sizeof(int));
}

/*
 * get sockopt value
 *	- correct size must be given
 */
int
sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
{

	if (sopt->sopt_size != len)
		return EINVAL;

	memcpy(buf, sopt->sopt_data, len);
	return 0;
}

/*
 * common case of get sockopt integer value
 */
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{

	return sockopt_get(sopt, valp, sizeof(int));
}

/*
 * set sockopt value from mbuf
 *	- ONLY for legacy code
 *	- mbuf is released by sockopt
 *	- will not sleep
 */
int
sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t len;
	int error;

	len = m_length(m);

	if (sopt->sopt_size == 0) {
		error = sockopt_alloc(sopt, len, KM_NOSLEEP);
		if (error)
			return error;
	}

	sopt->sopt_retsize = MIN(sopt->sopt_size, len);
	m_copydata(m, 0, sopt->sopt_retsize, sopt->sopt_data);
	m_freem(m);

	return 0;
}

/*
 * get sockopt value into mbuf
 *	- ONLY for legacy code
 *	- mbuf to be released by the caller
 *	- will not sleep
 */
struct mbuf *
sockopt_getmbuf(const struct sockopt *sopt)
{
	struct mbuf *m;

	if (sopt->sopt_size > MCLBYTES)
		return NULL;

	m = m_get(M_DONTWAIT, MT_SOOPTS);
	if (m == NULL)
		return NULL;

	if (sopt->sopt_size > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}

	memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
	m->m_len = sopt->sopt_size;

	return m;
}

void
sohasoutofband(struct socket *so)
{

	so->so_state |= SS_POLLRDBAND;
	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so;

	so = ((file_t *)kn->kn_obj)->f_socket;
	solock(so);
	if (selremove_knote(&so->so_rcv.sb_sel, kn))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	sounlock(so);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_socket;
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		knote_set_eof(kn, 0);
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error || so->so_rerror)
		rv = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		rv = (kn->kn_data >= kn->kn_sdata);
	else
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}
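/*
 * Userland-side sketch ("fd" is a hypothetical socket descriptor): the
 * NOTE_LOWAT handling above lets a kevent(2) consumer override the
 * receive low-water mark per-knote:
 *
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, 0);
 *
 * The filter then reports the descriptor readable only once at least
 * 512 bytes (kn_sdata) are queued, instead of using sb_lowat.
 */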
void
sohasoutofband(struct socket *so)
{

	so->so_state |= SS_POLLRDBAND;
	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so;

	so = ((file_t *)kn->kn_obj)->f_socket;
	solock(so);
	if (selremove_knote(&so->so_rcv.sb_sel, kn))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	sounlock(so);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_socket;
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		knote_set_eof(kn, 0);
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error || so->so_rerror)
		rv = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		rv = (kn->kn_data >= kn->kn_sdata);
	else
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so;

	so = ((file_t *)kn->kn_obj)->f_socket;
	solock(so);
	if (selremove_knote(&so->so_snd.sb_sel, kn))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	sounlock(so);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_socket;
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		knote_set_eof(kn, 0);
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error)
		rv = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		rv = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		rv = (kn->kn_data >= kn->kn_sdata);
	else
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

static int
filt_soempty(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_socket;
	if (hint != NOTE_SUBMIT)
		solock(so);
	rv = (kn->kn_data = sbused(&so->so_snd)) == 0 ||
	    (so->so_options & SO_ACCEPTCONN) != 0;
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_socket;

	/*
	 * Set kn_data to the number of incoming connections, not
	 * counting partial (incomplete) connections.
	 */
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = so->so_qlen;
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

static const struct filterops solisten_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_solisten,
};

static const struct filterops soread_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};

static const struct filterops sowrite_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

static const struct filterops soempty_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_sowdetach,
	.f_event = filt_soempty,
};

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so;
	struct sockbuf *sb;

	so = ((file_t *)kn->kn_obj)->f_socket;
	solock(so);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	case EVFILT_EMPTY:
		kn->kn_fop = &soempty_filtops;
		sb = &so->so_snd;
		break;
	default:
		sounlock(so);
		return EINVAL;
	}
	selrecord_knote(&sb->sb_sel, kn);
	sb->sb_flags |= SB_KNOTE;
	sounlock(so);
	return 0;
}
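/*
 * Illustrative sketch (an assumption, not original code): the locking
 * contract behind the NOTE_SUBMIT hint used by the f_event routines
 * above.  A notifier that already holds the socket lock passes
 * NOTE_SUBMIT so the filter skips its own solock()/sounlock() pair;
 * an unlocked caller passes 0 and the filter takes the lock itself.
 * The function name is hypothetical.
 */
static void __unused
soknote_submit_example(struct socket *so, struct sockbuf *sb, int events)
{

	KASSERT(solocked(so));
	selnotify(&sb->sb_sel, events, NOTE_SUBMIT);
}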
static int
sodopoll(struct socket *so, int events)
{
	int revents;

	revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowritable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_state & SS_POLLRDBAND)
			revents |= events & (POLLPRI | POLLRDBAND);

	return revents;
}

int
sopoll(struct socket *so, int events)
{
	int revents = 0;

#ifndef DIAGNOSTIC
	/*
	 * Do a quick, unlocked check in expectation that the socket
	 * will be ready for I/O.  Don't do this check if DIAGNOSTIC,
	 * as the solocked() assertions will fail.
	 */
	if ((revents = sodopoll(so, events)) != 0)
		return revents;
#endif

	solock(so);
	if ((revents = sodopoll(so, events)) == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			selrecord(curlwp, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_NOTIFY;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(curlwp, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_NOTIFY;
		}
	}
	sounlock(so);

	return revents;
}

struct mbuf **
sbsavetimestamp(int opt, struct mbuf **mp)
{
	struct timeval tv;
	int error;

	memset(&tv, 0, sizeof(tv));
	microtime(&tv);

	MODULE_HOOK_CALL(uipc_socket_50_sbts_hook, (opt, &mp), enosys(), error);
	if (error == 0)
		return mp;

	if (opt & SO_TIMESTAMP) {
		*mp = sbcreatecontrol(&tv, sizeof(tv),
		    SCM_TIMESTAMP, SOL_SOCKET);
		if (*mp)
			mp = &(*mp)->m_next;
	}
	return mp;
}


#include <sys/sysctl.h>

static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
static int sysctl_kern_sbmax(SYSCTLFN_PROTO);

/*
 * sysctl helper routine for kern.somaxkva.  Ensures that the given
 * value is not too small.
 * (XXX should we maybe make sure it's not too large as well?)
 */
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
	int error, new_somaxkva;
	struct sysctlnode node;

	new_somaxkva = somaxkva;
	node = *rnode;
	node.sysctl_data = &new_somaxkva;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
		return EINVAL;

	mutex_enter(&so_pendfree_lock);
	somaxkva = new_somaxkva;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);

	return error;
}

/*
 * sysctl helper routine for kern.sbmax.  Ensures that any new value
 * is not too small.
 */
static int
sysctl_kern_sbmax(SYSCTLFN_ARGS)
{
	int error, new_sbmax;
	struct sysctlnode node;

	new_sbmax = sb_max;
	node = *rnode;
	node.sysctl_data = &new_sbmax;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	KERNEL_LOCK(1, NULL);
	error = sb_max_set(new_sbmax);
	KERNEL_UNLOCK_ONE(NULL);

	return error;
}
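/*
 * Illustrative sketch (hypothetical node and variable, not original
 * code): the pattern shared by the sysctl helpers above.
 * sysctl_lookup() operates on a local copy of the value, so a write
 * can be validated in full before the global is updated; a rejected
 * update never leaves a half-written value behind.
 */
static int example_tunable __unused = 1;	/* hypothetical tunable */

static int __unused
sysctl_kern_example(SYSCTLFN_ARGS)
{
	int error, new_val;
	struct sysctlnode node;

	new_val = example_tunable;
	node = *rnode;
	node.sysctl_data = &new_val;	/* redirect lookup to the copy */
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)	/* error, or read-only access */
		return error;

	if (new_val < 0)		/* hypothetical sanity check */
		return EINVAL;

	example_tunable = new_val;	/* commit only after validation */
	return 0;
}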
/*
 * sysctl helper routine for kern.sooptions.  Ensures that only allowed
 * options can be set.
 */
static int
sysctl_kern_sooptions(SYSCTLFN_ARGS)
{
	int error, new_options;
	struct sysctlnode node;

	new_options = sooptions;
	node = *rnode;
	node.sysctl_data = &new_options;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	if (new_options & ~SO_DEFOPTS)
		return EINVAL;

	sooptions = new_options;

	return 0;
}

static void
sysctl_kern_socket_setup(void)
{

	KASSERT(socket_sysctllog == NULL);

	sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "somaxkva",
	    SYSCTL_DESCR("Maximum amount of kernel memory to be "
	    "used for socket buffers"),
	    sysctl_kern_somaxkva, 0, NULL, 0,
	    CTL_KERN, KERN_SOMAXKVA, CTL_EOL);

	sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_BOOL, "sofixedbuf",
	    SYSCTL_DESCR("Prevent scaling of fixed socket buffers"),
	    NULL, 0, &sofixedbuf, 0,
	    CTL_KERN, KERN_SOFIXEDBUF, CTL_EOL);

	sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "sbmax",
	    SYSCTL_DESCR("Maximum socket buffer size"),
	    sysctl_kern_sbmax, 0, NULL, 0,
	    CTL_KERN, KERN_SBMAX, CTL_EOL);

	sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "sooptions",
	    SYSCTL_DESCR("Default socket options"),
	    sysctl_kern_sooptions, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
}
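/*
 * Illustrative userland sketch (shown only as a comment; not kernel
 * code): adjusting one of the nodes registered above from a program
 * using sysctlbyname(3), e.g.
 *
 *	int newmax = 32 * 1024 * 1024;
 *	if (sysctlbyname("kern.somaxkva", NULL, NULL,
 *	    &newmax, sizeof(newmax)) == -1)
 *		err(EXIT_FAILURE, "kern.somaxkva");
 *
 * Writes below 16MB are rejected with EINVAL by sysctl_kern_somaxkva()
 * above.
 */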