1 /* $NetBSD: uipc_socket.c,v 1.193 2009/10/03 03:59:39 elad Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2004 The FreeBSD Foundation 34 * Copyright (c) 2004 Robert Watson 35 * Copyright (c) 1982, 1986, 1988, 1990, 1993 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 * 62 * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 63 */ 64 65 #include <sys/cdefs.h> 66 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.193 2009/10/03 03:59:39 elad Exp $"); 67 68 #include "opt_compat_netbsd.h" 69 #include "opt_sock_counters.h" 70 #include "opt_sosend_loan.h" 71 #include "opt_mbuftrace.h" 72 #include "opt_somaxkva.h" 73 #include "opt_multiprocessor.h" /* XXX */ 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/proc.h> 78 #include <sys/file.h> 79 #include <sys/filedesc.h> 80 #include <sys/kmem.h> 81 #include <sys/mbuf.h> 82 #include <sys/domain.h> 83 #include <sys/kernel.h> 84 #include <sys/protosw.h> 85 #include <sys/socket.h> 86 #include <sys/socketvar.h> 87 #include <sys/signalvar.h> 88 #include <sys/resourcevar.h> 89 #include <sys/uidinfo.h> 90 #include <sys/event.h> 91 #include <sys/poll.h> 92 #include <sys/kauth.h> 93 #include <sys/mutex.h> 94 #include <sys/condvar.h> 95 96 #ifdef COMPAT_50 97 #include <compat/sys/time.h> 98 #include <compat/sys/socket.h> 99 #endif 100 101 #include <uvm/uvm.h> 102 103 MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options"); 104 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 105 106 extern const struct fileops socketops; 107 108 extern int somaxconn; /* patchable (XXX sysctl) */ 109 int somaxconn = SOMAXCONN; 110 kmutex_t *softnet_lock; 111 112 #ifdef SOSEND_COUNTERS 113 #include <sys/device.h> 114 115 static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 116 NULL, "sosend", "loan big"); 117 static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 118 NULL, "sosend", "copy big"); 119 static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 120 NULL, "sosend", "copy small"); 121 static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, 122 NULL, "sosend", "kva limit"); 123 124 #define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++ 125 126 EVCNT_ATTACH_STATIC(sosend_loan_big); 127 EVCNT_ATTACH_STATIC(sosend_copy_big); 128 EVCNT_ATTACH_STATIC(sosend_copy_small); 129 EVCNT_ATTACH_STATIC(sosend_kvalimit); 130 #else 131 132 #define SOSEND_COUNTER_INCR(ev) /* nothing */ 133 134 #endif /* SOSEND_COUNTERS */ 135 136 static struct callback_entry sokva_reclaimerentry; 137 138 #if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR) 139 int sock_loan_thresh = -1; 140 #else 141 int sock_loan_thresh = 4096; 142 #endif 143 144 static kmutex_t so_pendfree_lock; 145 static struct mbuf *so_pendfree; 146 147 #ifndef SOMAXKVA 148 #define SOMAXKVA (16 * 1024 * 1024) 149 #endif 150 int somaxkva = SOMAXKVA; 151 static int socurkva; 152 static kcondvar_t socurkva_cv; 153 154 static kauth_listener_t socket_listener; 155 156 #define SOCK_LOAN_CHUNK 65536 157 158 static size_t sodopendfree(void); 159 static size_t sodopendfreel(void); 160 161 static void sysctl_kern_somaxkva_setup(void); 162 static struct sysctllog *socket_sysctllog; 163 164 static vsize_t 165 sokvareserve(struct socket *so, vsize_t len) 166 { 167 int error; 168 169 mutex_enter(&so_pendfree_lock); 170 while (socurkva + len > somaxkva) { 171 size_t freed; 172 173 /* 174 * try to do pendfree. 175 */ 176 177 freed = sodopendfreel(); 178 179 /* 180 * if some kva was freed, try again. 181 */ 182 183 if (freed) 184 continue; 185 186 SOSEND_COUNTER_INCR(&sosend_kvalimit); 187 error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock); 188 if (error) { 189 len = 0; 190 break; 191 } 192 } 193 socurkva += len; 194 mutex_exit(&so_pendfree_lock); 195 return len; 196 } 197 198 static void 199 sokvaunreserve(vsize_t len) 200 { 201 202 mutex_enter(&so_pendfree_lock); 203 socurkva -= len; 204 cv_broadcast(&socurkva_cv); 205 mutex_exit(&so_pendfree_lock); 206 } 207 208 /* 209 * sokvaalloc: allocate kva for loan. 210 */ 211 212 vaddr_t 213 sokvaalloc(vsize_t len, struct socket *so) 214 { 215 vaddr_t lva; 216 217 /* 218 * reserve kva. 219 */ 220 221 if (sokvareserve(so, len) == 0) 222 return 0; 223 224 /* 225 * allocate kva. 226 */ 227 228 lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA); 229 if (lva == 0) { 230 sokvaunreserve(len); 231 return (0); 232 } 233 234 return lva; 235 } 236 237 /* 238 * sokvafree: free kva for loan. 239 */ 240 241 void 242 sokvafree(vaddr_t sva, vsize_t len) 243 { 244 245 /* 246 * free kva. 247 */ 248 249 uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY); 250 251 /* 252 * unreserve kva. 253 */ 254 255 sokvaunreserve(len); 256 } 257 258 static void 259 sodoloanfree(struct vm_page **pgs, void *buf, size_t size) 260 { 261 vaddr_t sva, eva; 262 vsize_t len; 263 int npgs; 264 265 KASSERT(pgs != NULL); 266 267 eva = round_page((vaddr_t) buf + size); 268 sva = trunc_page((vaddr_t) buf); 269 len = eva - sva; 270 npgs = len >> PAGE_SHIFT; 271 272 pmap_kremove(sva, len); 273 pmap_update(pmap_kernel()); 274 uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE); 275 sokvafree(sva, len); 276 } 277 278 static size_t 279 sodopendfree(void) 280 { 281 size_t rv; 282 283 if (__predict_true(so_pendfree == NULL)) 284 return 0; 285 286 mutex_enter(&so_pendfree_lock); 287 rv = sodopendfreel(); 288 mutex_exit(&so_pendfree_lock); 289 290 return rv; 291 } 292 293 /* 294 * sodopendfreel: free mbufs on "pendfree" list. 295 * unlock and relock so_pendfree_lock when freeing mbufs. 296 * 297 * => called with so_pendfree_lock held. 298 */ 299 300 static size_t 301 sodopendfreel(void) 302 { 303 struct mbuf *m, *next; 304 size_t rv = 0; 305 306 KASSERT(mutex_owned(&so_pendfree_lock)); 307 308 while (so_pendfree != NULL) { 309 m = so_pendfree; 310 so_pendfree = NULL; 311 mutex_exit(&so_pendfree_lock); 312 313 for (; m != NULL; m = next) { 314 next = m->m_next; 315 KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0); 316 KASSERT(m->m_ext.ext_refcnt == 0); 317 318 rv += m->m_ext.ext_size; 319 sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf, 320 m->m_ext.ext_size); 321 pool_cache_put(mb_cache, m); 322 } 323 324 mutex_enter(&so_pendfree_lock); 325 } 326 327 return (rv); 328 } 329 330 void 331 soloanfree(struct mbuf *m, void *buf, size_t size, void *arg) 332 { 333 334 KASSERT(m != NULL); 335 336 /* 337 * postpone freeing mbuf. 338 * 339 * we can't do it in interrupt context 340 * because we need to put kva back to kernel_map. 341 */ 342 343 mutex_enter(&so_pendfree_lock); 344 m->m_next = so_pendfree; 345 so_pendfree = m; 346 cv_broadcast(&socurkva_cv); 347 mutex_exit(&so_pendfree_lock); 348 } 349 350 static long 351 sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space) 352 { 353 struct iovec *iov = uio->uio_iov; 354 vaddr_t sva, eva; 355 vsize_t len; 356 vaddr_t lva; 357 int npgs, error; 358 vaddr_t va; 359 int i; 360 361 if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) 362 return (0); 363 364 if (iov->iov_len < (size_t) space) 365 space = iov->iov_len; 366 if (space > SOCK_LOAN_CHUNK) 367 space = SOCK_LOAN_CHUNK; 368 369 eva = round_page((vaddr_t) iov->iov_base + space); 370 sva = trunc_page((vaddr_t) iov->iov_base); 371 len = eva - sva; 372 npgs = len >> PAGE_SHIFT; 373 374 KASSERT(npgs <= M_EXT_MAXPAGES); 375 376 lva = sokvaalloc(len, so); 377 if (lva == 0) 378 return 0; 379 380 error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len, 381 m->m_ext.ext_pgs, UVM_LOAN_TOPAGE); 382 if (error) { 383 sokvafree(lva, len); 384 return (0); 385 } 386 387 for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE) 388 pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]), 389 VM_PROT_READ); 390 pmap_update(pmap_kernel()); 391 392 lva += (vaddr_t) iov->iov_base & PAGE_MASK; 393 394 MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so); 395 m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP; 396 397 uio->uio_resid -= space; 398 /* uio_offset not updated, not set/used for write(2) */ 399 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space; 400 uio->uio_iov->iov_len -= space; 401 if (uio->uio_iov->iov_len == 0) { 402 uio->uio_iov++; 403 uio->uio_iovcnt--; 404 } 405 406 return (space); 407 } 408 409 static int 410 sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg) 411 { 412 413 KASSERT(ce == &sokva_reclaimerentry); 414 KASSERT(obj == NULL); 415 416 sodopendfree(); 417 if (!vm_map_starved_p(kernel_map)) { 418 return CALLBACK_CHAIN_ABORT; 419 } 420 return CALLBACK_CHAIN_CONTINUE; 421 } 422 423 struct mbuf * 424 getsombuf(struct socket *so, int type) 425 { 426 struct mbuf *m; 427 428 m = m_get(M_WAIT, type); 429 MCLAIM(m, so->so_mowner); 430 return m; 431 } 432 433 static int 434 socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie, 435 void *arg0, void *arg1, void *arg2, void *arg3) 436 { 437 int result; 438 enum kauth_network_req req; 439 440 result = KAUTH_RESULT_DEFER; 441 req = (enum kauth_network_req)arg0; 442 443 if ((action != KAUTH_NETWORK_SOCKET) && 444 (action != KAUTH_NETWORK_BIND)) 445 return result; 446 447 switch (req) { 448 case KAUTH_REQ_NETWORK_BIND_PORT: 449 result = KAUTH_RESULT_ALLOW; 450 break; 451 452 case KAUTH_REQ_NETWORK_SOCKET_DROP: { 453 /* Normal users can only drop their own connections. */ 454 struct socket *so = (struct socket *)arg1; 455 uid_t sockuid = so->so_uidinfo->ui_uid; 456 457 if (sockuid == kauth_cred_getuid(cred) || 458 sockuid == kauth_cred_geteuid(cred)) 459 result = KAUTH_RESULT_ALLOW; 460 461 break; 462 } 463 464 case KAUTH_REQ_NETWORK_SOCKET_OPEN: 465 /* We allow "raw" routing/bluetooth sockets to anyone. */ 466 if ((u_long)arg1 == PF_ROUTE || (u_long)arg1 == PF_BLUETOOTH) 467 result = KAUTH_RESULT_ALLOW; 468 else { 469 /* Privileged, let secmodel handle this. */ 470 if ((u_long)arg2 == SOCK_RAW) 471 break; 472 } 473 474 result = KAUTH_RESULT_ALLOW; 475 476 break; 477 478 case KAUTH_REQ_NETWORK_SOCKET_CANSEE: 479 result = KAUTH_RESULT_ALLOW; 480 481 break; 482 483 default: 484 break; 485 } 486 487 return result; 488 } 489 490 void 491 soinit(void) 492 { 493 494 sysctl_kern_somaxkva_setup(); 495 496 mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM); 497 softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE); 498 cv_init(&socurkva_cv, "sokva"); 499 soinit2(); 500 501 /* Set the initial adjusted socket buffer size. */ 502 if (sb_max_set(sb_max)) 503 panic("bad initial sb_max value: %lu", sb_max); 504 505 callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, 506 &sokva_reclaimerentry, NULL, sokva_reclaim_callback); 507 508 socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK, 509 socket_listener_cb, NULL); 510 } 511 512 /* 513 * Socket operation routines. 514 * These routines are called by the routines in 515 * sys_socket.c or from a system process, and 516 * implement the semantics of socket operations by 517 * switching out to the protocol specific routines. 518 */ 519 /*ARGSUSED*/ 520 int 521 socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l, 522 struct socket *lockso) 523 { 524 const struct protosw *prp; 525 struct socket *so; 526 uid_t uid; 527 int error; 528 kmutex_t *lock; 529 530 error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, 531 KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type), 532 KAUTH_ARG(proto)); 533 if (error != 0) 534 return error; 535 536 if (proto) 537 prp = pffindproto(dom, proto, type); 538 else 539 prp = pffindtype(dom, type); 540 if (prp == NULL) { 541 /* no support for domain */ 542 if (pffinddomain(dom) == 0) 543 return EAFNOSUPPORT; 544 /* no support for socket type */ 545 if (proto == 0 && type != 0) 546 return EPROTOTYPE; 547 return EPROTONOSUPPORT; 548 } 549 if (prp->pr_usrreq == NULL) 550 return EPROTONOSUPPORT; 551 if (prp->pr_type != type) 552 return EPROTOTYPE; 553 554 so = soget(true); 555 so->so_type = type; 556 so->so_proto = prp; 557 so->so_send = sosend; 558 so->so_receive = soreceive; 559 #ifdef MBUFTRACE 560 so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner; 561 so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner; 562 so->so_mowner = &prp->pr_domain->dom_mowner; 563 #endif 564 /* so->so_cred = kauth_cred_dup(l->l_cred); */ 565 uid = kauth_cred_geteuid(l->l_cred); 566 so->so_uidinfo = uid_find(uid); 567 so->so_egid = kauth_cred_getegid(l->l_cred); 568 so->so_cpid = l->l_proc->p_pid; 569 if (lockso != NULL) { 570 /* Caller wants us to share a lock. */ 571 lock = lockso->so_lock; 572 so->so_lock = lock; 573 mutex_obj_hold(lock); 574 mutex_enter(lock); 575 } else { 576 /* Lock assigned and taken during PRU_ATTACH. */ 577 } 578 error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL, 579 (struct mbuf *)(long)proto, NULL, l); 580 KASSERT(solocked(so)); 581 if (error != 0) { 582 so->so_state |= SS_NOFDREF; 583 sofree(so); 584 return error; 585 } 586 sounlock(so); 587 *aso = so; 588 return 0; 589 } 590 591 /* On success, write file descriptor to fdout and return zero. On 592 * failure, return non-zero; *fdout will be undefined. 593 */ 594 int 595 fsocreate(int domain, struct socket **sop, int type, int protocol, 596 struct lwp *l, int *fdout) 597 { 598 struct socket *so; 599 struct file *fp; 600 int fd, error; 601 602 if ((error = fd_allocfile(&fp, &fd)) != 0) 603 return (error); 604 fp->f_flag = FREAD|FWRITE; 605 fp->f_type = DTYPE_SOCKET; 606 fp->f_ops = &socketops; 607 error = socreate(domain, &so, type, protocol, l, NULL); 608 if (error != 0) { 609 fd_abort(curproc, fp, fd); 610 } else { 611 if (sop != NULL) 612 *sop = so; 613 fp->f_data = so; 614 fd_affix(curproc, fp, fd); 615 *fdout = fd; 616 } 617 return error; 618 } 619 620 int 621 sofamily(const struct socket *so) 622 { 623 const struct protosw *pr; 624 const struct domain *dom; 625 626 if ((pr = so->so_proto) == NULL) 627 return AF_UNSPEC; 628 if ((dom = pr->pr_domain) == NULL) 629 return AF_UNSPEC; 630 return dom->dom_family; 631 } 632 633 int 634 sobind(struct socket *so, struct mbuf *nam, struct lwp *l) 635 { 636 int error; 637 638 solock(so); 639 error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l); 640 sounlock(so); 641 return error; 642 } 643 644 int 645 solisten(struct socket *so, int backlog, struct lwp *l) 646 { 647 int error; 648 649 solock(so); 650 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 651 SS_ISDISCONNECTING)) != 0) { 652 sounlock(so); 653 return (EOPNOTSUPP); 654 } 655 error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, 656 NULL, NULL, l); 657 if (error != 0) { 658 sounlock(so); 659 return error; 660 } 661 if (TAILQ_EMPTY(&so->so_q)) 662 so->so_options |= SO_ACCEPTCONN; 663 if (backlog < 0) 664 backlog = 0; 665 so->so_qlimit = min(backlog, somaxconn); 666 sounlock(so); 667 return 0; 668 } 669 670 void 671 sofree(struct socket *so) 672 { 673 u_int refs; 674 675 KASSERT(solocked(so)); 676 677 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { 678 sounlock(so); 679 return; 680 } 681 if (so->so_head) { 682 /* 683 * We must not decommission a socket that's on the accept(2) 684 * queue. If we do, then accept(2) may hang after select(2) 685 * indicated that the listening socket was ready. 686 */ 687 if (!soqremque(so, 0)) { 688 sounlock(so); 689 return; 690 } 691 } 692 if (so->so_rcv.sb_hiwat) 693 (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0, 694 RLIM_INFINITY); 695 if (so->so_snd.sb_hiwat) 696 (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0, 697 RLIM_INFINITY); 698 sbrelease(&so->so_snd, so); 699 KASSERT(!cv_has_waiters(&so->so_cv)); 700 KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv)); 701 KASSERT(!cv_has_waiters(&so->so_snd.sb_cv)); 702 sorflush(so); 703 refs = so->so_aborting; /* XXX */ 704 /* Remove acccept filter if one is present. */ 705 if (so->so_accf != NULL) 706 (void)accept_filt_clear(so); 707 /* kauth_cred_free(so->so_cred); */ 708 sounlock(so); 709 if (refs == 0) /* XXX */ 710 soput(so); 711 } 712 713 /* 714 * Close a socket on last file table reference removal. 715 * Initiate disconnect if connected. 716 * Free socket when disconnect complete. 717 */ 718 int 719 soclose(struct socket *so) 720 { 721 struct socket *so2; 722 int error; 723 int error2; 724 725 error = 0; 726 solock(so); 727 if (so->so_options & SO_ACCEPTCONN) { 728 for (;;) { 729 if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) { 730 KASSERT(solocked2(so, so2)); 731 (void) soqremque(so2, 0); 732 /* soabort drops the lock. */ 733 (void) soabort(so2); 734 solock(so); 735 continue; 736 } 737 if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) { 738 KASSERT(solocked2(so, so2)); 739 (void) soqremque(so2, 1); 740 /* soabort drops the lock. */ 741 (void) soabort(so2); 742 solock(so); 743 continue; 744 } 745 break; 746 } 747 } 748 if (so->so_pcb == 0) 749 goto discard; 750 if (so->so_state & SS_ISCONNECTED) { 751 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 752 error = sodisconnect(so); 753 if (error) 754 goto drop; 755 } 756 if (so->so_options & SO_LINGER) { 757 if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio) 758 goto drop; 759 while (so->so_state & SS_ISCONNECTED) { 760 error = sowait(so, true, so->so_linger * hz); 761 if (error) 762 break; 763 } 764 } 765 } 766 drop: 767 if (so->so_pcb) { 768 error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, 769 NULL, NULL, NULL, NULL); 770 if (error == 0) 771 error = error2; 772 } 773 discard: 774 if (so->so_state & SS_NOFDREF) 775 panic("soclose: NOFDREF"); 776 so->so_state |= SS_NOFDREF; 777 sofree(so); 778 return (error); 779 } 780 781 /* 782 * Must be called with the socket locked.. Will return with it unlocked. 783 */ 784 int 785 soabort(struct socket *so) 786 { 787 u_int refs; 788 int error; 789 790 KASSERT(solocked(so)); 791 KASSERT(so->so_head == NULL); 792 793 so->so_aborting++; /* XXX */ 794 error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, 795 NULL, NULL, NULL); 796 refs = --so->so_aborting; /* XXX */ 797 if (error || (refs == 0)) { 798 sofree(so); 799 } else { 800 sounlock(so); 801 } 802 return error; 803 } 804 805 int 806 soaccept(struct socket *so, struct mbuf *nam) 807 { 808 int error; 809 810 KASSERT(solocked(so)); 811 812 error = 0; 813 if ((so->so_state & SS_NOFDREF) == 0) 814 panic("soaccept: !NOFDREF"); 815 so->so_state &= ~SS_NOFDREF; 816 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 817 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 818 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, 819 NULL, nam, NULL, NULL); 820 else 821 error = ECONNABORTED; 822 823 return (error); 824 } 825 826 int 827 soconnect(struct socket *so, struct mbuf *nam, struct lwp *l) 828 { 829 int error; 830 831 KASSERT(solocked(so)); 832 833 if (so->so_options & SO_ACCEPTCONN) 834 return (EOPNOTSUPP); 835 /* 836 * If protocol is connection-based, can only connect once. 837 * Otherwise, if connected, try to disconnect first. 838 * This allows user to disconnect by connecting to, e.g., 839 * a null address. 840 */ 841 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 842 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 843 (error = sodisconnect(so)))) 844 error = EISCONN; 845 else 846 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, 847 NULL, nam, NULL, l); 848 return (error); 849 } 850 851 int 852 soconnect2(struct socket *so1, struct socket *so2) 853 { 854 int error; 855 856 KASSERT(solocked2(so1, so2)); 857 858 error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, 859 NULL, (struct mbuf *)so2, NULL, NULL); 860 return (error); 861 } 862 863 int 864 sodisconnect(struct socket *so) 865 { 866 int error; 867 868 KASSERT(solocked(so)); 869 870 if ((so->so_state & SS_ISCONNECTED) == 0) { 871 error = ENOTCONN; 872 } else if (so->so_state & SS_ISDISCONNECTING) { 873 error = EALREADY; 874 } else { 875 error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, 876 NULL, NULL, NULL, NULL); 877 } 878 sodopendfree(); 879 return (error); 880 } 881 882 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 883 /* 884 * Send on a socket. 885 * If send must go all at once and message is larger than 886 * send buffering, then hard error. 887 * Lock against other senders. 888 * If must go all at once and not enough room now, then 889 * inform user that this would block and do nothing. 890 * Otherwise, if nonblocking, send as much as possible. 891 * The data to be sent is described by "uio" if nonzero, 892 * otherwise by the mbuf chain "top" (which must be null 893 * if uio is not). Data provided in mbuf chain must be small 894 * enough to send all at once. 895 * 896 * Returns nonzero on error, timeout or signal; callers 897 * must check for short counts if EINTR/ERESTART are returned. 898 * Data and control buffers are freed on return. 899 */ 900 int 901 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 902 struct mbuf *control, int flags, struct lwp *l) 903 { 904 struct mbuf **mp, *m; 905 struct proc *p; 906 long space, len, resid, clen, mlen; 907 int error, s, dontroute, atomic; 908 909 p = l->l_proc; 910 sodopendfree(); 911 clen = 0; 912 913 /* 914 * solock() provides atomicity of access. splsoftnet() prevents 915 * protocol processing soft interrupts from interrupting us and 916 * blocking (expensive). 917 */ 918 s = splsoftnet(); 919 solock(so); 920 atomic = sosendallatonce(so) || top; 921 if (uio) 922 resid = uio->uio_resid; 923 else 924 resid = top->m_pkthdr.len; 925 /* 926 * In theory resid should be unsigned. 927 * However, space must be signed, as it might be less than 0 928 * if we over-committed, and we must use a signed comparison 929 * of space and resid. On the other hand, a negative resid 930 * causes us to loop sending 0-length segments to the protocol. 931 */ 932 if (resid < 0) { 933 error = EINVAL; 934 goto out; 935 } 936 dontroute = 937 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 938 (so->so_proto->pr_flags & PR_ATOMIC); 939 l->l_ru.ru_msgsnd++; 940 if (control) 941 clen = control->m_len; 942 restart: 943 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) 944 goto out; 945 do { 946 if (so->so_state & SS_CANTSENDMORE) { 947 error = EPIPE; 948 goto release; 949 } 950 if (so->so_error) { 951 error = so->so_error; 952 so->so_error = 0; 953 goto release; 954 } 955 if ((so->so_state & SS_ISCONNECTED) == 0) { 956 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 957 if ((so->so_state & SS_ISCONFIRMING) == 0 && 958 !(resid == 0 && clen != 0)) { 959 error = ENOTCONN; 960 goto release; 961 } 962 } else if (addr == 0) { 963 error = EDESTADDRREQ; 964 goto release; 965 } 966 } 967 space = sbspace(&so->so_snd); 968 if (flags & MSG_OOB) 969 space += 1024; 970 if ((atomic && resid > so->so_snd.sb_hiwat) || 971 clen > so->so_snd.sb_hiwat) { 972 error = EMSGSIZE; 973 goto release; 974 } 975 if (space < resid + clen && 976 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 977 if (so->so_nbio) { 978 error = EWOULDBLOCK; 979 goto release; 980 } 981 sbunlock(&so->so_snd); 982 error = sbwait(&so->so_snd); 983 if (error) 984 goto out; 985 goto restart; 986 } 987 mp = ⊤ 988 space -= clen; 989 do { 990 if (uio == NULL) { 991 /* 992 * Data is prepackaged in "top". 993 */ 994 resid = 0; 995 if (flags & MSG_EOR) 996 top->m_flags |= M_EOR; 997 } else do { 998 sounlock(so); 999 splx(s); 1000 if (top == NULL) { 1001 m = m_gethdr(M_WAIT, MT_DATA); 1002 mlen = MHLEN; 1003 m->m_pkthdr.len = 0; 1004 m->m_pkthdr.rcvif = NULL; 1005 } else { 1006 m = m_get(M_WAIT, MT_DATA); 1007 mlen = MLEN; 1008 } 1009 MCLAIM(m, so->so_snd.sb_mowner); 1010 if (sock_loan_thresh >= 0 && 1011 uio->uio_iov->iov_len >= sock_loan_thresh && 1012 space >= sock_loan_thresh && 1013 (len = sosend_loan(so, uio, m, 1014 space)) != 0) { 1015 SOSEND_COUNTER_INCR(&sosend_loan_big); 1016 space -= len; 1017 goto have_data; 1018 } 1019 if (resid >= MINCLSIZE && space >= MCLBYTES) { 1020 SOSEND_COUNTER_INCR(&sosend_copy_big); 1021 m_clget(m, M_WAIT); 1022 if ((m->m_flags & M_EXT) == 0) 1023 goto nopages; 1024 mlen = MCLBYTES; 1025 if (atomic && top == 0) { 1026 len = lmin(MCLBYTES - max_hdr, 1027 resid); 1028 m->m_data += max_hdr; 1029 } else 1030 len = lmin(MCLBYTES, resid); 1031 space -= len; 1032 } else { 1033 nopages: 1034 SOSEND_COUNTER_INCR(&sosend_copy_small); 1035 len = lmin(lmin(mlen, resid), space); 1036 space -= len; 1037 /* 1038 * For datagram protocols, leave room 1039 * for protocol headers in first mbuf. 1040 */ 1041 if (atomic && top == 0 && len < mlen) 1042 MH_ALIGN(m, len); 1043 } 1044 error = uiomove(mtod(m, void *), (int)len, uio); 1045 have_data: 1046 resid = uio->uio_resid; 1047 m->m_len = len; 1048 *mp = m; 1049 top->m_pkthdr.len += len; 1050 s = splsoftnet(); 1051 solock(so); 1052 if (error != 0) 1053 goto release; 1054 mp = &m->m_next; 1055 if (resid <= 0) { 1056 if (flags & MSG_EOR) 1057 top->m_flags |= M_EOR; 1058 break; 1059 } 1060 } while (space > 0 && atomic); 1061 1062 if (so->so_state & SS_CANTSENDMORE) { 1063 error = EPIPE; 1064 goto release; 1065 } 1066 if (dontroute) 1067 so->so_options |= SO_DONTROUTE; 1068 if (resid > 0) 1069 so->so_state |= SS_MORETOCOME; 1070 error = (*so->so_proto->pr_usrreq)(so, 1071 (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, 1072 top, addr, control, curlwp); 1073 if (dontroute) 1074 so->so_options &= ~SO_DONTROUTE; 1075 if (resid > 0) 1076 so->so_state &= ~SS_MORETOCOME; 1077 clen = 0; 1078 control = NULL; 1079 top = NULL; 1080 mp = ⊤ 1081 if (error != 0) 1082 goto release; 1083 } while (resid && space > 0); 1084 } while (resid); 1085 1086 release: 1087 sbunlock(&so->so_snd); 1088 out: 1089 sounlock(so); 1090 splx(s); 1091 if (top) 1092 m_freem(top); 1093 if (control) 1094 m_freem(control); 1095 return (error); 1096 } 1097 1098 /* 1099 * Following replacement or removal of the first mbuf on the first 1100 * mbuf chain of a socket buffer, push necessary state changes back 1101 * into the socket buffer so that other consumers see the values 1102 * consistently. 'nextrecord' is the callers locally stored value of 1103 * the original value of sb->sb_mb->m_nextpkt which must be restored 1104 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL. 1105 */ 1106 static void 1107 sbsync(struct sockbuf *sb, struct mbuf *nextrecord) 1108 { 1109 1110 KASSERT(solocked(sb->sb_so)); 1111 1112 /* 1113 * First, update for the new value of nextrecord. If necessary, 1114 * make it the first record. 1115 */ 1116 if (sb->sb_mb != NULL) 1117 sb->sb_mb->m_nextpkt = nextrecord; 1118 else 1119 sb->sb_mb = nextrecord; 1120 1121 /* 1122 * Now update any dependent socket buffer fields to reflect 1123 * the new state. This is an inline of SB_EMPTY_FIXUP, with 1124 * the addition of a second clause that takes care of the 1125 * case where sb_mb has been updated, but remains the last 1126 * record. 1127 */ 1128 if (sb->sb_mb == NULL) { 1129 sb->sb_mbtail = NULL; 1130 sb->sb_lastrecord = NULL; 1131 } else if (sb->sb_mb->m_nextpkt == NULL) 1132 sb->sb_lastrecord = sb->sb_mb; 1133 } 1134 1135 /* 1136 * Implement receive operations on a socket. 1137 * We depend on the way that records are added to the sockbuf 1138 * by sbappend*. In particular, each record (mbufs linked through m_next) 1139 * must begin with an address if the protocol so specifies, 1140 * followed by an optional mbuf or mbufs containing ancillary data, 1141 * and then zero or more mbufs of data. 1142 * In order to avoid blocking network interrupts for the entire time here, 1143 * we splx() while doing the actual copy to user space. 1144 * Although the sockbuf is locked, new data may still be appended, 1145 * and thus we must maintain consistency of the sockbuf during that time. 1146 * 1147 * The caller may receive the data as a single mbuf chain by supplying 1148 * an mbuf **mp0 for use in returning the chain. The uio is then used 1149 * only for the count in uio_resid. 1150 */ 1151 int 1152 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 1153 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1154 { 1155 struct lwp *l = curlwp; 1156 struct mbuf *m, **mp, *mt; 1157 int atomic, flags, len, error, s, offset, moff, type, orig_resid; 1158 const struct protosw *pr; 1159 struct mbuf *nextrecord; 1160 int mbuf_removed = 0; 1161 const struct domain *dom; 1162 1163 pr = so->so_proto; 1164 atomic = pr->pr_flags & PR_ATOMIC; 1165 dom = pr->pr_domain; 1166 mp = mp0; 1167 type = 0; 1168 orig_resid = uio->uio_resid; 1169 1170 if (paddr != NULL) 1171 *paddr = NULL; 1172 if (controlp != NULL) 1173 *controlp = NULL; 1174 if (flagsp != NULL) 1175 flags = *flagsp &~ MSG_EOR; 1176 else 1177 flags = 0; 1178 1179 if ((flags & MSG_DONTWAIT) == 0) 1180 sodopendfree(); 1181 1182 if (flags & MSG_OOB) { 1183 m = m_get(M_WAIT, MT_DATA); 1184 solock(so); 1185 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, 1186 (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l); 1187 sounlock(so); 1188 if (error) 1189 goto bad; 1190 do { 1191 error = uiomove(mtod(m, void *), 1192 (int) min(uio->uio_resid, m->m_len), uio); 1193 m = m_free(m); 1194 } while (uio->uio_resid > 0 && error == 0 && m); 1195 bad: 1196 if (m != NULL) 1197 m_freem(m); 1198 return error; 1199 } 1200 if (mp != NULL) 1201 *mp = NULL; 1202 1203 /* 1204 * solock() provides atomicity of access. splsoftnet() prevents 1205 * protocol processing soft interrupts from interrupting us and 1206 * blocking (expensive). 1207 */ 1208 s = splsoftnet(); 1209 solock(so); 1210 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 1211 (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l); 1212 1213 restart: 1214 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) { 1215 sounlock(so); 1216 splx(s); 1217 return error; 1218 } 1219 1220 m = so->so_rcv.sb_mb; 1221 /* 1222 * If we have less data than requested, block awaiting more 1223 * (subject to any timeout) if: 1224 * 1. the current count is less than the low water mark, 1225 * 2. MSG_WAITALL is set, and it is possible to do the entire 1226 * receive operation at once if we block (resid <= hiwat), or 1227 * 3. MSG_DONTWAIT is not set. 1228 * If MSG_WAITALL is set but resid is larger than the receive buffer, 1229 * we have to do the receive in sections, and thus risk returning 1230 * a short count if a timeout or signal occurs after we start. 1231 */ 1232 if (m == NULL || 1233 ((flags & MSG_DONTWAIT) == 0 && 1234 so->so_rcv.sb_cc < uio->uio_resid && 1235 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 1236 ((flags & MSG_WAITALL) && 1237 uio->uio_resid <= so->so_rcv.sb_hiwat)) && 1238 m->m_nextpkt == NULL && !atomic)) { 1239 #ifdef DIAGNOSTIC 1240 if (m == NULL && so->so_rcv.sb_cc) 1241 panic("receive 1"); 1242 #endif 1243 if (so->so_error) { 1244 if (m != NULL) 1245 goto dontblock; 1246 error = so->so_error; 1247 if ((flags & MSG_PEEK) == 0) 1248 so->so_error = 0; 1249 goto release; 1250 } 1251 if (so->so_state & SS_CANTRCVMORE) { 1252 if (m != NULL) 1253 goto dontblock; 1254 else 1255 goto release; 1256 } 1257 for (; m != NULL; m = m->m_next) 1258 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1259 m = so->so_rcv.sb_mb; 1260 goto dontblock; 1261 } 1262 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1263 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1264 error = ENOTCONN; 1265 goto release; 1266 } 1267 if (uio->uio_resid == 0) 1268 goto release; 1269 if (so->so_nbio || (flags & MSG_DONTWAIT)) { 1270 error = EWOULDBLOCK; 1271 goto release; 1272 } 1273 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 1274 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 1275 sbunlock(&so->so_rcv); 1276 error = sbwait(&so->so_rcv); 1277 if (error != 0) { 1278 sounlock(so); 1279 splx(s); 1280 return error; 1281 } 1282 goto restart; 1283 } 1284 dontblock: 1285 /* 1286 * On entry here, m points to the first record of the socket buffer. 1287 * From this point onward, we maintain 'nextrecord' as a cache of the 1288 * pointer to the next record in the socket buffer. We must keep the 1289 * various socket buffer pointers and local stack versions of the 1290 * pointers in sync, pushing out modifications before dropping the 1291 * socket lock, and re-reading them when picking it up. 1292 * 1293 * Otherwise, we will race with the network stack appending new data 1294 * or records onto the socket buffer by using inconsistent/stale 1295 * versions of the field, possibly resulting in socket buffer 1296 * corruption. 1297 * 1298 * By holding the high-level sblock(), we prevent simultaneous 1299 * readers from pulling off the front of the socket buffer. 1300 */ 1301 if (l != NULL) 1302 l->l_ru.ru_msgrcv++; 1303 KASSERT(m == so->so_rcv.sb_mb); 1304 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 1305 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 1306 nextrecord = m->m_nextpkt; 1307 if (pr->pr_flags & PR_ADDR) { 1308 #ifdef DIAGNOSTIC 1309 if (m->m_type != MT_SONAME) 1310 panic("receive 1a"); 1311 #endif 1312 orig_resid = 0; 1313 if (flags & MSG_PEEK) { 1314 if (paddr) 1315 *paddr = m_copy(m, 0, m->m_len); 1316 m = m->m_next; 1317 } else { 1318 sbfree(&so->so_rcv, m); 1319 mbuf_removed = 1; 1320 if (paddr != NULL) { 1321 *paddr = m; 1322 so->so_rcv.sb_mb = m->m_next; 1323 m->m_next = NULL; 1324 m = so->so_rcv.sb_mb; 1325 } else { 1326 MFREE(m, so->so_rcv.sb_mb); 1327 m = so->so_rcv.sb_mb; 1328 } 1329 sbsync(&so->so_rcv, nextrecord); 1330 } 1331 } 1332 1333 /* 1334 * Process one or more MT_CONTROL mbufs present before any data mbufs 1335 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1336 * just copy the data; if !MSG_PEEK, we call into the protocol to 1337 * perform externalization (or freeing if controlp == NULL). 1338 */ 1339 if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) { 1340 struct mbuf *cm = NULL, *cmn; 1341 struct mbuf **cme = &cm; 1342 1343 do { 1344 if (flags & MSG_PEEK) { 1345 if (controlp != NULL) { 1346 *controlp = m_copy(m, 0, m->m_len); 1347 controlp = &(*controlp)->m_next; 1348 } 1349 m = m->m_next; 1350 } else { 1351 sbfree(&so->so_rcv, m); 1352 so->so_rcv.sb_mb = m->m_next; 1353 m->m_next = NULL; 1354 *cme = m; 1355 cme = &(*cme)->m_next; 1356 m = so->so_rcv.sb_mb; 1357 } 1358 } while (m != NULL && m->m_type == MT_CONTROL); 1359 if ((flags & MSG_PEEK) == 0) 1360 sbsync(&so->so_rcv, nextrecord); 1361 for (; cm != NULL; cm = cmn) { 1362 cmn = cm->m_next; 1363 cm->m_next = NULL; 1364 type = mtod(cm, struct cmsghdr *)->cmsg_type; 1365 if (controlp != NULL) { 1366 if (dom->dom_externalize != NULL && 1367 type == SCM_RIGHTS) { 1368 sounlock(so); 1369 splx(s); 1370 error = (*dom->dom_externalize)(cm, l); 1371 s = splsoftnet(); 1372 solock(so); 1373 } 1374 *controlp = cm; 1375 while (*controlp != NULL) 1376 controlp = &(*controlp)->m_next; 1377 } else { 1378 /* 1379 * Dispose of any SCM_RIGHTS message that went 1380 * through the read path rather than recv. 1381 */ 1382 if (dom->dom_dispose != NULL && 1383 type == SCM_RIGHTS) { 1384 sounlock(so); 1385 (*dom->dom_dispose)(cm); 1386 solock(so); 1387 } 1388 m_freem(cm); 1389 } 1390 } 1391 if (m != NULL) 1392 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1393 else 1394 nextrecord = so->so_rcv.sb_mb; 1395 orig_resid = 0; 1396 } 1397 1398 /* If m is non-NULL, we have some data to read. */ 1399 if (__predict_true(m != NULL)) { 1400 type = m->m_type; 1401 if (type == MT_OOBDATA) 1402 flags |= MSG_OOB; 1403 } 1404 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 1405 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 1406 1407 moff = 0; 1408 offset = 0; 1409 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1410 if (m->m_type == MT_OOBDATA) { 1411 if (type != MT_OOBDATA) 1412 break; 1413 } else if (type == MT_OOBDATA) 1414 break; 1415 #ifdef DIAGNOSTIC 1416 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 1417 panic("receive 3"); 1418 #endif 1419 so->so_state &= ~SS_RCVATMARK; 1420 len = uio->uio_resid; 1421 if (so->so_oobmark && len > so->so_oobmark - offset) 1422 len = so->so_oobmark - offset; 1423 if (len > m->m_len - moff) 1424 len = m->m_len - moff; 1425 /* 1426 * If mp is set, just pass back the mbufs. 1427 * Otherwise copy them out via the uio, then free. 1428 * Sockbuf must be consistent here (points to current mbuf, 1429 * it points to next record) when we drop priority; 1430 * we must note any additions to the sockbuf when we 1431 * block interrupts again. 1432 */ 1433 if (mp == NULL) { 1434 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 1435 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 1436 sounlock(so); 1437 splx(s); 1438 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1439 s = splsoftnet(); 1440 solock(so); 1441 if (error != 0) { 1442 /* 1443 * If any part of the record has been removed 1444 * (such as the MT_SONAME mbuf, which will 1445 * happen when PR_ADDR, and thus also 1446 * PR_ATOMIC, is set), then drop the entire 1447 * record to maintain the atomicity of the 1448 * receive operation. 1449 * 1450 * This avoids a later panic("receive 1a") 1451 * when compiled with DIAGNOSTIC. 1452 */ 1453 if (m && mbuf_removed && atomic) 1454 (void) sbdroprecord(&so->so_rcv); 1455 1456 goto release; 1457 } 1458 } else 1459 uio->uio_resid -= len; 1460 if (len == m->m_len - moff) { 1461 if (m->m_flags & M_EOR) 1462 flags |= MSG_EOR; 1463 if (flags & MSG_PEEK) { 1464 m = m->m_next; 1465 moff = 0; 1466 } else { 1467 nextrecord = m->m_nextpkt; 1468 sbfree(&so->so_rcv, m); 1469 if (mp) { 1470 *mp = m; 1471 mp = &m->m_next; 1472 so->so_rcv.sb_mb = m = m->m_next; 1473 *mp = NULL; 1474 } else { 1475 MFREE(m, so->so_rcv.sb_mb); 1476 m = so->so_rcv.sb_mb; 1477 } 1478 /* 1479 * If m != NULL, we also know that 1480 * so->so_rcv.sb_mb != NULL. 1481 */ 1482 KASSERT(so->so_rcv.sb_mb == m); 1483 if (m) { 1484 m->m_nextpkt = nextrecord; 1485 if (nextrecord == NULL) 1486 so->so_rcv.sb_lastrecord = m; 1487 } else { 1488 so->so_rcv.sb_mb = nextrecord; 1489 SB_EMPTY_FIXUP(&so->so_rcv); 1490 } 1491 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 1492 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 1493 } 1494 } else if (flags & MSG_PEEK) 1495 moff += len; 1496 else { 1497 if (mp != NULL) { 1498 mt = m_copym(m, 0, len, M_NOWAIT); 1499 if (__predict_false(mt == NULL)) { 1500 sounlock(so); 1501 mt = m_copym(m, 0, len, M_WAIT); 1502 solock(so); 1503 } 1504 *mp = mt; 1505 } 1506 m->m_data += len; 1507 m->m_len -= len; 1508 so->so_rcv.sb_cc -= len; 1509 } 1510 if (so->so_oobmark) { 1511 if ((flags & MSG_PEEK) == 0) { 1512 so->so_oobmark -= len; 1513 if (so->so_oobmark == 0) { 1514 so->so_state |= SS_RCVATMARK; 1515 break; 1516 } 1517 } else { 1518 offset += len; 1519 if (offset == so->so_oobmark) 1520 break; 1521 } 1522 } 1523 if (flags & MSG_EOR) 1524 break; 1525 /* 1526 * If the MSG_WAITALL flag is set (for non-atomic socket), 1527 * we must not quit until "uio->uio_resid == 0" or an error 1528 * termination. If a signal/timeout occurs, return 1529 * with a short count but without error. 1530 * Keep sockbuf locked against other readers. 1531 */ 1532 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1533 !sosendallatonce(so) && !nextrecord) { 1534 if (so->so_error || so->so_state & SS_CANTRCVMORE) 1535 break; 1536 /* 1537 * If we are peeking and the socket receive buffer is 1538 * full, stop since we can't get more data to peek at. 1539 */ 1540 if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0) 1541 break; 1542 /* 1543 * If we've drained the socket buffer, tell the 1544 * protocol in case it needs to do something to 1545 * get it filled again. 1546 */ 1547 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) 1548 (*pr->pr_usrreq)(so, PRU_RCVD, 1549 NULL, (struct mbuf *)(long)flags, NULL, l); 1550 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 1551 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 1552 error = sbwait(&so->so_rcv); 1553 if (error != 0) { 1554 sbunlock(&so->so_rcv); 1555 sounlock(so); 1556 splx(s); 1557 return 0; 1558 } 1559 if ((m = so->so_rcv.sb_mb) != NULL) 1560 nextrecord = m->m_nextpkt; 1561 } 1562 } 1563 1564 if (m && atomic) { 1565 flags |= MSG_TRUNC; 1566 if ((flags & MSG_PEEK) == 0) 1567 (void) sbdroprecord(&so->so_rcv); 1568 } 1569 if ((flags & MSG_PEEK) == 0) { 1570 if (m == NULL) { 1571 /* 1572 * First part is an inline SB_EMPTY_FIXUP(). Second 1573 * part makes sure sb_lastrecord is up-to-date if 1574 * there is still data in the socket buffer. 1575 */ 1576 so->so_rcv.sb_mb = nextrecord; 1577 if (so->so_rcv.sb_mb == NULL) { 1578 so->so_rcv.sb_mbtail = NULL; 1579 so->so_rcv.sb_lastrecord = NULL; 1580 } else if (nextrecord->m_nextpkt == NULL) 1581 so->so_rcv.sb_lastrecord = nextrecord; 1582 } 1583 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 1584 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 1585 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 1586 (*pr->pr_usrreq)(so, PRU_RCVD, NULL, 1587 (struct mbuf *)(long)flags, NULL, l); 1588 } 1589 if (orig_resid == uio->uio_resid && orig_resid && 1590 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 1591 sbunlock(&so->so_rcv); 1592 goto restart; 1593 } 1594 1595 if (flagsp != NULL) 1596 *flagsp |= flags; 1597 release: 1598 sbunlock(&so->so_rcv); 1599 sounlock(so); 1600 splx(s); 1601 return error; 1602 } 1603 1604 int 1605 soshutdown(struct socket *so, int how) 1606 { 1607 const struct protosw *pr; 1608 int error; 1609 1610 KASSERT(solocked(so)); 1611 1612 pr = so->so_proto; 1613 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1614 return (EINVAL); 1615 1616 if (how == SHUT_RD || how == SHUT_RDWR) { 1617 sorflush(so); 1618 error = 0; 1619 } 1620 if (how == SHUT_WR || how == SHUT_RDWR) 1621 error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, 1622 NULL, NULL, NULL); 1623 1624 return error; 1625 } 1626 1627 int 1628 sodrain(struct socket *so) 1629 { 1630 int error; 1631 1632 solock(so); 1633 so->so_state |= SS_ISDRAINING; 1634 cv_broadcast(&so->so_cv); 1635 error = soshutdown(so, SHUT_RDWR); 1636 sounlock(so); 1637 1638 return error; 1639 } 1640 1641 void 1642 sorflush(struct socket *so) 1643 { 1644 struct sockbuf *sb, asb; 1645 const struct protosw *pr; 1646 1647 KASSERT(solocked(so)); 1648 1649 sb = &so->so_rcv; 1650 pr = so->so_proto; 1651 socantrcvmore(so); 1652 sb->sb_flags |= SB_NOINTR; 1653 (void )sblock(sb, M_WAITOK); 1654 sbunlock(sb); 1655 asb = *sb; 1656 /* 1657 * Clear most of the sockbuf structure, but leave some of the 1658 * fields valid. 1659 */ 1660 memset(&sb->sb_startzero, 0, 1661 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1662 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) { 1663 sounlock(so); 1664 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 1665 solock(so); 1666 } 1667 sbrelease(&asb, so); 1668 } 1669 1670 /* 1671 * internal set SOL_SOCKET options 1672 */ 1673 static int 1674 sosetopt1(struct socket *so, const struct sockopt *sopt) 1675 { 1676 int error = EINVAL, optval, opt; 1677 struct linger l; 1678 struct timeval tv; 1679 1680 switch ((opt = sopt->sopt_name)) { 1681 1682 case SO_ACCEPTFILTER: 1683 error = accept_filt_setopt(so, sopt); 1684 KASSERT(solocked(so)); 1685 break; 1686 1687 case SO_LINGER: 1688 error = sockopt_get(sopt, &l, sizeof(l)); 1689 solock(so); 1690 if (error) 1691 break; 1692 if (l.l_linger < 0 || l.l_linger > USHRT_MAX || 1693 l.l_linger > (INT_MAX / hz)) { 1694 error = EDOM; 1695 break; 1696 } 1697 so->so_linger = l.l_linger; 1698 if (l.l_onoff) 1699 so->so_options |= SO_LINGER; 1700 else 1701 so->so_options &= ~SO_LINGER; 1702 break; 1703 1704 case SO_DEBUG: 1705 case SO_KEEPALIVE: 1706 case SO_DONTROUTE: 1707 case SO_USELOOPBACK: 1708 case SO_BROADCAST: 1709 case SO_REUSEADDR: 1710 case SO_REUSEPORT: 1711 case SO_OOBINLINE: 1712 case SO_TIMESTAMP: 1713 #ifdef SO_OTIMESTAMP 1714 case SO_OTIMESTAMP: 1715 #endif 1716 error = sockopt_getint(sopt, &optval); 1717 solock(so); 1718 if (error) 1719 break; 1720 if (optval) 1721 so->so_options |= opt; 1722 else 1723 so->so_options &= ~opt; 1724 break; 1725 1726 case SO_SNDBUF: 1727 case SO_RCVBUF: 1728 case SO_SNDLOWAT: 1729 case SO_RCVLOWAT: 1730 error = sockopt_getint(sopt, &optval); 1731 solock(so); 1732 if (error) 1733 break; 1734 1735 /* 1736 * Values < 1 make no sense for any of these 1737 * options, so disallow them. 1738 */ 1739 if (optval < 1) { 1740 error = EINVAL; 1741 break; 1742 } 1743 1744 switch (opt) { 1745 case SO_SNDBUF: 1746 if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) { 1747 error = ENOBUFS; 1748 break; 1749 } 1750 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 1751 break; 1752 1753 case SO_RCVBUF: 1754 if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) { 1755 error = ENOBUFS; 1756 break; 1757 } 1758 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1759 break; 1760 1761 /* 1762 * Make sure the low-water is never greater than 1763 * the high-water. 1764 */ 1765 case SO_SNDLOWAT: 1766 if (optval > so->so_snd.sb_hiwat) 1767 optval = so->so_snd.sb_hiwat; 1768 1769 so->so_snd.sb_lowat = optval; 1770 break; 1771 1772 case SO_RCVLOWAT: 1773 if (optval > so->so_rcv.sb_hiwat) 1774 optval = so->so_rcv.sb_hiwat; 1775 1776 so->so_rcv.sb_lowat = optval; 1777 break; 1778 } 1779 break; 1780 1781 #ifdef COMPAT_50 1782 case SO_OSNDTIMEO: 1783 case SO_ORCVTIMEO: { 1784 struct timeval50 otv; 1785 error = sockopt_get(sopt, &otv, sizeof(otv)); 1786 if (error) { 1787 solock(so); 1788 break; 1789 } 1790 timeval50_to_timeval(&otv, &tv); 1791 opt = opt == SO_OSNDTIMEO ? SO_SNDTIMEO : SO_RCVTIMEO; 1792 error = 0; 1793 /*FALLTHROUGH*/ 1794 } 1795 #endif /* COMPAT_50 */ 1796 1797 case SO_SNDTIMEO: 1798 case SO_RCVTIMEO: 1799 if (error) 1800 error = sockopt_get(sopt, &tv, sizeof(tv)); 1801 solock(so); 1802 if (error) 1803 break; 1804 1805 if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) { 1806 error = EDOM; 1807 break; 1808 } 1809 1810 optval = tv.tv_sec * hz + tv.tv_usec / tick; 1811 if (optval == 0 && tv.tv_usec != 0) 1812 optval = 1; 1813 1814 switch (opt) { 1815 case SO_SNDTIMEO: 1816 so->so_snd.sb_timeo = optval; 1817 break; 1818 case SO_RCVTIMEO: 1819 so->so_rcv.sb_timeo = optval; 1820 break; 1821 } 1822 break; 1823 1824 default: 1825 solock(so); 1826 error = ENOPROTOOPT; 1827 break; 1828 } 1829 KASSERT(solocked(so)); 1830 return error; 1831 } 1832 1833 int 1834 sosetopt(struct socket *so, struct sockopt *sopt) 1835 { 1836 int error, prerr; 1837 1838 if (sopt->sopt_level == SOL_SOCKET) { 1839 error = sosetopt1(so, sopt); 1840 KASSERT(solocked(so)); 1841 } else { 1842 error = ENOPROTOOPT; 1843 solock(so); 1844 } 1845 1846 if ((error == 0 || error == ENOPROTOOPT) && 1847 so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) { 1848 /* give the protocol stack a shot */ 1849 prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt); 1850 if (prerr == 0) 1851 error = 0; 1852 else if (prerr != ENOPROTOOPT) 1853 error = prerr; 1854 } 1855 sounlock(so); 1856 return error; 1857 } 1858 1859 /* 1860 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt() 1861 */ 1862 int 1863 so_setsockopt(struct lwp *l, struct socket *so, int level, int name, 1864 const void *val, size_t valsize) 1865 { 1866 struct sockopt sopt; 1867 int error; 1868 1869 KASSERT(valsize == 0 || val != NULL); 1870 1871 sockopt_init(&sopt, level, name, valsize); 1872 sockopt_set(&sopt, val, valsize); 1873 1874 error = sosetopt(so, &sopt); 1875 1876 sockopt_destroy(&sopt); 1877 1878 return error; 1879 } 1880 1881 /* 1882 * internal get SOL_SOCKET options 1883 */ 1884 static int 1885 sogetopt1(struct socket *so, struct sockopt *sopt) 1886 { 1887 int error, optval, opt; 1888 struct linger l; 1889 struct timeval tv; 1890 1891 switch ((opt = sopt->sopt_name)) { 1892 1893 case SO_ACCEPTFILTER: 1894 error = accept_filt_getopt(so, sopt); 1895 break; 1896 1897 case SO_LINGER: 1898 l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0; 1899 l.l_linger = so->so_linger; 1900 1901 error = sockopt_set(sopt, &l, sizeof(l)); 1902 break; 1903 1904 case SO_USELOOPBACK: 1905 case SO_DONTROUTE: 1906 case SO_DEBUG: 1907 case SO_KEEPALIVE: 1908 case SO_REUSEADDR: 1909 case SO_REUSEPORT: 1910 case SO_BROADCAST: 1911 case SO_OOBINLINE: 1912 case SO_TIMESTAMP: 1913 #ifdef SO_OTIMESTAMP 1914 case SO_OTIMESTAMP: 1915 #endif 1916 error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0); 1917 break; 1918 1919 case SO_TYPE: 1920 error = sockopt_setint(sopt, so->so_type); 1921 break; 1922 1923 case SO_ERROR: 1924 error = sockopt_setint(sopt, so->so_error); 1925 so->so_error = 0; 1926 break; 1927 1928 case SO_SNDBUF: 1929 error = sockopt_setint(sopt, so->so_snd.sb_hiwat); 1930 break; 1931 1932 case SO_RCVBUF: 1933 error = sockopt_setint(sopt, so->so_rcv.sb_hiwat); 1934 break; 1935 1936 case SO_SNDLOWAT: 1937 error = sockopt_setint(sopt, so->so_snd.sb_lowat); 1938 break; 1939 1940 case SO_RCVLOWAT: 1941 error = sockopt_setint(sopt, so->so_rcv.sb_lowat); 1942 break; 1943 1944 #ifdef COMPAT_50 1945 case SO_OSNDTIMEO: 1946 case SO_ORCVTIMEO: { 1947 struct timeval50 otv; 1948 1949 optval = (opt == SO_OSNDTIMEO ? 1950 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1951 1952 otv.tv_sec = optval / hz; 1953 otv.tv_usec = (optval % hz) * tick; 1954 1955 error = sockopt_set(sopt, &otv, sizeof(otv)); 1956 break; 1957 } 1958 #endif /* COMPAT_50 */ 1959 1960 case SO_SNDTIMEO: 1961 case SO_RCVTIMEO: 1962 optval = (opt == SO_SNDTIMEO ? 1963 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1964 1965 tv.tv_sec = optval / hz; 1966 tv.tv_usec = (optval % hz) * tick; 1967 1968 error = sockopt_set(sopt, &tv, sizeof(tv)); 1969 break; 1970 1971 case SO_OVERFLOWED: 1972 error = sockopt_setint(sopt, so->so_rcv.sb_overflowed); 1973 break; 1974 1975 default: 1976 error = ENOPROTOOPT; 1977 break; 1978 } 1979 1980 return (error); 1981 } 1982 1983 int 1984 sogetopt(struct socket *so, struct sockopt *sopt) 1985 { 1986 int error; 1987 1988 solock(so); 1989 if (sopt->sopt_level != SOL_SOCKET) { 1990 if (so->so_proto && so->so_proto->pr_ctloutput) { 1991 error = ((*so->so_proto->pr_ctloutput) 1992 (PRCO_GETOPT, so, sopt)); 1993 } else 1994 error = (ENOPROTOOPT); 1995 } else { 1996 error = sogetopt1(so, sopt); 1997 } 1998 sounlock(so); 1999 return (error); 2000 } 2001 2002 /* 2003 * alloc sockopt data buffer buffer 2004 * - will be released at destroy 2005 */ 2006 static int 2007 sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag) 2008 { 2009 2010 KASSERT(sopt->sopt_size == 0); 2011 2012 if (len > sizeof(sopt->sopt_buf)) { 2013 sopt->sopt_data = kmem_zalloc(len, kmflag); 2014 if (sopt->sopt_data == NULL) 2015 return ENOMEM; 2016 } else 2017 sopt->sopt_data = sopt->sopt_buf; 2018 2019 sopt->sopt_size = len; 2020 return 0; 2021 } 2022 2023 /* 2024 * initialise sockopt storage 2025 * - MAY sleep during allocation 2026 */ 2027 void 2028 sockopt_init(struct sockopt *sopt, int level, int name, size_t size) 2029 { 2030 2031 memset(sopt, 0, sizeof(*sopt)); 2032 2033 sopt->sopt_level = level; 2034 sopt->sopt_name = name; 2035 (void)sockopt_alloc(sopt, size, KM_SLEEP); 2036 } 2037 2038 /* 2039 * destroy sockopt storage 2040 * - will release any held memory references 2041 */ 2042 void 2043 sockopt_destroy(struct sockopt *sopt) 2044 { 2045 2046 if (sopt->sopt_data != sopt->sopt_buf) 2047 kmem_free(sopt->sopt_data, sopt->sopt_size); 2048 2049 memset(sopt, 0, sizeof(*sopt)); 2050 } 2051 2052 /* 2053 * set sockopt value 2054 * - value is copied into sockopt 2055 * - memory is allocated when necessary, will not sleep 2056 */ 2057 int 2058 sockopt_set(struct sockopt *sopt, const void *buf, size_t len) 2059 { 2060 int error; 2061 2062 if (sopt->sopt_size == 0) { 2063 error = sockopt_alloc(sopt, len, KM_NOSLEEP); 2064 if (error) 2065 return error; 2066 } 2067 2068 KASSERT(sopt->sopt_size == len); 2069 memcpy(sopt->sopt_data, buf, len); 2070 return 0; 2071 } 2072 2073 /* 2074 * common case of set sockopt integer value 2075 */ 2076 int 2077 sockopt_setint(struct sockopt *sopt, int val) 2078 { 2079 2080 return sockopt_set(sopt, &val, sizeof(int)); 2081 } 2082 2083 /* 2084 * get sockopt value 2085 * - correct size must be given 2086 */ 2087 int 2088 sockopt_get(const struct sockopt *sopt, void *buf, size_t len) 2089 { 2090 2091 if (sopt->sopt_size != len) 2092 return EINVAL; 2093 2094 memcpy(buf, sopt->sopt_data, len); 2095 return 0; 2096 } 2097 2098 /* 2099 * common case of get sockopt integer value 2100 */ 2101 int 2102 sockopt_getint(const struct sockopt *sopt, int *valp) 2103 { 2104 2105 return sockopt_get(sopt, valp, sizeof(int)); 2106 } 2107 2108 /* 2109 * set sockopt value from mbuf 2110 * - ONLY for legacy code 2111 * - mbuf is released by sockopt 2112 * - will not sleep 2113 */ 2114 int 2115 sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m) 2116 { 2117 size_t len; 2118 int error; 2119 2120 len = m_length(m); 2121 2122 if (sopt->sopt_size == 0) { 2123 error = sockopt_alloc(sopt, len, KM_NOSLEEP); 2124 if (error) 2125 return error; 2126 } 2127 2128 KASSERT(sopt->sopt_size == len); 2129 m_copydata(m, 0, len, sopt->sopt_data); 2130 m_freem(m); 2131 2132 return 0; 2133 } 2134 2135 /* 2136 * get sockopt value into mbuf 2137 * - ONLY for legacy code 2138 * - mbuf to be released by the caller 2139 * - will not sleep 2140 */ 2141 struct mbuf * 2142 sockopt_getmbuf(const struct sockopt *sopt) 2143 { 2144 struct mbuf *m; 2145 2146 if (sopt->sopt_size > MCLBYTES) 2147 return NULL; 2148 2149 m = m_get(M_DONTWAIT, MT_SOOPTS); 2150 if (m == NULL) 2151 return NULL; 2152 2153 if (sopt->sopt_size > MLEN) { 2154 MCLGET(m, M_DONTWAIT); 2155 if ((m->m_flags & M_EXT) == 0) { 2156 m_free(m); 2157 return NULL; 2158 } 2159 } 2160 2161 memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size); 2162 m->m_len = sopt->sopt_size; 2163 2164 return m; 2165 } 2166 2167 void 2168 sohasoutofband(struct socket *so) 2169 { 2170 2171 fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so); 2172 selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT); 2173 } 2174 2175 static void 2176 filt_sordetach(struct knote *kn) 2177 { 2178 struct socket *so; 2179 2180 so = ((file_t *)kn->kn_obj)->f_data; 2181 solock(so); 2182 SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext); 2183 if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist)) 2184 so->so_rcv.sb_flags &= ~SB_KNOTE; 2185 sounlock(so); 2186 } 2187 2188 /*ARGSUSED*/ 2189 static int 2190 filt_soread(struct knote *kn, long hint) 2191 { 2192 struct socket *so; 2193 int rv; 2194 2195 so = ((file_t *)kn->kn_obj)->f_data; 2196 if (hint != NOTE_SUBMIT) 2197 solock(so); 2198 kn->kn_data = so->so_rcv.sb_cc; 2199 if (so->so_state & SS_CANTRCVMORE) { 2200 kn->kn_flags |= EV_EOF; 2201 kn->kn_fflags = so->so_error; 2202 rv = 1; 2203 } else if (so->so_error) /* temporary udp error */ 2204 rv = 1; 2205 else if (kn->kn_sfflags & NOTE_LOWAT) 2206 rv = (kn->kn_data >= kn->kn_sdata); 2207 else 2208 rv = (kn->kn_data >= so->so_rcv.sb_lowat); 2209 if (hint != NOTE_SUBMIT) 2210 sounlock(so); 2211 return rv; 2212 } 2213 2214 static void 2215 filt_sowdetach(struct knote *kn) 2216 { 2217 struct socket *so; 2218 2219 so = ((file_t *)kn->kn_obj)->f_data; 2220 solock(so); 2221 SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext); 2222 if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist)) 2223 so->so_snd.sb_flags &= ~SB_KNOTE; 2224 sounlock(so); 2225 } 2226 2227 /*ARGSUSED*/ 2228 static int 2229 filt_sowrite(struct knote *kn, long hint) 2230 { 2231 struct socket *so; 2232 int rv; 2233 2234 so = ((file_t *)kn->kn_obj)->f_data; 2235 if (hint != NOTE_SUBMIT) 2236 solock(so); 2237 kn->kn_data = sbspace(&so->so_snd); 2238 if (so->so_state & SS_CANTSENDMORE) { 2239 kn->kn_flags |= EV_EOF; 2240 kn->kn_fflags = so->so_error; 2241 rv = 1; 2242 } else if (so->so_error) /* temporary udp error */ 2243 rv = 1; 2244 else if (((so->so_state & SS_ISCONNECTED) == 0) && 2245 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 2246 rv = 0; 2247 else if (kn->kn_sfflags & NOTE_LOWAT) 2248 rv = (kn->kn_data >= kn->kn_sdata); 2249 else 2250 rv = (kn->kn_data >= so->so_snd.sb_lowat); 2251 if (hint != NOTE_SUBMIT) 2252 sounlock(so); 2253 return rv; 2254 } 2255 2256 /*ARGSUSED*/ 2257 static int 2258 filt_solisten(struct knote *kn, long hint) 2259 { 2260 struct socket *so; 2261 int rv; 2262 2263 so = ((file_t *)kn->kn_obj)->f_data; 2264 2265 /* 2266 * Set kn_data to number of incoming connections, not 2267 * counting partial (incomplete) connections. 2268 */ 2269 if (hint != NOTE_SUBMIT) 2270 solock(so); 2271 kn->kn_data = so->so_qlen; 2272 rv = (kn->kn_data > 0); 2273 if (hint != NOTE_SUBMIT) 2274 sounlock(so); 2275 return rv; 2276 } 2277 2278 static const struct filterops solisten_filtops = 2279 { 1, NULL, filt_sordetach, filt_solisten }; 2280 static const struct filterops soread_filtops = 2281 { 1, NULL, filt_sordetach, filt_soread }; 2282 static const struct filterops sowrite_filtops = 2283 { 1, NULL, filt_sowdetach, filt_sowrite }; 2284 2285 int 2286 soo_kqfilter(struct file *fp, struct knote *kn) 2287 { 2288 struct socket *so; 2289 struct sockbuf *sb; 2290 2291 so = ((file_t *)kn->kn_obj)->f_data; 2292 solock(so); 2293 switch (kn->kn_filter) { 2294 case EVFILT_READ: 2295 if (so->so_options & SO_ACCEPTCONN) 2296 kn->kn_fop = &solisten_filtops; 2297 else 2298 kn->kn_fop = &soread_filtops; 2299 sb = &so->so_rcv; 2300 break; 2301 case EVFILT_WRITE: 2302 kn->kn_fop = &sowrite_filtops; 2303 sb = &so->so_snd; 2304 break; 2305 default: 2306 sounlock(so); 2307 return (EINVAL); 2308 } 2309 SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext); 2310 sb->sb_flags |= SB_KNOTE; 2311 sounlock(so); 2312 return (0); 2313 } 2314 2315 static int 2316 sodopoll(struct socket *so, int events) 2317 { 2318 int revents; 2319 2320 revents = 0; 2321 2322 if (events & (POLLIN | POLLRDNORM)) 2323 if (soreadable(so)) 2324 revents |= events & (POLLIN | POLLRDNORM); 2325 2326 if (events & (POLLOUT | POLLWRNORM)) 2327 if (sowritable(so)) 2328 revents |= events & (POLLOUT | POLLWRNORM); 2329 2330 if (events & (POLLPRI | POLLRDBAND)) 2331 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) 2332 revents |= events & (POLLPRI | POLLRDBAND); 2333 2334 return revents; 2335 } 2336 2337 int 2338 sopoll(struct socket *so, int events) 2339 { 2340 int revents = 0; 2341 2342 #ifndef DIAGNOSTIC 2343 /* 2344 * Do a quick, unlocked check in expectation that the socket 2345 * will be ready for I/O. Don't do this check if DIAGNOSTIC, 2346 * as the solocked() assertions will fail. 2347 */ 2348 if ((revents = sodopoll(so, events)) != 0) 2349 return revents; 2350 #endif 2351 2352 solock(so); 2353 if ((revents = sodopoll(so, events)) == 0) { 2354 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 2355 selrecord(curlwp, &so->so_rcv.sb_sel); 2356 so->so_rcv.sb_flags |= SB_NOTIFY; 2357 } 2358 2359 if (events & (POLLOUT | POLLWRNORM)) { 2360 selrecord(curlwp, &so->so_snd.sb_sel); 2361 so->so_snd.sb_flags |= SB_NOTIFY; 2362 } 2363 } 2364 sounlock(so); 2365 2366 return revents; 2367 } 2368 2369 2370 #include <sys/sysctl.h> 2371 2372 static int sysctl_kern_somaxkva(SYSCTLFN_PROTO); 2373 2374 /* 2375 * sysctl helper routine for kern.somaxkva. ensures that the given 2376 * value is not too small. 2377 * (XXX should we maybe make sure it's not too large as well?) 2378 */ 2379 static int 2380 sysctl_kern_somaxkva(SYSCTLFN_ARGS) 2381 { 2382 int error, new_somaxkva; 2383 struct sysctlnode node; 2384 2385 new_somaxkva = somaxkva; 2386 node = *rnode; 2387 node.sysctl_data = &new_somaxkva; 2388 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 2389 if (error || newp == NULL) 2390 return (error); 2391 2392 if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */ 2393 return (EINVAL); 2394 2395 mutex_enter(&so_pendfree_lock); 2396 somaxkva = new_somaxkva; 2397 cv_broadcast(&socurkva_cv); 2398 mutex_exit(&so_pendfree_lock); 2399 2400 return (error); 2401 } 2402 2403 static void 2404 sysctl_kern_somaxkva_setup(void) 2405 { 2406 2407 KASSERT(socket_sysctllog == NULL); 2408 sysctl_createv(&socket_sysctllog, 0, NULL, NULL, 2409 CTLFLAG_PERMANENT, 2410 CTLTYPE_NODE, "kern", NULL, 2411 NULL, 0, NULL, 0, 2412 CTL_KERN, CTL_EOL); 2413 2414 sysctl_createv(&socket_sysctllog, 0, NULL, NULL, 2415 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 2416 CTLTYPE_INT, "somaxkva", 2417 SYSCTL_DESCR("Maximum amount of kernel memory to be " 2418 "used for socket buffers"), 2419 sysctl_kern_somaxkva, 0, NULL, 0, 2420 CTL_KERN, KERN_SOMAXKVA, CTL_EOL); 2421 } 2422