/*	$NetBSD: uipc_socket.c,v 1.195 2009/12/09 21:32:59 dsl Exp $	*/

/*-
 * Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.195 2009/12/09 21:32:59 dsl Exp $");

#include "opt_compat_netbsd.h"
#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"
#include "opt_multiprocessor.h"	/* XXX */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/uidinfo.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#ifdef COMPAT_50
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#endif

#include <uvm/uvm.h>

MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");

extern const struct fileops socketops;

extern int	somaxconn;		/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;
kmutex_t	*softnet_lock;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)	(ev)->ev_count++

EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else

#define	SOSEND_COUNTER_INCR(ev)	/* nothing */

#endif /* SOSEND_COUNTERS */

static struct callback_entry sokva_reclaimerentry;

#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif

static kmutex_t so_pendfree_lock;
static struct mbuf *so_pendfree;

#ifndef SOMAXKVA
#define	SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static kcondvar_t socurkva_cv;

static kauth_listener_t socket_listener;

#define	SOCK_LOAN_CHUNK	65536

static size_t sodopendfree(void);
static size_t sodopendfreel(void);

static void sysctl_kern_somaxkva_setup(void);
static struct sysctllog *socket_sysctllog;
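
/*
 * Example (illustrative only): the SOMAXKVA default above can be
 * overridden at kernel configuration time (opt_somaxkva.h), e.g. with
 * a config(5) line along the lines of:
 *
 *	options 	SOMAXKVA=(32*1024*1024)
 *
 * or adjusted at run time through the kern.somaxkva sysctl registered
 * at the bottom of this file.
 */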

static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
	int error;

	mutex_enter(&so_pendfree_lock);
	while (socurkva + len > somaxkva) {
		size_t freed;

		/*
		 * try to do pendfree.
		 */

		freed = sodopendfreel();

		/*
		 * if some kva was freed, try again.
		 */

		if (freed)
			continue;

		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
		if (error) {
			len = 0;
			break;
		}
	}
	socurkva += len;
	mutex_exit(&so_pendfree_lock);
	return len;
}

static void
sokvaunreserve(vsize_t len)
{

	mutex_enter(&so_pendfree_lock);
	socurkva -= len;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);
}

/*
 * sokvaalloc: allocate kva for loan.
 */

vaddr_t
sokvaalloc(vsize_t len, struct socket *so)
{
	vaddr_t lva;

	/*
	 * reserve kva.
	 */

	if (sokvareserve(so, len) == 0)
		return 0;

	/*
	 * allocate kva.
	 */

	lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (lva == 0) {
		sokvaunreserve(len);
		return (0);
	}

	return lva;
}

/*
 * sokvafree: free kva for loan.
 */

void
sokvafree(vaddr_t sva, vsize_t len)
{

	/*
	 * free kva.
	 */

	uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);

	/*
	 * unreserve kva.
	 */

	sokvaunreserve(len);
}

static void
sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
{
	vaddr_t sva, eva;
	vsize_t len;
	int npgs;

	KASSERT(pgs != NULL);

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	sokvafree(sva, len);
}

static size_t
sodopendfree(void)
{
	size_t rv;

	if (__predict_true(so_pendfree == NULL))
		return 0;

	mutex_enter(&so_pendfree_lock);
	rv = sodopendfreel();
	mutex_exit(&so_pendfree_lock);

	return rv;
}

/*
 * sodopendfreel: free mbufs on "pendfree" list.
 * unlock and relock so_pendfree_lock when freeing mbufs.
 *
 * => called with so_pendfree_lock held.
 */

static size_t
sodopendfreel(void)
{
	struct mbuf *m, *next;
	size_t rv = 0;

	KASSERT(mutex_owned(&so_pendfree_lock));

	while (so_pendfree != NULL) {
		m = so_pendfree;
		so_pendfree = NULL;
		mutex_exit(&so_pendfree_lock);

		for (; m != NULL; m = next) {
			next = m->m_next;
			KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0);
			KASSERT(m->m_ext.ext_refcnt == 0);

			rv += m->m_ext.ext_size;
			sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
			    m->m_ext.ext_size);
			pool_cache_put(mb_cache, m);
		}

		mutex_enter(&so_pendfree_lock);
	}

	return (rv);
}

void
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
{

	KASSERT(m != NULL);

	/*
	 * postpone freeing mbuf.
	 *
	 * we can't do it in interrupt context
	 * because we need to put kva back to kernel_map.
	 */

	mutex_enter(&so_pendfree_lock);
	m->m_next = so_pendfree;
	so_pendfree = m;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);
}

static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	vaddr_t lva;
	int npgs, error;
	vaddr_t va;
	int i;

	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
		return (0);

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	KASSERT(npgs <= M_EXT_MAXPAGES);

	lva = sokvaalloc(len, so);
	if (lva == 0)
		return 0;

	error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
	if (error) {
		sokvafree(lva, len);
		return (0);
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
		    VM_PROT_READ, 0);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (space);
}
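
/*
 * The loaned-page lifecycle, in brief (a sketch of how the routines
 * above cooperate, not additional code):
 *
 *	len = sosend_loan(so, uio, m, space);
 *		uvm_loan() wires the user pages, sokvaalloc() maps them,
 *		and MEXTADD(m, ..., soloanfree, so) attaches them to m.
 *	... the protocol transmits m; the final free of the mbuf ...
 *	soloanfree(m, buf, size, arg)
 *		queues m on so_pendfree (may run in interrupt context).
 *	sodopendfree()
 *		later, in thread context, unmaps and unloans the pages
 *		via sodoloanfree() and returns the kva to kernel_map.
 */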

static int
sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
{

	KASSERT(ce == &sokva_reclaimerentry);
	KASSERT(obj == NULL);

	sodopendfree();
	if (!vm_map_starved_p(kernel_map)) {
		return CALLBACK_CHAIN_ABORT;
	}
	return CALLBACK_CHAIN_CONTINUE;
}

struct mbuf *
getsombuf(struct socket *so, int type)
{
	struct mbuf *m;

	m = m_get(M_WAIT, type);
	MCLAIM(m, so->so_mowner);
	return m;
}

static int
socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	int result;
	enum kauth_network_req req;

	result = KAUTH_RESULT_DEFER;
	req = (enum kauth_network_req)arg0;

	if ((action != KAUTH_NETWORK_SOCKET) &&
	    (action != KAUTH_NETWORK_BIND))
		return result;

	switch (req) {
	case KAUTH_REQ_NETWORK_BIND_PORT:
		result = KAUTH_RESULT_ALLOW;
		break;

	case KAUTH_REQ_NETWORK_SOCKET_DROP: {
		/* Normal users can only drop their own connections. */
		struct socket *so = (struct socket *)arg1;
		uid_t sockuid = so->so_uidinfo->ui_uid;

		if (sockuid == kauth_cred_getuid(cred) ||
		    sockuid == kauth_cred_geteuid(cred))
			result = KAUTH_RESULT_ALLOW;

		break;
	}

	case KAUTH_REQ_NETWORK_SOCKET_OPEN:
		/* We allow "raw" routing/bluetooth sockets to anyone. */
		if ((u_long)arg1 == PF_ROUTE || (u_long)arg1 == PF_BLUETOOTH)
			result = KAUTH_RESULT_ALLOW;
		else {
			/* Privileged, let secmodel handle this. */
			if ((u_long)arg2 == SOCK_RAW)
				break;
		}

		result = KAUTH_RESULT_ALLOW;

		break;

	case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
		result = KAUTH_RESULT_ALLOW;

		break;

	default:
		break;
	}

	return result;
}

void
soinit(void)
{

	sysctl_kern_somaxkva_setup();

	mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
	softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	cv_init(&socurkva_cv, "sokva");
	soinit2();

	/* Set the initial adjusted socket buffer size. */
	if (sb_max_set(sb_max))
		panic("bad initial sb_max value: %lu", sb_max);

	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
	    &sokva_reclaimerentry, NULL, sokva_reclaim_callback);

	socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
	    socket_listener_cb, NULL);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
    struct socket *lockso)
{
	const struct protosw *prp;
	struct socket *so;
	uid_t uid;
	int error;
	kmutex_t *lock;

	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
	    KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
	    KAUTH_ARG(proto));
	if (error != 0)
		return error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL) {
		/* no support for domain */
		if (pffinddomain(dom) == 0)
			return EAFNOSUPPORT;
		/* no support for socket type */
		if (proto == 0 && type != 0)
			return EPROTOTYPE;
		return EPROTONOSUPPORT;
	}
	if (prp->pr_usrreq == NULL)
		return EPROTONOSUPPORT;
	if (prp->pr_type != type)
		return EPROTOTYPE;

	so = soget(true);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
#ifdef MBUFTRACE
	so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
	/* so->so_cred = kauth_cred_dup(l->l_cred); */
	uid = kauth_cred_geteuid(l->l_cred);
	so->so_uidinfo = uid_find(uid);
	so->so_egid = kauth_cred_getegid(l->l_cred);
	so->so_cpid = l->l_proc->p_pid;
	if (lockso != NULL) {
		/* Caller wants us to share a lock. */
		lock = lockso->so_lock;
		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	} else {
		/* Lock assigned and taken during PRU_ATTACH. */
	}
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL,
	    (struct mbuf *)(long)proto, NULL, l);
	KASSERT(solocked(so));
	if (error != 0) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return error;
	}
	sounlock(so);
	*aso = so;
	return 0;
}
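
/*
 * Example (a sketch, not part of this file): in-kernel consumers create
 * and wire up a socket roughly as follows; "nam" is assumed to be an
 * MT_SONAME mbuf holding a struct sockaddr prepared by the caller.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, 0, curlwp, NULL);
 *	if (error == 0) {
 *		error = sobind(so, nam, curlwp);
 *		...
 *		(void)soclose(so);
 *	}
 */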

/* On success, write file descriptor to fdout and return zero.  On
 * failure, return non-zero; *fdout will be undefined.
 */
int
fsocreate(int domain, struct socket **sop, int type, int protocol,
    struct lwp *l, int *fdout)
{
	struct socket *so;
	struct file *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return (error);
	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_SOCKET;
	fp->f_ops = &socketops;
	error = socreate(domain, &so, type, protocol, l, NULL);
	if (error != 0) {
		fd_abort(curproc, fp, fd);
	} else {
		if (sop != NULL)
			*sop = so;
		fp->f_data = so;
		fd_affix(curproc, fp, fd);
		*fdout = fd;
	}
	return error;
}

int
sofamily(const struct socket *so)
{
	const struct protosw *pr;
	const struct domain *dom;

	if ((pr = so->so_proto) == NULL)
		return AF_UNSPEC;
	if ((dom = pr->pr_domain) == NULL)
		return AF_UNSPEC;
	return dom->dom_family;
}

int
sobind(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int error;

	solock(so);
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l);
	sounlock(so);
	return error;
}

int
solisten(struct socket *so, int backlog, struct lwp *l)
{
	int error;

	solock(so);
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) != 0) {
		sounlock(so);
		return (EOPNOTSUPP);
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL,
	    NULL, NULL, l);
	if (error != 0) {
		sounlock(so);
		return error;
	}
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	sounlock(so);
	return 0;
}

void
sofree(struct socket *so)
{
	u_int refs;

	KASSERT(solocked(so));

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		sounlock(so);
		return;
	}
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0)) {
			sounlock(so);
			return;
		}
	}
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
		    RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
		    RLIM_INFINITY);
	sbrelease(&so->so_snd, so);
	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	sorflush(so);
	refs = so->so_aborting;	/* XXX */
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		(void)accept_filt_clear(so);
	/* kauth_cred_free(so->so_cred); */
	sounlock(so);
	if (refs == 0)		/* XXX */
		soput(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket *so2;
	int error;
	int error2;

	error = 0;
	solock(so);
	if (so->so_options & SO_ACCEPTCONN) {
		for (;;) {
			if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
				KASSERT(solocked2(so, so2));
				(void) soqremque(so2, 0);
				/* soabort drops the lock. */
				(void) soabort(so2);
				solock(so);
				continue;
			}
			if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
				KASSERT(solocked2(so, so2));
				(void) soqremque(so2, 1);
				/* soabort drops the lock. */
				(void) soabort(so2);
				solock(so);
				continue;
			}
			break;
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio)
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sowait(so, true, so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    NULL, NULL, NULL, NULL);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	return (error);
}

/*
 * Must be called with the socket locked.  Will return with it unlocked.
 */
int
soabort(struct socket *so)
{
	u_int refs;
	int error;

	KASSERT(solocked(so));
	KASSERT(so->so_head == NULL);

	so->so_aborting++;		/* XXX */
	error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL,
	    NULL, NULL, NULL);
	refs = --so->so_aborting;	/* XXX */
	if (error || (refs == 0)) {
		sofree(so);
	} else {
		sounlock(so);
	}
	return error;
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int error;

	KASSERT(solocked(so));

	error = 0;
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    NULL, nam, NULL, NULL);
	else
		error = ECONNABORTED;

	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int error;

	KASSERT(solocked(so));

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, l);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	KASSERT(solocked2(so1, so2));

	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    NULL, (struct mbuf *)so2, NULL, NULL);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	KASSERT(solocked(so));

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
	} else if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
	} else {
		error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
		    NULL, NULL, NULL, NULL);
	}
	sodopendfree();
	return (error);
}
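
/*
 * Example (a sketch): in-kernel senders normally go through the so_send
 * hook, which socreate() points at sosend() below.  Here "m" is assumed
 * to be a caller-built mbuf chain; per the sosend() contract, uio is
 * NULL when the data is supplied as an mbuf chain.
 *
 *	error = (*so->so_send)(so, NULL, NULL, m, NULL, 0, curlwp);
 */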

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags, struct lwp *l)
{
	struct mbuf **mp, *m;
	struct proc *p;
	long space, len, resid, clen, mlen;
	int error, s, dontroute, atomic;

	p = l->l_proc;
	sodopendfree();
	clen = 0;

	/*
	 * solock() provides atomicity of access.  splsoftnet() prevents
	 * protocol processing soft interrupts from interrupting us and
	 * blocking (expensive).
	 */
	s = splsoftnet();
	solock(so);
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	l->l_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		if (so->so_state & SS_CANTSENDMORE) {
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == 0) {
				error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_nbio) {
				error = EWOULDBLOCK;
				goto release;
			}
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				sounlock(so);
				splx(s);
				if (top == NULL) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				MCLAIM(m, so->so_snd.sb_mowner);
				if (sock_loan_thresh >= 0 &&
				    uio->uio_iov->iov_len >= sock_loan_thresh &&
				    space >= sock_loan_thresh &&
				    (len = sosend_loan(so, uio, m,
				    space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					m_clget(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, void *), (int)len, uio);
 have_data:
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				s = splsoftnet();
				solock(so);
				if (error != 0)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				goto release;
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curlwp);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error != 0)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	sounlock(so);
	splx(s);
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
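
/*
 * Receive-side record layout (a sketch of what the sbappend* routines
 * build and what sbsync()/soreceive() below consume):
 *
 *	sb_mb -> MT_SONAME -> MT_CONTROL -> MT_DATA -> MT_DATA  (m_next)
 *	   |
 *	   | m_nextpkt
 *	   v
 *	next record -> ...
 *
 * soreceive() walks one record at a time; sbsync() keeps sb_mb,
 * sb_mbtail and sb_lastrecord consistent while the head mbuf changes.
 */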

/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the callers locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
static void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	KASSERT(solocked(sb->sb_so));

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct lwp *l = curlwp;
	struct mbuf *m, **mp, *mt;
	int atomic, flags, len, error, s, offset, moff, type, orig_resid;
	const struct protosw *pr;
	struct mbuf *nextrecord;
	int mbuf_removed = 0;
	const struct domain *dom;

	pr = so->so_proto;
	atomic = pr->pr_flags & PR_ATOMIC;
	dom = pr->pr_domain;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;

	if (paddr != NULL)
		*paddr = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if ((flags & MSG_DONTWAIT) == 0)
		sodopendfree();

	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, void *),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid > 0 && error == 0 && m);
 bad:
		if (m != NULL)
			m_freem(m);
		return error;
	}
	if (mp != NULL)
		*mp = NULL;

	/*
	 * solock() provides atomicity of access.  splsoftnet() prevents
	 * protocol processing soft interrupts from interrupting us and
	 * blocking (expensive).
	 */
	s = splsoftnet();
	solock(so);
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so);
		splx(s);
		return error;
	}

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL ||
	    ((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) &&
	    uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && !atomic)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if (so->so_nbio || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error != 0) {
			sounlock(so);
			splx(s);
			return error;
		}
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket lock, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	if (l != NULL)
		l->l_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr != NULL) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sbsync(&so->so_rcv, nextrecord);
		for (; cm != NULL; cm = cmn) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			type = mtod(cm, struct cmsghdr *)->cmsg_type;
			if (controlp != NULL) {
				if (dom->dom_externalize != NULL &&
				    type == SCM_RIGHTS) {
					sounlock(so);
					splx(s);
					error = (*dom->dom_externalize)(cm, l);
					s = splsoftnet();
					solock(so);
				}
				*controlp = cm;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (dom->dom_dispose != NULL &&
				    type == SCM_RIGHTS) {
					sounlock(so);
					(*dom->dom_dispose)(cm);
					solock(so);
				}
				m_freem(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (__predict_true(m != NULL)) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			sounlock(so);
			splx(s);
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			s = splsoftnet();
			solock(so);
			if (error != 0) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed && atomic)
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else if (flags & MSG_PEEK)
			moff += len;
		else {
			if (mp != NULL) {
				mt = m_copym(m, 0, len, M_NOWAIT);
				if (__predict_false(mt == NULL)) {
					sounlock(so);
					mt = m_copym(m, 0, len, M_WAIT);
					solock(so);
				}
				*mp = mt;
			}
			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD,
				    NULL, (struct mbuf *)(long)flags, NULL, l);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error != 0) {
				sbunlock(&so->so_rcv);
				sounlock(so);
				splx(s);
				return 0;
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && atomic) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, l);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	sounlock(so);
	splx(s);
	return error;
}

int
soshutdown(struct socket *so, int how)
{
	const struct protosw *pr;
	int error;

	KASSERT(solocked(so));

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR) {
		sorflush(so);
		error = 0;
	}
	if (how == SHUT_WR || how == SHUT_RDWR)
		error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL,
		    NULL, NULL, NULL);

	return error;
}

void
soabortop(struct socket *so)
{
#if 0	/* ad@ wrote this, then disabled it as 'not working' */
	solock(so);
	so->so_state |= SS_ISABORTING;
	cv_broadcast(&so->so_cv);
	soshutdown(so, SHUT_RDWR);
	sounlock(so);
#endif
}
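
/*
 * Example (userland view, for illustration only): shutdown(2) lands in
 * soshutdown() above; SHUT_RD discards and disables the receive side
 * via sorflush(), SHUT_WR issues PRU_SHUTDOWN to the protocol.
 *
 *	shutdown(s, SHUT_WR);		no more sends; sends FIN on TCP
 *	shutdown(s, SHUT_RDWR);		both directions
 */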

void
sorflush(struct socket *so)
{
	struct sockbuf *sb, asb;
	const struct protosw *pr;

	KASSERT(solocked(so));

	sb = &so->so_rcv;
	pr = so->so_proto;
	socantrcvmore(so);
	sb->sb_flags |= SB_NOINTR;
	(void)sblock(sb, M_WAITOK);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
		sounlock(so);
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
		solock(so);
	}
	sbrelease(&asb, so);
}

/*
 * internal set SOL_SOCKET options
 */
static int
sosetopt1(struct socket *so, const struct sockopt *sopt)
{
	int error = EINVAL, optval, opt;
	struct linger l;
	struct timeval tv;

	switch ((opt = sopt->sopt_name)) {

	case SO_ACCEPTFILTER:
		error = accept_filt_setopt(so, sopt);
		KASSERT(solocked(so));
		break;

	case SO_LINGER:
		error = sockopt_get(sopt, &l, sizeof(l));
		solock(so);
		if (error)
			break;
		if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
		    l.l_linger > (INT_MAX / hz)) {
			error = EDOM;
			break;
		}
		so->so_linger = l.l_linger;
		if (l.l_onoff)
			so->so_options |= SO_LINGER;
		else
			so->so_options &= ~SO_LINGER;
		break;

	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_USELOOPBACK:
	case SO_BROADCAST:
	case SO_REUSEADDR:
	case SO_REUSEPORT:
	case SO_OOBINLINE:
	case SO_TIMESTAMP:
#ifdef SO_OTIMESTAMP
	case SO_OTIMESTAMP:
#endif
		error = sockopt_getint(sopt, &optval);
		solock(so);
		if (error)
			break;
		if (optval)
			so->so_options |= opt;
		else
			so->so_options &= ~opt;
		break;

	case SO_SNDBUF:
	case SO_RCVBUF:
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
		error = sockopt_getint(sopt, &optval);
		solock(so);
		if (error)
			break;

		/*
		 * Values < 1 make no sense for any of these
		 * options, so disallow them.
		 */
		if (optval < 1) {
			error = EINVAL;
			break;
		}

		switch (opt) {
		case SO_SNDBUF:
			if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
				error = ENOBUFS;
				break;
			}
			so->so_snd.sb_flags &= ~SB_AUTOSIZE;
			break;

		case SO_RCVBUF:
			if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
				error = ENOBUFS;
				break;
			}
			so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
			break;

		/*
		 * Make sure the low-water is never greater than
		 * the high-water.
		 */
		case SO_SNDLOWAT:
			if (optval > so->so_snd.sb_hiwat)
				optval = so->so_snd.sb_hiwat;

			so->so_snd.sb_lowat = optval;
			break;

		case SO_RCVLOWAT:
			if (optval > so->so_rcv.sb_hiwat)
				optval = so->so_rcv.sb_hiwat;

			so->so_rcv.sb_lowat = optval;
			break;
		}
		break;

#ifdef COMPAT_50
	case SO_OSNDTIMEO:
	case SO_ORCVTIMEO: {
		struct timeval50 otv;
		error = sockopt_get(sopt, &otv, sizeof(otv));
		if (error) {
			solock(so);
			break;
		}
		timeval50_to_timeval(&otv, &tv);
		opt = opt == SO_OSNDTIMEO ? SO_SNDTIMEO : SO_RCVTIMEO;
		error = 0;
		/*FALLTHROUGH*/
	}
#endif /* COMPAT_50 */

	case SO_SNDTIMEO:
	case SO_RCVTIMEO:
		if (error)
			error = sockopt_get(sopt, &tv, sizeof(tv));
		solock(so);
		if (error)
			break;

		if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
			error = EDOM;
			break;
		}

		optval = tv.tv_sec * hz + tv.tv_usec / tick;
		if (optval == 0 && tv.tv_usec != 0)
			optval = 1;

		switch (opt) {
		case SO_SNDTIMEO:
			so->so_snd.sb_timeo = optval;
			break;
		case SO_RCVTIMEO:
			so->so_rcv.sb_timeo = optval;
			break;
		}
		break;

	default:
		solock(so);
		error = ENOPROTOOPT;
		break;
	}
	KASSERT(solocked(so));
	return error;
}

int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, prerr;

	if (sopt->sopt_level == SOL_SOCKET) {
		error = sosetopt1(so, sopt);
		KASSERT(solocked(so));
	} else {
		error = ENOPROTOOPT;
		solock(so);
	}

	if ((error == 0 || error == ENOPROTOOPT) &&
	    so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
		/* give the protocol stack a shot */
		prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
		if (prerr == 0)
			error = 0;
		else if (prerr != ENOPROTOOPT)
			error = prerr;
	}
	sounlock(so);
	return error;
}

/*
 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
 */
int
so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
    const void *val, size_t valsize)
{
	struct sockopt sopt;
	int error;

	KASSERT(valsize == 0 || val != NULL);

	sockopt_init(&sopt, level, name, valsize);
	sockopt_set(&sopt, val, valsize);

	error = sosetopt(so, &sopt);

	sockopt_destroy(&sopt);

	return error;
}
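
/*
 * Example (a sketch): in-kernel code can set a socket option without
 * building a struct sockopt by hand, e.g. to enable SO_REUSEADDR:
 *
 *	int one = 1;
 *
 *	error = so_setsockopt(curlwp, so, SOL_SOCKET, SO_REUSEADDR,
 *	    &one, sizeof(one));
 */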

/*
 * internal get SOL_SOCKET options
 */
static int
sogetopt1(struct socket *so, struct sockopt *sopt)
{
	int error, optval, opt;
	struct linger l;
	struct timeval tv;

	switch ((opt = sopt->sopt_name)) {

	case SO_ACCEPTFILTER:
		error = accept_filt_getopt(so, sopt);
		break;

	case SO_LINGER:
		l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
		l.l_linger = so->so_linger;

		error = sockopt_set(sopt, &l, sizeof(l));
		break;

	case SO_USELOOPBACK:
	case SO_DONTROUTE:
	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_REUSEADDR:
	case SO_REUSEPORT:
	case SO_BROADCAST:
	case SO_OOBINLINE:
	case SO_TIMESTAMP:
#ifdef SO_OTIMESTAMP
	case SO_OTIMESTAMP:
#endif
		error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
		break;

	case SO_TYPE:
		error = sockopt_setint(sopt, so->so_type);
		break;

	case SO_ERROR:
		error = sockopt_setint(sopt, so->so_error);
		so->so_error = 0;
		break;

	case SO_SNDBUF:
		error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
		break;

	case SO_RCVBUF:
		error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
		break;

	case SO_SNDLOWAT:
		error = sockopt_setint(sopt, so->so_snd.sb_lowat);
		break;

	case SO_RCVLOWAT:
		error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
		break;

#ifdef COMPAT_50
	case SO_OSNDTIMEO:
	case SO_ORCVTIMEO: {
		struct timeval50 otv;

		optval = (opt == SO_OSNDTIMEO ?
		    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

		otv.tv_sec = optval / hz;
		otv.tv_usec = (optval % hz) * tick;

		error = sockopt_set(sopt, &otv, sizeof(otv));
		break;
	}
#endif /* COMPAT_50 */

	case SO_SNDTIMEO:
	case SO_RCVTIMEO:
		optval = (opt == SO_SNDTIMEO ?
		    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

		tv.tv_sec = optval / hz;
		tv.tv_usec = (optval % hz) * tick;

		error = sockopt_set(sopt, &tv, sizeof(tv));
		break;

	case SO_OVERFLOWED:
		error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
		break;

	default:
		error = ENOPROTOOPT;
		break;
	}

	return (error);
}

int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int error;

	solock(so);
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, sopt));
		} else
			error = (ENOPROTOOPT);
	} else {
		error = sogetopt1(so, sopt);
	}
	sounlock(so);
	return (error);
}

/*
 * alloc sockopt data buffer
 * - will be released at destroy
 */
static int
sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
{

	KASSERT(sopt->sopt_size == 0);

	if (len > sizeof(sopt->sopt_buf)) {
		sopt->sopt_data = kmem_zalloc(len, kmflag);
		if (sopt->sopt_data == NULL)
			return ENOMEM;
	} else
		sopt->sopt_data = sopt->sopt_buf;

	sopt->sopt_size = len;
	return 0;
}

/*
 * initialise sockopt storage
 * - MAY sleep during allocation
 */
void
sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
{

	memset(sopt, 0, sizeof(*sopt));

	sopt->sopt_level = level;
	sopt->sopt_name = name;
	(void)sockopt_alloc(sopt, size, KM_SLEEP);
}

/*
 * destroy sockopt storage
 * - will release any held memory references
 */
void
sockopt_destroy(struct sockopt *sopt)
{

	if (sopt->sopt_data != sopt->sopt_buf)
		kmem_free(sopt->sopt_data, sopt->sopt_size);

	memset(sopt, 0, sizeof(*sopt));
}

/*
 * set sockopt value
 * - value is copied into sockopt
 * - memory is allocated when necessary, will not sleep
 */
int
sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;

	if (sopt->sopt_size == 0) {
		error = sockopt_alloc(sopt, len, KM_NOSLEEP);
		if (error)
			return error;
	}

	KASSERT(sopt->sopt_size == len);
	memcpy(sopt->sopt_data, buf, len);
	return 0;
}
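
/*
 * Example (a sketch of a consumer): protocol pr_ctloutput handlers are
 * handed a struct sockopt and typically pull values out with the
 * accessors below; MYPROTO_OPTION is a hypothetical option name.
 *
 *	case MYPROTO_OPTION:
 *		error = sockopt_getint(sopt, &optval);
 *		if (error)
 *			break;
 *		...
 */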

/*
 * common case of set sockopt integer value
 */
int
sockopt_setint(struct sockopt *sopt, int val)
{

	return sockopt_set(sopt, &val, sizeof(int));
}

/*
 * get sockopt value
 * - correct size must be given
 */
int
sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
{

	if (sopt->sopt_size != len)
		return EINVAL;

	memcpy(buf, sopt->sopt_data, len);
	return 0;
}

/*
 * common case of get sockopt integer value
 */
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{

	return sockopt_get(sopt, valp, sizeof(int));
}

/*
 * set sockopt value from mbuf
 * - ONLY for legacy code
 * - mbuf is released by sockopt
 * - will not sleep
 */
int
sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t len;
	int error;

	len = m_length(m);

	if (sopt->sopt_size == 0) {
		error = sockopt_alloc(sopt, len, KM_NOSLEEP);
		if (error)
			return error;
	}

	KASSERT(sopt->sopt_size == len);
	m_copydata(m, 0, len, sopt->sopt_data);
	m_freem(m);

	return 0;
}

/*
 * get sockopt value into mbuf
 * - ONLY for legacy code
 * - mbuf to be released by the caller
 * - will not sleep
 */
struct mbuf *
sockopt_getmbuf(const struct sockopt *sopt)
{
	struct mbuf *m;

	if (sopt->sopt_size > MCLBYTES)
		return NULL;

	m = m_get(M_DONTWAIT, MT_SOOPTS);
	if (m == NULL)
		return NULL;

	if (sopt->sopt_size > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}

	memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
	m->m_len = sopt->sopt_size;

	return m;
}

void
sohasoutofband(struct socket *so)
{

	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so;

	so = ((file_t *)kn->kn_obj)->f_data;
	solock(so);
	SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	sounlock(so);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_data;
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error)	/* temporary udp error */
		rv = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		rv = (kn->kn_data >= kn->kn_sdata);
	else
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}
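
/*
 * Example (userland view, for illustration only): the read filter above
 * honours NOTE_LOWAT, so a caller can ask to be woken only once at
 * least 128 bytes are buffered:
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 */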

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so;

	so = ((file_t *)kn->kn_obj)->f_data;
	solock(so);
	SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	sounlock(so);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_data;
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error)	/* temporary udp error */
		rv = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		rv = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		rv = (kn->kn_data >= kn->kn_sdata);
	else
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_data;

	/*
	 * Set kn_data to number of incoming connections, not
	 * counting partial (incomplete) connections.
	 */
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = so->so_qlen;
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

static const struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static const struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static const struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so;
	struct sockbuf *sb;

	so = ((file_t *)kn->kn_obj)->f_data;
	solock(so);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		sounlock(so);
		return (EINVAL);
	}
	SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	sounlock(so);
	return (0);
}

static int
sodopoll(struct socket *so, int events)
{
	int revents;

	revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowritable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	return revents;
}

int
sopoll(struct socket *so, int events)
{
	int revents = 0;

#ifndef DIAGNOSTIC
	/*
	 * Do a quick, unlocked check in expectation that the socket
	 * will be ready for I/O.  Don't do this check if DIAGNOSTIC,
	 * as the solocked() assertions will fail.
	 */
	if ((revents = sodopoll(so, events)) != 0)
		return revents;
#endif

	solock(so);
	if ((revents = sodopoll(so, events)) == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			selrecord(curlwp, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_NOTIFY;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(curlwp, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_NOTIFY;
		}
	}
	sounlock(so);

	return revents;
}


#include <sys/sysctl.h>

static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);

/*
 * sysctl helper routine for kern.somaxkva.  ensures that the given
 * value is not too small.
 * (XXX should we maybe make sure it's not too large as well?)
 */
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
	int error, new_somaxkva;
	struct sysctlnode node;

	new_somaxkva = somaxkva;
	node = *rnode;
	node.sysctl_data = &new_somaxkva;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
		return (EINVAL);

	mutex_enter(&so_pendfree_lock);
	somaxkva = new_somaxkva;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);

	return (error);
}

static void
sysctl_kern_somaxkva_setup(void)
{

	KASSERT(socket_sysctllog == NULL);
	sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "kern", NULL,
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_EOL);

	sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "somaxkva",
	    SYSCTL_DESCR("Maximum amount of kernel memory to be "
	    "used for socket buffers"),
	    sysctl_kern_somaxkva, 0, NULL, 0,
	    CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
}
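
/*
 * Example (administrative, for illustration only): the limit registered
 * above can be inspected or raised at run time:
 *
 *	# sysctl kern.somaxkva
 *	kern.somaxkva = 16777216
 *	# sysctl -w kern.somaxkva=33554432
 *
 * Values below 16MB are rejected by sysctl_kern_somaxkva() above.
 */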