/*	$NetBSD: uipc_socket.c,v 1.177 2008/10/14 13:45:26 ad Exp $	*/

/*-
 * Copyright (c) 2002, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.177 2008/10/14 13:45:26 ad Exp $");

#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"
#include "opt_multiprocessor.h"	/* XXX */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/uidinfo.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

#include <uvm/uvm.h>

MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");

extern const struct fileops socketops;

int	somaxconn = SOMAXCONN;	/* patchable (XXX sysctl) */
kmutex_t	*softnet_lock;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)		(ev)->ev_count++

EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else

#define	SOSEND_COUNTER_INCR(ev)		/* nothing */

#endif /* SOSEND_COUNTERS */

static struct callback_entry sokva_reclaimerentry;

#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif

static kmutex_t so_pendfree_lock;
static struct mbuf *so_pendfree;

#ifndef SOMAXKVA
#define	SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static kcondvar_t socurkva_cv;

#define	SOCK_LOAN_CHUNK		65536

static size_t sodopendfree(void);
static size_t sodopendfreel(void);

static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
	int error;

	mutex_enter(&so_pendfree_lock);
	while (socurkva + len > somaxkva) {
		size_t freed;

		/*
		 * try to do pendfree.
		 */

		freed = sodopendfreel();

		/*
		 * if some kva was freed, try again.
		 */

		if (freed)
			continue;

		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
		if (error) {
			len = 0;
			break;
		}
	}
	socurkva += len;
	mutex_exit(&so_pendfree_lock);
	return len;
}

static void
sokvaunreserve(vsize_t len)
{

	mutex_enter(&so_pendfree_lock);
	socurkva -= len;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);
}

/*
 * sokvaalloc: allocate kva for loan.
 */

vaddr_t
sokvaalloc(vsize_t len, struct socket *so)
{
	vaddr_t lva;

	/*
	 * reserve kva.
	 */

	if (sokvareserve(so, len) == 0)
		return 0;

	/*
	 * allocate kva.
	 */

	lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (lva == 0) {
		sokvaunreserve(len);
		return (0);
	}

	return lva;
}

/*
 * sokvafree: free kva for loan.
 */

void
sokvafree(vaddr_t sva, vsize_t len)
{

	/*
	 * free kva.
	 */

	uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);

	/*
	 * unreserve kva.
	 */

	sokvaunreserve(len);
}

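/*
 * Example (illustrative sketch only, not compiled): a caller that maps
 * loaned pages is expected to bracket the mapping with sokvaalloc() and
 * sokvafree(), exactly as sosend_loan()/sodoloanfree() below do.  The
 * page array "pgs", "npgs" and the error handling here are hypothetical.
 *
 *	vaddr_t va = sokvaalloc(npgs << PAGE_SHIFT, so);
 *	if (va == 0)
 *		return ENOMEM;	// KVA limit hit and the wait was interrupted
 *	for (i = 0; i < npgs; i++)
 *		pmap_kenter_pa(va + (i << PAGE_SHIFT),
 *		    VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ);
 *	pmap_update(pmap_kernel());
 *	// ... use the mapping ...
 *	pmap_kremove(va, npgs << PAGE_SHIFT);
 *	pmap_update(pmap_kernel());
 *	sokvafree(va, npgs << PAGE_SHIFT);
 */
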
static void
sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
{
	vaddr_t sva, eva;
	vsize_t len;
	int npgs;

	KASSERT(pgs != NULL);

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	sokvafree(sva, len);
}

static size_t
sodopendfree(void)
{
	size_t rv;

	if (__predict_true(so_pendfree == NULL))
		return 0;

	mutex_enter(&so_pendfree_lock);
	rv = sodopendfreel();
	mutex_exit(&so_pendfree_lock);

	return rv;
}

/*
 * sodopendfreel: free mbufs on "pendfree" list.
 * unlock and relock so_pendfree_lock when freeing mbufs.
 *
 * => called with so_pendfree_lock held.
 */

static size_t
sodopendfreel(void)
{
	struct mbuf *m, *next;
	size_t rv = 0;

	KASSERT(mutex_owned(&so_pendfree_lock));

	while (so_pendfree != NULL) {
		m = so_pendfree;
		so_pendfree = NULL;
		mutex_exit(&so_pendfree_lock);

		for (; m != NULL; m = next) {
			next = m->m_next;
			KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) == 0);
			KASSERT(m->m_ext.ext_refcnt == 0);

			rv += m->m_ext.ext_size;
			sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
			    m->m_ext.ext_size);
			pool_cache_put(mb_cache, m);
		}

		mutex_enter(&so_pendfree_lock);
	}

	return (rv);
}

void
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
{

	KASSERT(m != NULL);

	/*
	 * postpone freeing mbuf.
	 *
	 * we can't do it in interrupt context
	 * because we need to put kva back to kernel_map.
	 */

	mutex_enter(&so_pendfree_lock);
	m->m_next = so_pendfree;
	so_pendfree = m;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);
}

static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	vaddr_t lva;
	int npgs, error;
	vaddr_t va;
	int i;

	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
		return (0);

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	KASSERT(npgs <= M_EXT_MAXPAGES);

	lva = sokvaalloc(len, so);
	if (lva == 0)
		return 0;

	error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
	if (error) {
		sokvafree(lva, len);
		return (0);
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
		    VM_PROT_READ);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (space);
}

static int
sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
{

	KASSERT(ce == &sokva_reclaimerentry);
	KASSERT(obj == NULL);

	sodopendfree();
	if (!vm_map_starved_p(kernel_map)) {
		return CALLBACK_CHAIN_ABORT;
	}
	return CALLBACK_CHAIN_CONTINUE;
}

struct mbuf *
getsombuf(struct socket *so, int type)
{
	struct mbuf *m;

	m = m_get(M_WAIT, type);
	MCLAIM(m, so->so_mowner);
	return m;
}

void
soinit(void)
{

	mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
	softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	cv_init(&socurkva_cv, "sokva");
	soinit2();

	/* Set the initial adjusted socket buffer size. */
	if (sb_max_set(sb_max))
		panic("bad initial sb_max value: %lu", sb_max);

	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
	    &sokva_reclaimerentry, NULL, sokva_reclaim_callback);
}

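/*
 * Worked example for the page-rounding arithmetic in sosend_loan() and
 * sodoloanfree() above (values are hypothetical; assume PAGE_SIZE = 4096):
 * for iov_base = 0x20345 and space = 0x3000,
 *
 *	sva  = trunc_page(0x20345)          = 0x20000
 *	eva  = round_page(0x20345 + 0x3000) = round_page(0x23345) = 0x24000
 *	len  = eva - sva                    = 0x4000
 *	npgs = len >> PAGE_SHIFT            = 4
 *
 * so a 12 KiB transfer that straddles page boundaries loans 4 pages, and
 * the user's offset within the first page (0x345) is added back onto the
 * kernel mapping: lva += (vaddr_t)iov->iov_base & PAGE_MASK.
 */
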
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
	 struct socket *lockso)
{
	const struct protosw	*prp;
	struct socket	*so;
	uid_t		uid;
	int		error;
	kmutex_t	*lock;

	error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
	    KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
	    KAUTH_ARG(proto));
	if (error != 0)
		return error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == NULL) {
		/* no support for domain */
		if (pffinddomain(dom) == 0)
			return EAFNOSUPPORT;
		/* no support for socket type */
		if (proto == 0 && type != 0)
			return EPROTOTYPE;
		return EPROTONOSUPPORT;
	}
	if (prp->pr_usrreq == NULL)
		return EPROTONOSUPPORT;
	if (prp->pr_type != type)
		return EPROTOTYPE;

	so = soget(true);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
#ifdef MBUFTRACE
	so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
	uid = kauth_cred_geteuid(l->l_cred);
	so->so_uidinfo = uid_find(uid);
	so->so_egid = kauth_cred_getegid(l->l_cred);
	so->so_cpid = l->l_proc->p_pid;
	if (lockso != NULL) {
		/* Caller wants us to share a lock. */
		lock = lockso->so_lock;
		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	} else {
		/* Lock assigned and taken during PRU_ATTACH. */
	}
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL,
	    (struct mbuf *)(long)proto, NULL, l);
	KASSERT(solocked(so));
	if (error != 0) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return error;
	}
	sounlock(so);
	*aso = so;
	return 0;
}

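/*
 * Userland view (illustrative sketch): socreate() is what backs the
 * socket(2) system call, so its error mapping is directly visible there.
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		// proto 0 selects by type via pffindtype()
 *		int s = socket(AF_INET, SOCK_STREAM, 0);
 *		if (s == -1)
 *			perror("socket");	// e.g. EAFNOSUPPORT for an
 *						// unsupported domain
 *		return 0;
 *	}
 */
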
/* On success, write file descriptor to fdout and return zero.  On
 * failure, return non-zero; *fdout will be undefined.
 */
int
fsocreate(int domain, struct socket **sop, int type, int protocol,
    struct lwp *l, int *fdout)
{
	struct socket	*so;
	struct file	*fp;
	int		fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return (error);
	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_SOCKET;
	fp->f_ops = &socketops;
	error = socreate(domain, &so, type, protocol, l, NULL);
	if (error != 0) {
		fd_abort(curproc, fp, fd);
	} else {
		if (sop != NULL)
			*sop = so;
		fp->f_data = so;
		fd_affix(curproc, fp, fd);
		*fdout = fd;
	}
	return error;
}

int
sobind(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int	error;

	solock(so);
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, l);
	sounlock(so);
	return error;
}

int
solisten(struct socket *so, int backlog, struct lwp *l)
{
	int	error;

	solock(so);
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) != 0) {
		sounlock(so);
		return (EOPNOTSUPP);
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL,
	    NULL, NULL, l);
	if (error != 0) {
		sounlock(so);
		return error;
	}
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	sounlock(so);
	return 0;
}

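/*
 * Userland view (illustrative): solisten() silently clamps the listen(2)
 * backlog to the range [0, somaxconn] (SOMAXCONN, 128 by default), so
 * passing a huge value is harmless:
 *
 *	if (listen(s, 10000) == -1)	// queue limit becomes somaxconn
 *		err(1, "listen");
 *
 * listen(2) on an already-connected socket fails with EOPNOTSUPP,
 * matching the SS_ISCONNECTED check above.
 */
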
void
sofree(struct socket *so)
{
	u_int refs;

	KASSERT(solocked(so));

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
		sounlock(so);
		return;
	}
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0)) {
			sounlock(so);
			return;
		}
	}
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
		    RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
		    RLIM_INFINITY);
	sbrelease(&so->so_snd, so);
	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	sorflush(so);
	refs = so->so_aborting;	/* XXX */
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		(void)accept_filt_clear(so);
	sounlock(so);
	if (refs == 0)		/* XXX */
		soput(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket	*so2;
	int		error;
	int		error2;

	error = 0;
	solock(so);
	if (so->so_options & SO_ACCEPTCONN) {
		for (;;) {
			if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
				KASSERT(solocked2(so, so2));
				(void) soqremque(so2, 0);
				/* soabort drops the lock. */
				(void) soabort(so2);
				solock(so);
				continue;
			}
			if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
				KASSERT(solocked2(so, so2));
				(void) soqremque(so2, 1);
				/* soabort drops the lock. */
				(void) soabort(so2);
				solock(so);
				continue;
			}
			break;
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) && so->so_nbio)
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = sowait(so, so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    NULL, NULL, NULL, NULL);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	return (error);
}

/*
 * Must be called with the socket locked.  Will return with it unlocked.
 */
int
soabort(struct socket *so)
{
	u_int refs;
	int error;

	KASSERT(solocked(so));
	KASSERT(so->so_head == NULL);

	so->so_aborting++;		/* XXX */
	error = (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL,
	    NULL, NULL, NULL);
	refs = --so->so_aborting;	/* XXX */
	if (error || (refs == 0)) {
		sofree(so);
	} else {
		sounlock(so);
	}
	return error;
}

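/*
 * Userland view (illustrative): the SO_LINGER handling in soclose() above
 * makes close(2) block until the disconnect completes or l_linger seconds
 * elapse; sowait() is called with a timeout of so_linger * hz ticks.
 *
 *	struct linger lng = { .l_onoff = 1, .l_linger = 5 };
 *	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1)
 *		err(1, "setsockopt");
 *	close(s);	// may sleep up to 5 seconds draining send data
 */
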
int
soaccept(struct socket *so, struct mbuf *nam)
{
	int	error;

	KASSERT(solocked(so));

	error = 0;
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    NULL, nam, NULL, NULL);
	else
		error = ECONNABORTED;

	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int		error;

	KASSERT(solocked(so));

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    NULL, nam, NULL, l);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int	error;

	KASSERT(solocked2(so1, so2));

	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    NULL, (struct mbuf *)so2, NULL, NULL);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int	error;

	KASSERT(solocked(so));

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
	} else if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
	} else {
		error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
		    NULL, NULL, NULL, NULL);
	}
	sodopendfree();
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

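/*
 * Userland view (illustrative sketch; "sin" is an assumed, already-filled
 * destination address): for atomic protocols such as UDP, a datagram
 * larger than the send buffer high-water mark fails outright with
 * EMSGSIZE rather than blocking, per the checks in sosend() below.
 *
 *	char big[1 << 20];
 *	ssize_t n = sendto(s, big, sizeof(big), 0,
 *	    (struct sockaddr *)&sin, sizeof(sin));
 *	// n == -1, errno == EMSGSIZE when sizeof(big) > so_snd.sb_hiwat
 */
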
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags, struct lwp *l)
{
	struct mbuf	**mp, *m;
	struct proc	*p;
	long		space, len, resid, clen, mlen;
	int		error, s, dontroute, atomic;

	p = l->l_proc;
	sodopendfree();
	clen = 0;

	/*
	 * solock() provides atomicity of access.  splsoftnet() prevents
	 * protocol processing soft interrupts from interrupting us and
	 * blocking (expensive).
	 */
	s = splsoftnet();
	solock(so);
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	l->l_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		if (so->so_state & SS_CANTSENDMORE) {
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == 0) {
				error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_nbio) {
				error = EWOULDBLOCK;
				goto release;
			}
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				sounlock(so);
				splx(s);
				if (top == NULL) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				MCLAIM(m, so->so_snd.sb_mowner);
				if (sock_loan_thresh >= 0 &&
				    uio->uio_iov->iov_len >= sock_loan_thresh &&
				    space >= sock_loan_thresh &&
				    (len = sosend_loan(so, uio, m,
				    space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					m_clget(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, void *), (int)len, uio);
 have_data:
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				s = splsoftnet();
				solock(so);
				if (error != 0)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
				goto release;
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curlwp);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error != 0)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	sounlock(so);
	splx(s);
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

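/*
 * Userland view (illustrative): per the comment above sosend(), a send
 * interrupted by a signal can leave a short count behind, so robust
 * writers loop until everything is queued:
 *
 *	size_t off = 0;
 *	while (off < len) {
 *		ssize_t n = send(s, buf + off, len - off, 0);
 *		if (n == -1) {
 *			if (errno == EINTR)
 *				continue;	// retry after signal
 *			err(1, "send");
 *		}
 *		off += n;
 *	}
 */
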
/*
 * Following replacement or removal of the first mbuf on the first
 * mbuf chain of a socket buffer, push necessary state changes back
 * into the socket buffer so that other consumers see the values
 * consistently.  'nextrecord' is the caller's locally stored value of
 * the original value of sb->sb_mb->m_nextpkt which must be restored
 * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
 */
static void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	KASSERT(solocked(sb->sb_so));

	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect
	 * the new state.  This is an inline of SB_EMPTY_FIXUP, with
	 * the addition of a second clause that takes care of the
	 * case where sb_mb has been updated, but remains the last
	 * record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */

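/*
 * Userland view (illustrative): the record layout described above
 * (address mbuf, optional control mbufs, then data) is exactly what
 * recvmsg(2) decomposes into msg_name, msg_control and the iovec:
 *
 *	struct sockaddr_storage ss;
 *	char data[2048], cbuf[CMSG_SPACE(sizeof(struct timeval))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_name = &ss,      .msg_namelen = sizeof(ss),
 *		.msg_iov = &iov,      .msg_iovlen = 1,
 *		.msg_control = cbuf,  .msg_controllen = sizeof(cbuf),
 *	};
 *	ssize_t n = recvmsg(s, &msg, 0);	// one record per datagram
 */
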
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct lwp *l = curlwp;
	struct mbuf	*m, **mp, *mt;
	int atomic, flags, len, error, s, offset, moff, type, orig_resid;
	const struct protosw	*pr;
	struct mbuf	*nextrecord;
	int		mbuf_removed = 0;
	const struct domain *dom;

	pr = so->so_proto;
	atomic = pr->pr_flags & PR_ATOMIC;
	dom = pr->pr_domain;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;

	if (paddr != NULL)
		*paddr = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if ((flags & MSG_DONTWAIT) == 0)
		sodopendfree();

	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		solock(so);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, l);
		sounlock(so);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, void *),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid > 0 && error == 0 && m);
 bad:
		if (m != NULL)
			m_freem(m);
		return error;
	}
	if (mp != NULL)
		*mp = NULL;

	/*
	 * solock() provides atomicity of access.  splsoftnet() prevents
	 * protocol processing soft interrupts from interrupting us and
	 * blocking (expensive).
	 */
	s = splsoftnet();
	solock(so);
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, l);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
		sounlock(so);
		splx(s);
		return error;
	}

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL ||
	    ((flags & MSG_DONTWAIT) == 0 &&
	     so->so_rcv.sb_cc < uio->uio_resid &&
	     (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	      ((flags & MSG_WAITALL) &&
	       uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	     m->m_nextpkt == NULL && !atomic)) {
#ifdef DIAGNOSTIC
		if (m == NULL && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m != NULL)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if (so->so_nbio || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error != 0) {
			sounlock(so);
			splx(s);
			return error;
		}
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket lock, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	if (l != NULL)
		l->l_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr != NULL) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
			sbsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sbsync(&so->so_rcv, nextrecord);
		for (; cm != NULL; cm = cmn) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			type = mtod(cm, struct cmsghdr *)->cmsg_type;
			if (controlp != NULL) {
				if (dom->dom_externalize != NULL &&
				    type == SCM_RIGHTS) {
					sounlock(so);
					splx(s);
					error = (*dom->dom_externalize)(cm, l);
					s = splsoftnet();
					solock(so);
				}
				*controlp = cm;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (dom->dom_dispose != NULL &&
				    type == SCM_RIGHTS) {
					sounlock(so);
					(*dom->dom_dispose)(cm);
					solock(so);
				}
				m_freem(cm);
			}
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}

	/* If m is non-NULL, we have some data to read. */
	if (__predict_true(m != NULL)) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			sounlock(so);
			splx(s);
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			s = splsoftnet();
			solock(so);
			if (error != 0) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed && atomic)
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else if (flags & MSG_PEEK)
			moff += len;
		else {
			if (mp != NULL) {
				mt = m_copym(m, 0, len, M_NOWAIT);
				if (__predict_false(mt == NULL)) {
					sounlock(so);
					mt = m_copym(m, 0, len, M_WAIT);
					solock(so);
				}
				*mp = mt;
			}
			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD,
				    NULL, (struct mbuf *)(long)flags, NULL, l);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error != 0) {
				sbunlock(&so->so_rcv);
				sounlock(so);
				splx(s);
				return 0;
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && atomic) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
			    (struct mbuf *)(long)flags, NULL, l);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	sounlock(so);
	splx(s);
	return error;
}

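/*
 * Userland view (illustrative; handle_short_read() is a hypothetical
 * helper): MSG_WAITALL makes soreceive() loop until the full request is
 * satisfied, but as the comments above note, a signal or receive timeout
 * can still yield a short count without an error:
 *
 *	uint32_t hdr;
 *	ssize_t n = recv(s, &hdr, sizeof(hdr), MSG_WAITALL);
 *	if (n != sizeof(hdr))	// 0 = EOF, -1 = error, 1..3 = short read
 *		handle_short_read(n);
 */
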
int
soshutdown(struct socket *so, int how)
{
	const struct protosw	*pr;
	int	error;

	KASSERT(solocked(so));

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR) {
		sorflush(so);
		error = 0;
	}
	if (how == SHUT_WR || how == SHUT_RDWR)
		error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL,
		    NULL, NULL, NULL);

	return error;
}

void
sorflush(struct socket *so)
{
	struct sockbuf	*sb, asb;
	const struct protosw	*pr;

	KASSERT(solocked(so));

	sb = &so->so_rcv;
	pr = so->so_proto;
	socantrcvmore(so);
	sb->sb_flags |= SB_NOINTR;
	(void)sblock(sb, M_WAITOK);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
		sounlock(so);
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
		solock(so);
	}
	sbrelease(&asb, so);
}

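/*
 * Userland view (illustrative): SHUT_RD discards pending receive data via
 * sorflush(); SHUT_WR asks the protocol to send its end-of-data (a FIN
 * for TCP) via PRU_SHUTDOWN, while the socket can still receive.
 *
 *	send(s, req, reqlen, 0);
 *	shutdown(s, SHUT_WR);			// we are done writing
 *	while ((n = read(s, buf, sizeof(buf))) > 0)
 *		;				// drain the reply until EOF
 */
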
/*
 * internal set SOL_SOCKET options
 */
static int
sosetopt1(struct socket *so, const struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	switch (sopt->sopt_name) {

	case SO_ACCEPTFILTER:
		error = accept_filt_setopt(so, sopt);
		KASSERT(solocked(so));
		break;

	case SO_LINGER:
		error = sockopt_get(sopt, &l, sizeof(l));
		solock(so);
		if (error)
			break;
		if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
		    l.l_linger > (INT_MAX / hz)) {
			error = EDOM;
			break;
		}
		so->so_linger = l.l_linger;
		if (l.l_onoff)
			so->so_options |= SO_LINGER;
		else
			so->so_options &= ~SO_LINGER;
		break;

	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_USELOOPBACK:
	case SO_BROADCAST:
	case SO_REUSEADDR:
	case SO_REUSEPORT:
	case SO_OOBINLINE:
	case SO_TIMESTAMP:
		error = sockopt_getint(sopt, &optval);
		solock(so);
		if (error)
			break;
		if (optval)
			so->so_options |= sopt->sopt_name;
		else
			so->so_options &= ~sopt->sopt_name;
		break;

	case SO_SNDBUF:
	case SO_RCVBUF:
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
		error = sockopt_getint(sopt, &optval);
		solock(so);
		if (error)
			break;

		/*
		 * Values < 1 make no sense for any of these
		 * options, so disallow them.
		 */
		if (optval < 1) {
			error = EINVAL;
			break;
		}

		switch (sopt->sopt_name) {
		case SO_SNDBUF:
			if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
				error = ENOBUFS;
				break;
			}
			so->so_snd.sb_flags &= ~SB_AUTOSIZE;
			break;

		case SO_RCVBUF:
			if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
				error = ENOBUFS;
				break;
			}
			so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
			break;

		/*
		 * Make sure the low-water is never greater than
		 * the high-water.
		 */
		case SO_SNDLOWAT:
			if (optval > so->so_snd.sb_hiwat)
				optval = so->so_snd.sb_hiwat;

			so->so_snd.sb_lowat = optval;
			break;

		case SO_RCVLOWAT:
			if (optval > so->so_rcv.sb_hiwat)
				optval = so->so_rcv.sb_hiwat;

			so->so_rcv.sb_lowat = optval;
			break;
		}
		break;

	case SO_SNDTIMEO:
	case SO_RCVTIMEO:
		error = sockopt_get(sopt, &tv, sizeof(tv));
		solock(so);
		if (error)
			break;

		if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
			error = EDOM;
			break;
		}

		optval = tv.tv_sec * hz + tv.tv_usec / tick;
		if (optval == 0 && tv.tv_usec != 0)
			optval = 1;

		switch (sopt->sopt_name) {
		case SO_SNDTIMEO:
			so->so_snd.sb_timeo = optval;
			break;
		case SO_RCVTIMEO:
			so->so_rcv.sb_timeo = optval;
			break;
		}
		break;

	default:
		solock(so);
		error = ENOPROTOOPT;
		break;
	}
	KASSERT(solocked(so));
	return error;
}

int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, prerr;

	if (sopt->sopt_level == SOL_SOCKET) {
		error = sosetopt1(so, sopt);
		KASSERT(solocked(so));
	} else {
		error = ENOPROTOOPT;
		solock(so);
	}

	if ((error == 0 || error == ENOPROTOOPT) &&
	    so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
		/* give the protocol stack a shot */
		prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
		if (prerr == 0)
			error = 0;
		else if (prerr != ENOPROTOOPT)
			error = prerr;
	}
	sounlock(so);
	return error;
}

/*
 * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
 */
int
so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
    const void *val, size_t valsize)
{
	struct sockopt sopt;
	int error;

	KASSERT(valsize == 0 || val != NULL);

	sockopt_init(&sopt, level, name, valsize);
	sockopt_set(&sopt, val, valsize);

	error = sosetopt(so, &sopt);

	sockopt_destroy(&sopt);

	return error;
}

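/*
 * Worked example for the SO_SNDTIMEO/SO_RCVTIMEO conversion above
 * (assuming hz = 100, i.e. tick = 10000 microseconds):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *	// sb_timeo = 2 * 100 + 500000 / 10000 = 250 ticks
 *
 * A nonzero timeout smaller than one tick is rounded up to 1 tick rather
 * than being truncated to "no timeout".
 */
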
/*
 * internal get SOL_SOCKET options
 */
static int
sogetopt1(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	switch (sopt->sopt_name) {

	case SO_ACCEPTFILTER:
		error = accept_filt_getopt(so, sopt);
		break;

	case SO_LINGER:
		l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
		l.l_linger = so->so_linger;

		error = sockopt_set(sopt, &l, sizeof(l));
		break;

	case SO_USELOOPBACK:
	case SO_DONTROUTE:
	case SO_DEBUG:
	case SO_KEEPALIVE:
	case SO_REUSEADDR:
	case SO_REUSEPORT:
	case SO_BROADCAST:
	case SO_OOBINLINE:
	case SO_TIMESTAMP:
		error = sockopt_setint(sopt,
		    (so->so_options & sopt->sopt_name) ? 1 : 0);
		break;

	case SO_TYPE:
		error = sockopt_setint(sopt, so->so_type);
		break;

	case SO_ERROR:
		error = sockopt_setint(sopt, so->so_error);
		so->so_error = 0;
		break;

	case SO_SNDBUF:
		error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
		break;

	case SO_RCVBUF:
		error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
		break;

	case SO_SNDLOWAT:
		error = sockopt_setint(sopt, so->so_snd.sb_lowat);
		break;

	case SO_RCVLOWAT:
		error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
		break;

	case SO_SNDTIMEO:
	case SO_RCVTIMEO:
		optval = (sopt->sopt_name == SO_SNDTIMEO ?
		    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

		tv.tv_sec = optval / hz;
		tv.tv_usec = (optval % hz) * tick;

		error = sockopt_set(sopt, &tv, sizeof(tv));
		break;

	case SO_OVERFLOWED:
		error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
		break;

	default:
		error = ENOPROTOOPT;
		break;
	}

	return (error);
}

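/*
 * Userland view (illustrative): note that reading SO_ERROR above clears
 * so_error, which is exactly what the classic nonblocking connect(2)
 * idiom relies on:
 *
 *	// after poll()/select() reports the socket writable:
 *	int err;
 *	socklen_t len = sizeof(err);
 *	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err != 0)
 *		errno = err;	// connect failed, e.g. ECONNREFUSED
 */
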
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int		error;

	solock(so);
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, sopt));
		} else
			error = (ENOPROTOOPT);
	} else {
		error = sogetopt1(so, sopt);
	}
	sounlock(so);
	return (error);
}

/*
 * alloc sockopt data buffer
 * - will be released at destroy
 */
static int
sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
{

	KASSERT(sopt->sopt_size == 0);

	if (len > sizeof(sopt->sopt_buf)) {
		sopt->sopt_data = kmem_zalloc(len, kmflag);
		if (sopt->sopt_data == NULL)
			return ENOMEM;
	} else
		sopt->sopt_data = sopt->sopt_buf;

	sopt->sopt_size = len;
	return 0;
}

/*
 * initialise sockopt storage
 * - MAY sleep during allocation
 */
void
sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
{

	memset(sopt, 0, sizeof(*sopt));

	sopt->sopt_level = level;
	sopt->sopt_name = name;
	(void)sockopt_alloc(sopt, size, KM_SLEEP);
}

/*
 * destroy sockopt storage
 * - will release any held memory references
 */
void
sockopt_destroy(struct sockopt *sopt)
{

	if (sopt->sopt_data != sopt->sopt_buf)
		kmem_free(sopt->sopt_data, sopt->sopt_size);

	memset(sopt, 0, sizeof(*sopt));
}

/*
 * set sockopt value
 * - value is copied into sockopt
 * - memory is allocated when necessary, will not sleep
 */
int
sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;

	if (sopt->sopt_size == 0) {
		error = sockopt_alloc(sopt, len, KM_NOSLEEP);
		if (error)
			return error;
	}

	KASSERT(sopt->sopt_size == len);
	memcpy(sopt->sopt_data, buf, len);
	return 0;
}

/*
 * common case of set sockopt integer value
 */
int
sockopt_setint(struct sockopt *sopt, int val)
{

	return sockopt_set(sopt, &val, sizeof(int));
}

/*
 * get sockopt value
 * - correct size must be given
 */
int
sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
{

	if (sopt->sopt_size != len)
		return EINVAL;

	memcpy(buf, sopt->sopt_data, len);
	return 0;
}

/*
 * common case of get sockopt integer value
 */
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{

	return sockopt_get(sopt, valp, sizeof(int));
}

/*
 * set sockopt value from mbuf
 * - ONLY for legacy code
 * - mbuf is released by sockopt
 * - will not sleep
 */
int
sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t len;
	int error;

	len = m_length(m);

	if (sopt->sopt_size == 0) {
		error = sockopt_alloc(sopt, len, KM_NOSLEEP);
		if (error)
			return error;
	}

	KASSERT(sopt->sopt_size == len);
	m_copydata(m, 0, len, sopt->sopt_data);
	m_freem(m);

	return 0;
}

/*
 * get sockopt value into mbuf
 * - ONLY for legacy code
 * - mbuf to be released by the caller
 * - will not sleep
 */
struct mbuf *
sockopt_getmbuf(const struct sockopt *sopt)
{
	struct mbuf *m;

	if (sopt->sopt_size > MCLBYTES)
		return NULL;

	m = m_get(M_DONTWAIT, MT_SOOPTS);
	if (m == NULL)
		return NULL;

	if (sopt->sopt_size > MLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}

	memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
	m->m_len = sopt->sopt_size;

	return m;
}

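/*
 * In-kernel usage sketch (illustrative): reading an option with the
 * sockopt helpers above mirrors the pattern of so_setsockopt(), e.g.:
 *
 *	struct sockopt sopt;
 *	int val, error;
 *
 *	sockopt_init(&sopt, SOL_SOCKET, SO_RCVBUF, sizeof(int));
 *	error = sogetopt(so, &sopt);
 *	if (error == 0)
 *		error = sockopt_getint(&sopt, &val);
 *	sockopt_destroy(&sopt);
 */
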
void
sohasoutofband(struct socket *so)
{

	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, 0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket	*so;

	so = ((file_t *)kn->kn_obj)->f_data;
	solock(so);
	SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	sounlock(so);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket	*so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_data;
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error)	/* temporary udp error */
		rv = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		rv = (kn->kn_data >= kn->kn_sdata);
	else
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket	*so;

	so = ((file_t *)kn->kn_obj)->f_data;
	solock(so);
	SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	sounlock(so);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket	*so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_data;
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error)	/* temporary udp error */
		rv = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		rv = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		rv = (kn->kn_data >= kn->kn_sdata);
	else
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket	*so;
	int rv;

	so = ((file_t *)kn->kn_obj)->f_data;

	/*
	 * Set kn_data to number of incoming connections, not
	 * counting partial (incomplete) connections.
	 */
	if (hint != NOTE_SUBMIT)
		solock(so);
	kn->kn_data = so->so_qlen;
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		sounlock(so);
	return rv;
}

static const struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static const struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static const struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket	*so;
	struct sockbuf	*sb;

	so = ((file_t *)kn->kn_obj)->f_data;
	solock(so);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		sounlock(so);
		return (EINVAL);
	}
	SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	sounlock(so);
	return (0);
}

static int
sodopoll(struct socket *so, int events)
{
	int revents;

	revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowritable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	return revents;
}

int
sopoll(struct socket *so, int events)
{
	int revents = 0;

#ifndef DIAGNOSTIC
	/*
	 * Do a quick, unlocked check in expectation that the socket
	 * will be ready for I/O.  Don't do this check if DIAGNOSTIC,
	 * as the solocked() assertions will fail.
	 */
	if ((revents = sodopoll(so, events)) != 0)
		return revents;
#endif

	solock(so);
	if ((revents = sodopoll(so, events)) == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			selrecord(curlwp, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_NOTIFY;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(curlwp, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_NOTIFY;
		}
	}
	sounlock(so);

	return revents;
}

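/*
 * Userland view (illustrative): filt_soread() honours NOTE_LOWAT, so a
 * kevent(2) read filter can ask not to fire until a minimum amount of
 * data has accumulated:
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *	EV_SET(&ev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// fires once sb_cc >= 512
 *
 * On a listening socket the same filter becomes filt_solisten() and the
 * returned ev.data reports the number of completed connections.
 */
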

#include <sys/sysctl.h>

static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);

/*
 * sysctl helper routine for kern.somaxkva.  ensures that the given
 * value is not too small.
 * (XXX should we maybe make sure it's not too large as well?)
 */
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
	int error, new_somaxkva;
	struct sysctlnode node;

	new_somaxkva = somaxkva;
	node = *rnode;
	node.sysctl_data = &new_somaxkva;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
		return (EINVAL);

	mutex_enter(&so_pendfree_lock);
	somaxkva = new_somaxkva;
	cv_broadcast(&socurkva_cv);
	mutex_exit(&so_pendfree_lock);

	return (error);
}

SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "kern", NULL,
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "somaxkva",
	    SYSCTL_DESCR("Maximum amount of kernel memory to be "
	        "used for socket buffers"),
	    sysctl_kern_somaxkva, 0, NULL, 0,
	    CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
}

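/*
 * Userland view (illustrative): the node created above is tunable at run
 * time; the handler rejects values below the 16 MB floor with EINVAL and
 * wakes any senders waiting on the KVA limit when the limit is raised.
 *
 *	int v = 32 * 1024 * 1024;
 *	if (sysctlbyname("kern.somaxkva", NULL, NULL, &v, sizeof(v)) == -1)
 *		err(1, "sysctlbyname");	// EINVAL if v < 16 MB
 */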