/*	$NetBSD: uipc_socket.c,v 1.121 2006/06/21 12:55:12 yamt Exp $	*/

/*-
 * Copyright (c) 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.121 2006/06/21 12:55:12 yamt Exp $");

#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/pool.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>

#include <uvm/uvm.h>

POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);

MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");

extern int	somaxconn;		/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)		(ev)->ev_count++

EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else

#define	SOSEND_COUNTER_INCR(ev)		/* nothing */

#endif /* SOSEND_COUNTERS */

static struct callback_entry sokva_reclaimerentry;

#ifdef SOSEND_NO_LOAN
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif

static struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER;
static struct mbuf *so_pendfree;

#ifndef SOMAXKVA
#define	SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static int sokvawaiters;

#define	SOCK_LOAN_CHUNK		65536

static size_t sodopendfree(void);
static size_t sodopendfreel(void);

static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
	int s;
	int error;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	while (socurkva + len > somaxkva) {
		size_t freed;

		/*
		 * try to do pendfree.
		 */

		freed = sodopendfreel();

		/*
		 * if some kva was freed, try again.
		 */

		if (freed)
			continue;

		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		sokvawaiters++;
		error = ltsleep(&socurkva, PVM | PCATCH, "sokva", 0,
		    &so_pendfree_slock);
		sokvawaiters--;
		if (error) {
			len = 0;
			break;
		}
	}
	socurkva += len;
	simple_unlock(&so_pendfree_slock);
	splx(s);
	return len;
}

static void
sokvaunreserve(vsize_t len)
{
	int s;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	socurkva -= len;
	if (sokvawaiters)
		wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);
}

/*
 * sokvaalloc: allocate kva for loan.
 */

vaddr_t
sokvaalloc(vsize_t len, struct socket *so)
{
	vaddr_t lva;

	/*
	 * reserve kva.
	 */

	if (sokvareserve(so, len) == 0)
		return 0;

	/*
	 * allocate kva.
	 */

	lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (lva == 0) {
		sokvaunreserve(len);
		return (0);
	}

	return lva;
}

/*
 * sokvafree: free kva for loan.
 */

void
sokvafree(vaddr_t sva, vsize_t len)
{

	/*
	 * free kva.
	 */

	uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);

	/*
	 * unreserve kva.
	 */

	sokvaunreserve(len);
}
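
/*
 * Illustrative sketch (an addition for exposition, not original code):
 * callers of the loan KVA allocator pair sokvaalloc() with sokvafree()
 * and must tolerate failure, since sokvareserve() returns 0 when the
 * somaxkva budget is exhausted and the sleep is interrupted by a signal:
 *
 *	vaddr_t lva = sokvaalloc(len, so);
 *	if (lva == 0)
 *		return 0;	(fall back to the ordinary copy path)
 *	...enter the loaned pages at lva with pmap_kenter_pa()...
 *	sokvafree(lva, len);	(releases the VA and wakes budget waiters)
 */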

static void
sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
{
	vaddr_t va, sva, eva;
	vsize_t len;
	paddr_t pa;
	int i, npgs;

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	if (__predict_false(pgs == NULL)) {
		pgs = alloca(npgs * sizeof(*pgs));

		for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
			if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
				panic("sodoloanfree: va 0x%lx not mapped", va);
			pgs[i] = PHYS_TO_VM_PAGE(pa);
		}
	}

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	sokvafree(sva, len);
}

static size_t
sodopendfree(void)
{
	int s;
	size_t rv;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	rv = sodopendfreel();
	simple_unlock(&so_pendfree_slock);
	splx(s);

	return rv;
}

/*
 * sodopendfreel: free mbufs on "pendfree" list.
 * unlock and relock so_pendfree_slock when freeing mbufs.
 *
 * => called with so_pendfree_slock held.
 * => called at splvm.
 */

static size_t
sodopendfreel(void)
{
	size_t rv = 0;

	LOCK_ASSERT(simple_lock_held(&so_pendfree_slock));

	for (;;) {
		struct mbuf *m;
		struct mbuf *next;

		m = so_pendfree;
		if (m == NULL)
			break;
		so_pendfree = NULL;
		simple_unlock(&so_pendfree_slock);
		/* XXX splx */

		for (; m != NULL; m = next) {
			next = m->m_next;

			rv += m->m_ext.ext_size;
			sodoloanfree((m->m_flags & M_EXT_PAGES) ?
			    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
			    m->m_ext.ext_size);
			pool_cache_put(&mbpool_cache, m);
		}

		/* XXX splvm */
		simple_lock(&so_pendfree_slock);
	}

	return (rv);
}
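
/*
 * Note on the deferred-free scheme above (explanatory addition):
 * soloanfree() may be called from interrupt context, where returning
 * KVA to kernel_map is not permitted, so loaned-buffer mbufs are queued
 * on so_pendfree and reclaimed later by sodopendfree()/sodopendfreel()
 * from thread context -- at the top of sosend()/soreceive() and from
 * the kernel_map reclaim callback registered in soinit().
 */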

void
soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
{
	int s;

	if (m == NULL) {

		/*
		 * called from MEXTREMOVE.
		 */

		sodoloanfree(NULL, buf, size);
		return;
	}

	/*
	 * postpone freeing mbuf.
	 *
	 * we can't do it in interrupt context
	 * because we need to put kva back to kernel_map.
	 */

	s = splvm();
	simple_lock(&so_pendfree_slock);
	m->m_next = so_pendfree;
	so_pendfree = m;
	if (sokvawaiters)
		wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);
}

static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	vaddr_t lva, va;
	int npgs, i, error;

	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
		return (0);

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	/* XXX KDASSERT */
	KASSERT(npgs <= M_EXT_MAXPAGES);

	lva = sokvaalloc(len, so);
	if (lva == 0)
		return 0;

	error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
	if (error) {
		sokvafree(lva, len);
		return (0);
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
		    VM_PROT_READ);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (space);
}
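
/*
 * Worked example for sosend_loan() (illustrative; the numbers are
 * assumptions): with PAGE_SIZE 4096, a write of 10000 bytes starting at
 * user address 0x20000234 is rounded out to page boundaries, giving
 * sva = 0x20000000, eva = 0x20003000, len = 12288 and npgs = 3.  The
 * three user pages are loaned and mapped read-only at lva, and lva is
 * then offset by 0x234 so the mbuf's external buffer addresses the
 * first user byte; no data is copied.
 */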

static int
sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
{

	KASSERT(ce == &sokva_reclaimerentry);
	KASSERT(obj == NULL);

	sodopendfree();
	if (!vm_map_starved_p(kernel_map)) {
		return CALLBACK_CHAIN_ABORT;
	}
	return CALLBACK_CHAIN_CONTINUE;
}

void
soinit(void)
{

	/* Set the initial adjusted socket buffer size. */
	if (sb_max_set(sb_max))
		panic("bad initial sb_max value: %lu", sb_max);

	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
	    &sokva_reclaimerentry, NULL, sokva_reclaim_callback);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l)
{
	const struct protosw	*prp;
	struct socket	*so;
	uid_t		uid;
	int		error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0) {
		/* no support for domain */
		if (pffinddomain(dom) == 0)
			return (EAFNOSUPPORT);
		/* no support for socket type */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	memset((caddr_t)so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
#ifdef MBUFTRACE
	so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
	if (l != NULL) {
		uid = kauth_cred_geteuid(l->l_proc->p_cred);
	} else {
		uid = 0;
	}
	so->so_uidinfo = uid_find(uid);
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, l);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}
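
/*
 * Usage sketch (an illustration, not part of the original file): the
 * system-call layer builds a listening stream socket roughly like this,
 * where "nam" is an mbuf holding a struct sockaddr and error handling
 * is elided:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, l);
 *	if (error == 0)
 *		error = sobind(so, nam, l);
 *	if (error == 0)
 *		error = solisten(so, 128);	(clamped to somaxconn)
 */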

int
sobind(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, l);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	splx(s);
	return (0);
}

void
sofree(struct socket *so)
{

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
		    RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
		    RLIM_INFINITY);
	sbrelease(&so->so_snd, so);
	sorflush(so);
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket	*so2;
	int		s, error;

	error = 0;
	s = splsoftnet();		/* conservative */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
					       PSOCK | PCATCH, netcls,
					       so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct lwp *)0);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
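
/*
 * Behavioural note on soclose() (explanatory addition): pending
 * connections on both accept queues are aborted first.  If the socket
 * is connected and SO_LINGER is set, a blocking close sleeps on
 * so_timeo for up to so_linger * hz ticks while the disconnect drains;
 * a non-blocking lingering close that is already disconnecting skips
 * the wait and proceeds straight to PRU_DETACH.
 */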

/*
 * Must be called at splsoftnet...
 */
int
soabort(struct socket *so)
{

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int	s, error;

	error = 0;
	s = splsoftnet();
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, (struct lwp *)0);
	else
		error = ECONNABORTED;

	splx(s);
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int	s, error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, l);
	splx(s);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int	s, error;

	s = splsoftnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
	    (struct lwp *)0);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int	s, error;

	s = splsoftnet();
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct lwp *)0);
 bad:
	splx(s);
	sodopendfree();
	return (error);
}
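
/*
 * Note on soconnect() (explanatory addition): for connection-based
 * protocols (PR_CONNREQUIRED) a second connect on a connected or
 * connecting socket fails with EISCONN.  Datagram sockets are instead
 * disconnected first, so a caller can replace the association by
 * connecting again, or dissolve it by connecting to a null address,
 * as the comment in the function notes.
 */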

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags, struct lwp *l)
{
	struct mbuf	**mp, *m;
	struct proc	*p;
	long		space, len, resid, clen, mlen;
	int		error, s, dontroute, atomic;

	p = l->l_proc;
	sodopendfree();

	clen = 0;
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				MCLAIM(m, so->so_snd.sb_mowner);
				if (sock_loan_thresh >= 0 &&
				    uio->uio_iov->iov_len >= sock_loan_thresh &&
				    space >= sock_loan_thresh &&
				    (len = sosend_loan(so, uio, m,
				     space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					m_clget(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
 have_data:
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curlwp);	/* XXX */
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
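
/*
 * Worked example of the sosend() space checks (illustrative; the
 * numbers are assumptions): on a PR_ATOMIC socket with sb_hiwat 8192,
 * a 9000 byte datagram fails at once with EMSGSIZE because it can
 * never fit.  A 4000 byte datagram when sbspace() reports 1000 is not
 * an error: the sender unlocks, sleeps in sbwait() until the protocol
 * drains the buffer, and retries from "restart".  MSG_OOB may
 * overcommit by 1024 bytes so urgent data can bypass a full buffer.
 */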

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct lwp *l = curlwp;
	struct mbuf	*m, **mp;
	int		flags, len, error, s, offset, moff, type, orig_resid;
	const struct protosw	*pr;
	struct mbuf	*nextrecord;
	int		mbuf_removed = 0;

	pr = so->so_proto;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;

	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if ((flags & MSG_DONTWAIT) == 0)
		sodopendfree();

	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK),
		    (struct mbuf *)0, l);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, l);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * While we process the initial mbufs containing address and control
	 * info, we save a copy of m->m_nextpkt into nextrecord.
	 */
	if (l)
		l->l_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (controlp) {
				struct domain *dom = pr->pr_domain;
				if (dom->dom_externalize && l &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*dom->dom_externalize)(m, l);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose &&
				    mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
					(*pr->pr_domain->dom_dispose)(m);
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * If m is non-NULL, we have some data to read.  From now on,
	 * make sure to keep sb_lastrecord consistent when working on
	 * the last packet on the chain (nextrecord == NULL) and we
	 * change m->m_nextpkt.
	 */
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m);
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
			if (error) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed
				    && (pr->pr_flags & PR_ATOMIC))
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD,
				    (struct mbuf *)0,
				    (struct mbuf *)(long)flags,
				    (struct mbuf *)0, l);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0, l);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
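
/*
 * Behavioural notes on soreceive() (explanatory addition): MSG_PEEK
 * walks a record without unlinking mbufs, so no sockbuf pointers are
 * updated.  MSG_WAITALL keeps the sockbuf locked and loops in sbwait()
 * until uio_resid is satisfied, returning a short count without error
 * if a signal or timeout interrupts the wait.  For PR_ATOMIC protocols
 * any unread remainder of a record is dropped and MSG_TRUNC is set,
 * preserving datagram boundaries.
 */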

int
soshutdown(struct socket *so, int how)
{
	const struct protosw	*pr;

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR)
		sorflush(so);
	if (how == SHUT_WR || how == SHUT_RDWR)
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0);
	return (0);
}

void
sorflush(struct socket *so)
{
	struct sockbuf	*sb, asb;
	const struct protosw	*pr;
	int		s;

	sb = &so->so_rcv;
	pr = so->so_proto;
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int		error;
	struct mbuf	*m;

	error = 0;
	m = m0;
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof(struct linger)) {
				error = EINVAL;
				goto bad;
			}
			if (mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > (INT_MAX / hz)) {
				error = EDOM;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval, so) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			int val;

			if (m == NULL || m->m_len < sizeof(*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;
			if (val == 0 && tv->tv_usec != 0)
				val = 1;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
 bad:
	if (m)
		(void) m_free(m);
	return (error);
}
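
/*
 * Worked example for the SO_SNDTIMEO/SO_RCVTIMEO conversion above
 * (illustrative; assumes hz = 100, so tick = 10000): a timeout of
 * { tv_sec = 2, tv_usec = 500000 } becomes 2 * 100 + 500000 / 10000 =
 * 250 ticks.  A nonzero timeout that would otherwise round down to 0
 * ticks is forced up to 1 so it remains distinguishable from
 * "no timeout".
 */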

int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf	*m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof(struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		case SO_OVERFLOWED:
			*mtod(m, int *) = so->so_rcv.sb_overflowed;
			break;

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selwakeup(&so->so_rcv.sb_sel);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
		so->so_snd.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;

	/*
	 * Set kn_data to number of incoming connections, not
	 * counting partial (incomplete) connections.
	 */
	kn->kn_data = so->so_qlen;
	return (kn->kn_data > 0);
}

static const struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static const struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static const struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket	*so;
	struct sockbuf	*sb;

	so = (struct socket *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}
	SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	return (0);
}
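
/*
 * Usage sketch for the kqueue glue above (an illustration, not from
 * the original file): a process watching a listening socket registers
 * EVFILT_READ, which soo_kqfilter() maps to solisten_filtops, so the
 * event fires once so_qlen (completed connections) becomes nonzero:
 *
 *	struct kevent ev;
 *	EV_SET(&ev, sock_fd, EVFILT_READ, EV_ADD, 0, 0, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	(from userland)
 */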

#include <sys/sysctl.h>

static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);

/*
 * sysctl helper routine for kern.somaxkva.  ensures that the given
 * value is not too small.
 * (XXX should we maybe make sure it's not too large as well?)
 */
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
	int error, new_somaxkva;
	struct sysctlnode node;
	int s;

	new_somaxkva = somaxkva;
	node = *rnode;
	node.sysctl_data = &new_somaxkva;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
		return (EINVAL);

	s = splvm();
	simple_lock(&so_pendfree_slock);
	somaxkva = new_somaxkva;
	wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);

	return (error);
}

SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "kern", NULL,
		       NULL, 0, NULL, 0,
		       CTL_KERN, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "somaxkva",
		       SYSCTL_DESCR("Maximum amount of kernel memory to be "
				    "used for socket buffers"),
		       sysctl_kern_somaxkva, 0, NULL, 0,
		       CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
}
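
/*
 * Tuning note (explanatory addition): kern.somaxkva bounds the kernel
 * virtual address space used for loaned socket-buffer pages and
 * defaults to SOMAXKVA (16MB).  The handler above rejects values below
 * that floor and wakes senders sleeping in sokvareserve() so they can
 * re-evaluate against the new limit, e.g. after
 * "sysctl -w kern.somaxkva=33554432".
 */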