/*	$NetBSD: uipc_socket.c,v 1.129 2006/11/01 10:17:59 yamt Exp $	*/

/*-
 * Copyright (c) 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.129 2006/11/01 10:17:59 yamt Exp $");

#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/pool.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>

#include <uvm/uvm.h>

POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);

MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");

extern int	somaxconn;		/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)		(ev)->ev_count++

EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else

#define	SOSEND_COUNTER_INCR(ev)		/* nothing */

#endif /* SOSEND_COUNTERS */

static struct callback_entry sokva_reclaimerentry;

#ifdef SOSEND_NO_LOAN
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif

static struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER;
static struct mbuf *so_pendfree;

#ifndef SOMAXKVA
#define	SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static int sokvawaiters;

#define	SOCK_LOAN_CHUNK	65536

static size_t sodopendfree(void);
static size_t sodopendfreel(void);
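
/*
 * Illustrative note (not from the original source): loaned send buffers
 * consume kernel virtual address space, accounted in socurkva against the
 * somaxkva ceiling.  For example, with the default somaxkva of 16MB and
 * SOCK_LOAN_CHUNK of 64KB, at most 256 maximum-sized loan windows can be
 * mapped at once; further callers of sokvareserve() below sleep on
 * &socurkva until sodopendfreel() or sokvaunreserve() releases space.
 */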
static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
	int s;
	int error;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	while (socurkva + len > somaxkva) {
		size_t freed;

		/*
		 * try to do pendfree.
		 */

		freed = sodopendfreel();

		/*
		 * if some kva was freed, try again.
		 */

		if (freed)
			continue;

		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		sokvawaiters++;
		error = ltsleep(&socurkva, PVM | PCATCH, "sokva", 0,
		    &so_pendfree_slock);
		sokvawaiters--;
		if (error) {
			len = 0;
			break;
		}
	}
	socurkva += len;
	simple_unlock(&so_pendfree_slock);
	splx(s);
	return len;
}

static void
sokvaunreserve(vsize_t len)
{
	int s;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	socurkva -= len;
	if (sokvawaiters)
		wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);
}

/*
 * sokvaalloc: allocate kva for loan.
 */

vaddr_t
sokvaalloc(vsize_t len, struct socket *so)
{
	vaddr_t lva;

	/*
	 * reserve kva.
	 */

	if (sokvareserve(so, len) == 0)
		return 0;

	/*
	 * allocate kva.
	 */

	lva = uvm_km_alloc(kernel_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
	if (lva == 0) {
		sokvaunreserve(len);
		return (0);
	}

	return lva;
}

/*
 * sokvafree: free kva for loan.
 */

void
sokvafree(vaddr_t sva, vsize_t len)
{

	/*
	 * free kva.
	 */

	uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);

	/*
	 * unreserve kva.
	 */

	sokvaunreserve(len);
}

static void
sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
{
	vaddr_t va, sva, eva;
	vsize_t len;
	paddr_t pa;
	int i, npgs;

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	if (__predict_false(pgs == NULL)) {
		pgs = alloca(npgs * sizeof(*pgs));

		for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
			if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
				panic("sodoloanfree: va 0x%lx not mapped", va);
			pgs[i] = PHYS_TO_VM_PAGE(pa);
		}
	}

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	sokvafree(sva, len);
}

static size_t
sodopendfree(void)
{
	int s;
	size_t rv;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	rv = sodopendfreel();
	simple_unlock(&so_pendfree_slock);
	splx(s);

	return rv;
}

/*
 * sodopendfreel: free mbufs on "pendfree" list.
 * unlock and relock so_pendfree_slock when freeing mbufs.
 *
 * => called with so_pendfree_slock held.
 * => called at splvm.
 */

static size_t
sodopendfreel(void)
{
	size_t rv = 0;

	LOCK_ASSERT(simple_lock_held(&so_pendfree_slock));

	for (;;) {
		struct mbuf *m;
		struct mbuf *next;

		m = so_pendfree;
		if (m == NULL)
			break;
		so_pendfree = NULL;
		simple_unlock(&so_pendfree_slock);
		/* XXX splx */

		for (; m != NULL; m = next) {
			next = m->m_next;

			rv += m->m_ext.ext_size;
			sodoloanfree((m->m_flags & M_EXT_PAGES) ?
			    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
			    m->m_ext.ext_size);
			pool_cache_put(&mbpool_cache, m);
		}

		/* XXX splvm */
		simple_lock(&so_pendfree_slock);
	}

	return (rv);
}
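
/*
 * Illustrative sketch (not from the original source): soloanfree() below
 * is the MEXTADD free routine for loaned pages.  Because it may run in
 * interrupt context, where kernel_map operations are unsafe, it only
 * links the exhausted mbuf onto so_pendfree:
 *
 *	m->m_next = so_pendfree;    (under so_pendfree_slock, at splvm)
 *	so_pendfree = m;
 *
 * The kva is reclaimed later, in thread context, when sodopendfree()
 * walks that list from sosend()/soreceive() or from the map-reclaim
 * callback registered in soinit().
 */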
void
soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
{
	int s;

	if (m == NULL) {

		/*
		 * called from MEXTREMOVE.
		 */

		sodoloanfree(NULL, buf, size);
		return;
	}

	/*
	 * postpone freeing mbuf.
	 *
	 * we can't do it in interrupt context
	 * because we need to put kva back to kernel_map.
	 */

	s = splvm();
	simple_lock(&so_pendfree_slock);
	m->m_next = so_pendfree;
	so_pendfree = m;
	if (sokvawaiters)
		wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);
}

static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	vaddr_t lva, va;
	int npgs, i, error;

	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
		return (0);

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	/* XXX KDASSERT */
	KASSERT(npgs <= M_EXT_MAXPAGES);

	lva = sokvaalloc(len, so);
	if (lva == 0)
		return 0;

	error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
	if (error) {
		sokvafree(lva, len);
		return (0);
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
		    VM_PROT_READ);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (space);
}

static int
sokva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
{

	KASSERT(ce == &sokva_reclaimerentry);
	KASSERT(obj == NULL);

	sodopendfree();
	if (!vm_map_starved_p(kernel_map)) {
		return CALLBACK_CHAIN_ABORT;
	}
	return CALLBACK_CHAIN_CONTINUE;
}

void
soinit(void)
{

	/* Set the initial adjusted socket buffer size. */
	if (sb_max_set(sb_max))
		panic("bad initial sb_max value: %lu", sb_max);

	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
	    &sokva_reclaimerentry, NULL, sokva_reclaim_callback);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
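
/*
 * Illustrative sketch (not from the original source): each operation
 * funnels into the protocol's single user-request entry point.  A
 * socket(2) call, for instance, reaches this file roughly as:
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, l);
 *	    -> prp = pffindtype(AF_INET, SOCK_STREAM);
 *	    -> (*prp->pr_usrreq)(so, PRU_ATTACH, ...);   e.g. tcp_usrreq()
 *
 * and bind(2) likewise becomes sobind() -> (*pr_usrreq)(so, PRU_BIND, ...).
 */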
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l)
{
	const struct protosw	*prp;
	struct socket	*so;
	uid_t		uid;
	int		error, s;

	if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
	    KAUTH_REQ_NETWORK_SOCKET_OPEN, (void *)(u_long)dom,
	    (void *)(u_long)type, (void *)(u_long)proto) != 0)
		return (EPERM);

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0) {
		/* no support for domain */
		if (pffinddomain(dom) == 0)
			return (EAFNOSUPPORT);
		/* no support for socket type */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	memset((caddr_t)so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
#ifdef MBUFTRACE
	so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
	if (l != NULL) {
		uid = kauth_cred_geteuid(l->l_cred);
	} else {
		uid = 0;
	}
	so->so_uidinfo = uid_find(uid);
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, l);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, l);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	splx(s);
	return (0);
}
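
/*
 * Illustrative note (not from the original source): the backlog passed to
 * listen(2) is clamped to the patchable somaxconn above.  With the stock
 * SOMAXCONN of 128, listen(s, 1024) yields so_qlimit = 128, while a
 * negative backlog is first raised to 0.
 */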
void
sofree(struct socket *so)
{

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
		    RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
		    RLIM_INFINITY);
	sbrelease(&so->so_snd, so);
	sorflush(so);
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket	*so2;
	int		s, error;

	error = 0;
	s = splsoftnet();		/* conservative */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, netcls,
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct lwp *)0);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}

/*
 * Must be called at splsoftnet...
 */
int
soabort(struct socket *so)
{

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int	s, error;

	error = 0;
	s = splsoftnet();
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, (struct lwp *)0);
	else
		error = ECONNABORTED;

	splx(s);
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	int	s, error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, l);
	splx(s);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int	s, error;

	s = splsoftnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
	    (struct lwp *)0);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int	s, error;

	s = splsoftnet();
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct lwp *)0);
 bad:
	splx(s);
	sodopendfree();
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags, struct lwp *l)
{
	struct mbuf	**mp, *m;
	struct proc	*p;
	long		space, len, resid, clen, mlen;
	int		error, s, dontroute, atomic;

	p = l->l_proc;
	sodopendfree();

	clen = 0;
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				MCLAIM(m, so->so_snd.sb_mowner);
				if (sock_loan_thresh >= 0 &&
				    uio->uio_iov->iov_len >= sock_loan_thresh &&
				    space >= sock_loan_thresh &&
				    (len = sosend_loan(so, uio, m,
				    space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					m_clget(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
 have_data:
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, curlwp);	/* XXX */
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
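
/*
 * Illustrative note (not from the original source): with the default
 * sock_loan_thresh of 4096, a sufficiently large write (say 32KB) from a
 * user buffer is not copied; sosend() above calls sosend_loan(), which
 * loans the user pages via uvm_loan(), maps them read-only into kernel
 * kva, and attaches them to the mbuf with MEXTADD() using soloanfree() as
 * the free routine.  Building the kernel with SOSEND_NO_LOAN (thresh of
 * -1) disables this zero-copy path entirely.
 */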
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct lwp *l = curlwp;
	struct mbuf	*m, **mp;
	int		flags, len, error, s, offset, moff, type, orig_resid;
	const struct protosw	*pr;
	struct mbuf	*nextrecord;
	int		mbuf_removed = 0;

	pr = so->so_proto;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;

	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if ((flags & MSG_DONTWAIT) == 0)
		sodopendfree();

	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK),
		    (struct mbuf *)0, l);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, l);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * While we process the initial mbufs containing address and control
	 * info, we save a copy of m->m_nextpkt into nextrecord.
	 */
	if (l)
		l->l_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (controlp) {
				struct domain *dom = pr->pr_domain;
				if (dom->dom_externalize && l &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*dom->dom_externalize)(m, l);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose &&
				    mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
					(*pr->pr_domain->dom_dispose)(m);
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * If m is non-NULL, we have some data to read.  From now on,
	 * make sure to keep sb_lastrecord consistent when working on
	 * the last packet on the chain (nextrecord == NULL) and we
	 * change m->m_nextpkt.
	 */
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m);
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
			if (error) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed
				    && (pr->pr_flags & PR_ATOMIC))
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD,
				    (struct mbuf *)0,
				    (struct mbuf *)(long)flags,
				    (struct mbuf *)0, l);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0, l);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
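
/*
 * Illustrative note (not from the original source): on the MSG_WAITALL
 * path above, a signal or timeout from sbwait() deliberately returns 0
 * rather than the error, so the caller sees a short count instead of
 * losing data already copied out.  A kernel consumer wanting a record
 * delivered as an mbuf chain would call, roughly:
 *
 *	error = (*so->so_receive)(so, NULL, &auio, &m, NULL, &flags);
 *
 * with auio.uio_resid set to the maximum length and m (the mp0 argument)
 * receiving the chain.
 */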
int
soshutdown(struct socket *so, int how)
{
	const struct protosw	*pr;

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR)
		sorflush(so);
	if (how == SHUT_WR || how == SHUT_RDWR)
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct lwp *)0);
	return (0);
}

void
sorflush(struct socket *so)
{
	struct sockbuf	*sb, asb;
	const struct protosw	*pr;
	int		s;

	sb = &so->so_rcv;
	pr = so->so_proto;
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int		error;
	struct mbuf	*m;

	error = 0;
	m = m0;
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof(struct linger)) {
				error = EINVAL;
				goto bad;
			}
			if (mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > (INT_MAX / hz)) {
				error = EDOM;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval, so) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			int val;

			if (m == NULL || m->m_len < sizeof(*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;
			if (val == 0 && tv->tv_usec != 0)
				val = 1;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
 bad:
	if (m)
		(void) m_free(m);
	return (error);
}
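
/*
 * Illustrative note (not from the original source): socket timeouts are
 * stored in clock ticks.  With hz = 100 (tick = 10000us), an SO_RCVTIMEO
 * of { tv_sec = 2, tv_usec = 500000 } converts to
 *
 *	val = 2 * 100 + 500000 / 10000 = 250 ticks,
 *
 * and any nonzero timeout too small to register (val == 0 with
 * tv_usec != 0) is rounded up to one tick rather than meaning "forever".
 */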
int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf	*m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof(struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		case SO_OVERFLOWED:
			*mtod(m, int *) = so->so_rcv.sb_overflowed;
			break;

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selwakeup(&so->so_rcv.sb_sel);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
		so->so_snd.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}
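
/*
 * Illustrative note (not from the original source): a userland consumer
 * can raise the readiness threshold checked by filt_soread() above with
 * NOTE_LOWAT, e.g.
 *
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *
 * after which the knote fires only once at least 4096 bytes (kn_sdata)
 * are queued, instead of the socket's sb_lowat default.
 */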
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;

	/*
	 * Set kn_data to number of incoming connections, not
	 * counting partial (incomplete) connections.
	 */
	kn->kn_data = so->so_qlen;
	return (kn->kn_data > 0);
}

static const struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static const struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static const struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket	*so;
	struct sockbuf	*sb;

	so = (struct socket *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}
	SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	return (0);
}

#include <sys/sysctl.h>

static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);

/*
 * sysctl helper routine for kern.somaxkva.  ensures that the given
 * value is not too small.
 * (XXX should we maybe make sure it's not too large as well?)
 */
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
	int error, new_somaxkva;
	struct sysctlnode node;
	int s;

	new_somaxkva = somaxkva;
	node = *rnode;
	node.sysctl_data = &new_somaxkva;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
		return (EINVAL);

	s = splvm();
	simple_lock(&so_pendfree_slock);
	somaxkva = new_somaxkva;
	wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);

	return (error);
}

SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "kern", NULL,
		       NULL, 0, NULL, 0,
		       CTL_KERN, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "somaxkva",
		       SYSCTL_DESCR("Maximum amount of kernel memory to be "
				    "used for socket buffers"),
		       sysctl_kern_somaxkva, 0, NULL, 0,
		       CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
}
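
/*
 * Illustrative note (not from the original source): the handler above lets
 * an administrator grow the loan kva ceiling at runtime, e.g.
 *
 *	sysctl -w kern.somaxkva=33554432	(32MB)
 *
 * Values below the 16MB floor are rejected with EINVAL; on success any
 * senders sleeping in sokvareserve() are woken to re-check the new limit.
 */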