/*	$NetBSD: uipc_socket.c,v 1.107 2004/09/03 18:14:09 darrenr Exp $	*/

/*-
 * Copyright (c) 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.107 2004/09/03 18:14:09 darrenr Exp $");

#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/pool.h>
#include <sys/event.h>
#include <sys/poll.h>

#include <uvm/uvm.h>

POOL_INIT(socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);

MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");

extern int	somaxconn;		/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)		(ev)->ev_count++

EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else

#define	SOSEND_COUNTER_INCR(ev)		/* nothing */

#endif /* SOSEND_COUNTERS */

void
soinit(void)
{

	/* Set the initial adjusted socket buffer size. */
	if (sb_max_set(sb_max))
		panic("bad initial sb_max value: %lu\n", sb_max);

}

#ifdef SOSEND_NO_LOAN
int use_sosend_loan = 0;
#else
int use_sosend_loan = 1;
#endif

struct simplelock so_pendfree_slock = SIMPLELOCK_INITIALIZER;
struct mbuf *so_pendfree;

#ifndef SOMAXKVA
#define	SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
int socurkva;
int sokvawaiters;

#define	SOCK_LOAN_THRESH	4096
#define	SOCK_LOAN_CHUNK		65536

static size_t sodopendfree(struct socket *);
static size_t sodopendfreel(struct socket *);
static __inline vsize_t sokvareserve(struct socket *, vsize_t);
static __inline void sokvaunreserve(vsize_t);

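/*
 * Loan KVA accounting: socurkva tracks how much of kernel_map is
 * currently mapped for page loans, bounded by somaxkva.
 *
 *	sokvareserve(so, len)	sleep until "len" bytes of loan KVA are
 *				available, then debit socurkva
 *	sokvaunreserve(len)	credit socurkva and wake any waiters
 *
 * sokvaalloc()/sokvafree() wrap these around the actual kernel_map
 * allocation and are used by the sosend_loan() path below.
 */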
static __inline vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
	int s;
	int error;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	while (socurkva + len > somaxkva) {
		size_t freed;

		/*
		 * try to do pendfree.
		 */

		freed = sodopendfreel(so);

		/*
		 * if some kva was freed, try again.
		 */

		if (freed)
			continue;

		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		sokvawaiters++;
		error = ltsleep(&socurkva, PVM | PCATCH, "sokva", 0,
		    &so_pendfree_slock);
		sokvawaiters--;
		if (error) {
			len = 0;
			break;
		}
	}
	socurkva += len;
	simple_unlock(&so_pendfree_slock);
	splx(s);
	return len;
}

static __inline void
sokvaunreserve(vsize_t len)
{
	int s;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	socurkva -= len;
	if (sokvawaiters)
		wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);
}

/*
 * sokvaalloc: allocate kva for loan.
 */

vaddr_t
sokvaalloc(vsize_t len, struct socket *so)
{
	vaddr_t lva;

	/*
	 * reserve kva.
	 */

	if (sokvareserve(so, len) == 0)
		return 0;

	/*
	 * allocate kva.
	 */

	lva = uvm_km_valloc_wait(kernel_map, len);
	if (lva == 0) {
		sokvaunreserve(len);
		return (0);
	}

	return lva;
}

/*
 * sokvafree: free kva for loan.
 */

void
sokvafree(vaddr_t sva, vsize_t len)
{

	/*
	 * free kva.
	 */

	uvm_km_free(kernel_map, sva, len);

	/*
	 * unreserve kva.
	 */

	sokvaunreserve(len);
}

static void
sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
{
	vaddr_t va, sva, eva;
	vsize_t len;
	paddr_t pa;
	int i, npgs;

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	if (__predict_false(pgs == NULL)) {
		pgs = alloca(npgs * sizeof(*pgs));

		for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
			if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
				panic("sodoloanfree: va 0x%lx not mapped", va);
			pgs[i] = PHYS_TO_VM_PAGE(pa);
		}
	}

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	sokvafree(sva, len);
}

static size_t
sodopendfree(struct socket *so)
{
	int s;
	size_t rv;

	s = splvm();
	simple_lock(&so_pendfree_slock);
	rv = sodopendfreel(so);
	simple_unlock(&so_pendfree_slock);
	splx(s);

	return rv;
}

/*
 * sodopendfreel: free mbufs on "pendfree" list.
 * unlock and relock so_pendfree_slock when freeing mbufs.
 *
 * => called with so_pendfree_slock held.
 * => called at splvm.
 */

static size_t
sodopendfreel(struct socket *so)
{
	size_t rv = 0;

	LOCK_ASSERT(simple_lock_held(&so_pendfree_slock));

	for (;;) {
		struct mbuf *m;
		struct mbuf *next;

		m = so_pendfree;
		if (m == NULL)
			break;
		so_pendfree = NULL;
		simple_unlock(&so_pendfree_slock);
		/* XXX splx */

		for (; m != NULL; m = next) {
			next = m->m_next;

			rv += m->m_ext.ext_size;
			sodoloanfree((m->m_flags & M_EXT_PAGES) ?
			    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
			    m->m_ext.ext_size);
			pool_cache_put(&mbpool_cache, m);
		}

		/* XXX splvm */
		simple_lock(&so_pendfree_slock);
	}

	return (rv);
}

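/*
 * soloanfree: ext_free callback for mbufs whose external storage was
 * set up by sosend_loan().  May be invoked in interrupt context, so
 * the mbuf is queued on so_pendfree and the unmap/unloan work is
 * deferred to sodopendfree().
 */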
void
soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
{
	int s;

	if (m == NULL) {

		/*
		 * called from MEXTREMOVE.
		 */

		sodoloanfree(NULL, buf, size);
		return;
	}

	/*
	 * postpone freeing mbuf.
	 *
	 * we can't do it in interrupt context
	 * because we need to put kva back to kernel_map.
	 */

	s = splvm();
	simple_lock(&so_pendfree_slock);
	m->m_next = so_pendfree;
	so_pendfree = m;
	if (sokvawaiters)
		wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);
}

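/*
 * sosend_loan: zero-copy variant of the sosend() data copy.  Loan the
 * user's pages backing the current iovec into kernel KVA and attach
 * them to mbuf "m" as read-only external storage.  Returns the number
 * of bytes loaned, or 0 if the loan cannot be done (the caller then
 * falls back to copying).
 */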
static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	vaddr_t lva, va;
	int npgs, i, error;

	if (uio->uio_segflg != UIO_USERSPACE)
		return (0);

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	/* XXX KDASSERT */
	KASSERT(npgs <= M_EXT_MAXPAGES);
	KASSERT(uio->uio_procp != NULL);

	lva = sokvaalloc(len, so);
	if (lva == 0)
		return 0;

	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
	if (error) {
		sokvafree(lva, len);
		return (0);
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
		    VM_PROT_READ);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (space);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct proc *p)
{
	const struct protosw	*prp;
	struct socket	*so;
	int		error, s;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	memset((caddr_t)so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
#ifdef MBUFTRACE
	so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	else
		so->so_uid = UID_MAX;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	splx(s);
	return (0);
}

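/*
 * sofree: drop a socket that has no remaining file reference and no
 * pcb.  Releases any buffered data and returns the socket to
 * socket_pool.  A no-op if either condition does not hold yet.
 */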
void
sofree(struct socket *so)
{

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_uid, &so->so_rcv.sb_hiwat, 0,
		    RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_uid, &so->so_snd.sb_hiwat, 0,
		    RLIM_INFINITY);
	sbrelease(&so->so_snd, so);
	sorflush(so);
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket	*so2;
	int		s, error;

	error = 0;
	s = splsoftnet();		/* conservative */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, netcls,
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct proc *)0);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}

/*
 * Must be called at splsoftnet...
 */
int
soabort(struct socket *so)
{

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int	s, error;

	error = 0;
	s = splsoftnet();
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
	else
		error = ECONNABORTED;

	splx(s);
	return (error);
}

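/*
 * soconnect: initiate a connection to "nam".  Connection-based
 * protocols may connect only once; datagram sockets may re-connect,
 * and connecting to a null address disconnects.
 */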
int
soconnect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int	s, error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int	s, error;

	s = splsoftnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
	    (struct proc *)0);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int	s, error;

	s = splsoftnet();
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct proc *)0);
 bad:
	splx(s);
	sodopendfree(so);
	return (error);
}

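/*
 * SBLOCKWAIT: translate the caller's MSG_DONTWAIT flag into the
 * wait/nowait argument expected by sblock().
 */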
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags, struct proc *p)
{
	struct mbuf	**mp, *m;
	long		space, len, resid, clen, mlen;
	int		error, s, dontroute, atomic;

	sodopendfree(so);

	clen = 0;
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				MCLAIM(m, so->so_snd.sb_mowner);
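				/*
				 * Try the zero-copy page-loan path first
				 * when both the user's iovec and the
				 * available buffer space are large enough.
				 */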
				if (use_sosend_loan &&
				    uio->uio_iov->iov_len >= SOCK_LOAN_THRESH &&
				    space >= SOCK_LOAN_THRESH &&
				    (len = sosend_loan(so, uio, m,
				     space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					m_clget(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
 have_data:
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, p);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct proc * p;
	struct mbuf *m, **mp;
	int flags, len, error, s, offset, moff, type, orig_resid;
	const struct protosw *pr;
	struct mbuf *nextrecord;
	int mbuf_removed = 0;

	pr = so->so_proto;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;
	p = uio->uio_procp;

	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if ((flags & MSG_DONTWAIT) == 0)
		sodopendfree(so);

	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK),
		    (struct mbuf *)0, p);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, p);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * While we process the initial mbufs containing address and control
	 * info, we save a copy of m->m_nextpkt into nextrecord.
	 */
	if (p)
		p->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (controlp) {
				struct domain *dom = pr->pr_domain;
				if (dom->dom_externalize && p &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*dom->dom_externalize)(m, p);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				/*
				 * Dispose of any SCM_RIGHTS message that went
				 * through the read path rather than recv.
				 */
				if (pr->pr_domain->dom_dispose &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					(*pr->pr_domain->dom_dispose)(m);
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

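	/*
	 * The address and any control mbufs have now been consumed or
	 * copied; m points at the first data mbuf of the record, if any.
	 */
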
	/*
	 * If m is non-NULL, we have some data to read.  From now on,
	 * make sure to keep sb_lastrecord consistent when working on
	 * the last packet on the chain (nextrecord == NULL) and we
	 * change m->m_nextpkt.
	 */
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m);
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
			if (error) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed
				    && (pr->pr_flags & PR_ATOMIC))
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
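		/*
		 * Track progress toward the out-of-band mark and stop
		 * at the mark; SS_RCVATMARK lets SIOCATMARK report it.
		 */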
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD,
				    (struct mbuf *)0,
				    (struct mbuf *)(long)flags,
				    (struct mbuf *)0, p);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

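	/*
	 * A partially consumed record on an atomic protocol means the
	 * caller's buffer was too small: flag truncation and (unless
	 * peeking) discard the remainder of the record.
	 */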
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0, p);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	const struct protosw	*pr;

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR)
		sorflush(so);
	if (how == SHUT_WR || how == SHUT_RDWR)
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	return (0);
}

void
sorflush(struct socket *so)
{
	struct sockbuf	*sb, asb;
	const struct protosw	*pr;
	int		s;

	sb = &so->so_rcv;
	pr = so->so_proto;
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
}

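/*
 * sosetopt: set a socket option.  Socket-level (SOL_SOCKET) options
 * are handled here; anything else is passed through to the protocol
 * via pr_ctloutput.
 */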
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int		error;
	struct mbuf	*m;

	error = 0;
	m = m0;
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof(struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval, so) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			int val;

			if (m == NULL || m->m_len < sizeof(*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec > (INT_MAX - tv->tv_usec / tick) / hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;
			if (val == 0 && tv->tv_usec != 0)
				val = 1;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
 bad:
	if (m)
		(void) m_free(m);
	return (error);
}

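/*
 * sogetopt: fetch a socket option; the mirror image of sosetopt.
 * The result is returned to the caller in a freshly allocated mbuf
 * via *mp.
 */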
int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf	*m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof(struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		case SO_OVERFLOWED:
			*mtod(m, int *) = so->so_rcv.sb_overflowed;
			break;

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selwakeup(&so->so_rcv.sb_sel);
}

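/*
 * Socket kqueue filters.  soo_kqfilter() below attaches EVFILT_READ
 * to filt_soread (or filt_solisten for listening sockets) and
 * EVFILT_WRITE to filt_sowrite.
 */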
static void
filt_sordetach(struct knote *kn)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
		so->so_snd.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;

	/*
	 * Set kn_data to number of incoming connections, not
	 * counting partial (incomplete) connections.
	 */
	kn->kn_data = so->so_qlen;
	return (kn->kn_data > 0);
}

static const struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static const struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static const struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket	*so;
	struct sockbuf	*sb;

	so = (struct socket *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}
	SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	return (0);
}

#include <sys/sysctl.h>

static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);

/*
 * sysctl helper routine for kern.somaxkva.  ensures that the given
 * value is not too small.
 * (XXX should we maybe make sure it's not too large as well?)
 */
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
	int error, new_somaxkva;
	struct sysctlnode node;
	int s;

	new_somaxkva = somaxkva;
	node = *rnode;
	node.sysctl_data = &new_somaxkva;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
		return (EINVAL);

	s = splvm();
	simple_lock(&so_pendfree_slock);
	somaxkva = new_somaxkva;
	wakeup(&socurkva);
	simple_unlock(&so_pendfree_slock);
	splx(s);

	return (error);
}

SYSCTL_SETUP(sysctl_kern_somaxkva_setup, "sysctl kern.somaxkva setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "kern", NULL,
		       NULL, 0, NULL, 0,
		       CTL_KERN, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "somaxkva",
		       SYSCTL_DESCR("Maximum amount of kernel memory to be "
				    "used for socket buffers"),
		       sysctl_kern_somaxkva, 0, NULL, 0,
		       CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
}
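
/*
 * Example (illustrative): the loan KVA limit can be raised at run
 * time, subject to the 16MB floor enforced above, e.g.
 *
 *	sysctl -w kern.somaxkva=33554432
 */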