/*	$NetBSD: uipc_socket.c,v 1.64 2002/05/02 17:55:51 thorpej Exp $	*/

/*-
 * Copyright (c) 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.64 2002/05/02 17:55:51 thorpej Exp $");

#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/pool.h>

#include <uvm/uvm.h>

struct pool	socket_pool;

extern int	somaxconn;		/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)		(ev)->ev_count++

#else

#define	SOSEND_COUNTER_INCR(ev)		/* nothing */

#endif /* SOSEND_COUNTERS */

void
soinit(void)
{

	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0,
	    "sockpl", NULL);

#ifdef SOSEND_COUNTERS
	evcnt_attach_static(&sosend_loan_big);
	evcnt_attach_static(&sosend_copy_big);
	evcnt_attach_static(&sosend_copy_small);
	evcnt_attach_static(&sosend_kvalimit);
#endif /* SOSEND_COUNTERS */
}
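/*
 * Note (editorial, not from the original sources): soinit() must run
 * once during kernel bootstrap, before the first socreate(), since all
 * sockets are allocated from socket_pool.  With "options SOSEND_COUNTERS"
 * the event counters attached above can be inspected from userland via
 * vmstat(8)'s event-counter display.
 */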
#ifdef SOSEND_LOAN

struct mbuf *so_pendfree;

int somaxkva = 16 * 1024 * 1024;
int socurkva;
int sokvawaiters;

#define	SOCK_LOAN_THRESH	4096
#define	SOCK_LOAN_CHUNK		65536

static void
sodoloanfree(caddr_t buf, u_int size)
{
	struct vm_page **pgs;
	vaddr_t va, sva, eva;
	vsize_t len;
	paddr_t pa;
	int i, npgs;

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	pgs = alloca(npgs * sizeof(*pgs));

	for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
		if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
			panic("sodoloanfree: va 0x%lx not mapped", va);
		pgs[i] = PHYS_TO_VM_PAGE(pa);
	}

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	uvm_km_free(kernel_map, sva, len);
	socurkva -= len;
	if (sokvawaiters)
		wakeup(&socurkva);
}

static size_t
sodopendfree(struct socket *so)
{
	struct mbuf *m;
	size_t rv = 0;
	int s;

	s = splvm();

	for (;;) {
		m = so_pendfree;
		if (m == NULL)
			break;
		so_pendfree = m->m_next;
		splx(s);

		rv += m->m_ext.ext_size;
		sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size);
		s = splvm();
		pool_cache_put(&mbpool_cache, m);
	}

	for (;;) {
		m = so->so_pendfree;
		if (m == NULL)
			break;
		so->so_pendfree = m->m_next;
		splx(s);

		rv += m->m_ext.ext_size;
		sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size);
		s = splvm();
		pool_cache_put(&mbpool_cache, m);
	}

	splx(s);
	return (rv);
}

static void
soloanfree(struct mbuf *m, caddr_t buf, u_int size, void *arg)
{
	struct socket *so = arg;
	int s;

	if (m == NULL) {
		sodoloanfree(buf, size);
		return;
	}

	s = splvm();
	m->m_next = so->so_pendfree;
	so->so_pendfree = m;
	splx(s);
	if (sokvawaiters)
		wakeup(&socurkva);
}

static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	struct vm_page **pgs;
	vaddr_t lva, va;
	int npgs, s, i, error;

	if (uio->uio_segflg != UIO_USERSPACE)
		return (0);

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	while (socurkva + len > somaxkva) {
		if (sodopendfree(so))
			continue;
		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		s = splvm();
		sokvawaiters++;
		(void) tsleep(&socurkva, PVM, "sokva", 0);
		sokvawaiters--;
		splx(s);
	}

	lva = uvm_km_valloc_wait(kernel_map, len);
	if (lva == 0)
		return (0);
	socurkva += len;

	pgs = alloca(npgs * sizeof(*pgs));

	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
	    pgs, UVM_LOAN_TOPAGE);
	if (error) {
		uvm_km_free(kernel_map, lva, len);
		socurkva -= len;
		return (0);
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (space);
}

#endif /* SOSEND_LOAN */
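/*
 * Overview of the zero-copy send path above (a descriptive sketch;
 * everything named here is defined in this file):
 *
 * For a sufficiently large user write (>= SOCK_LOAN_THRESH bytes),
 * sosend() calls sosend_loan() instead of copying the data:
 *
 *	uvm_loan(&p->p_vmspace->vm_map, sva, len, pgs, UVM_LOAN_TOPAGE)
 *		loans the user's pages to the kernel; each page is then
 *		entered read-only into kernel_map with pmap_kenter_pa();
 *	MEXTADD(m, lva, space, M_MBUF, soloanfree, so)
 *		attaches the loaned region as external mbuf storage.
 *
 * When the protocol eventually frees such an mbuf, soloanfree() only
 * queues it on so->so_pendfree (it may run at interrupt time).  The
 * expensive teardown -- pmap_kremove(), uvm_unloan(), uvm_km_free() --
 * happens later in sodopendfree(), which is invoked from sosend(),
 * soreceive() and sodisconnect(), and from sosend_loan() itself when
 * socurkva would exceed the somaxkva limit.
 */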
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p;
	struct protosw *prp;
	struct socket *so;
	int error, s;

	p = curproc;		/* XXX */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	memset((caddr_t)so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	splx(s);
	return (0);
}

void
sofree(struct socket *so)
{
#ifdef SOSEND_LOAN
	struct mbuf *m;
#endif

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
	sbrelease(&so->so_snd);
	sorflush(so);
#ifdef SOSEND_LOAN
	while ((m = so->so_pendfree) != NULL) {
		so->so_pendfree = m->m_next;
		m->m_next = so_pendfree;
		so_pendfree = m;
	}
#endif
	pool_put(&socket_pool, so);
}
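/*
 * Illustrative caller sequence (a sketch only; the real callers are the
 * socket(2), bind(2) and listen(2) system calls in uipc_syscalls.c, and
 * "nam" stands for a sockaddr packaged in an mbuf):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0);
 *	if (error == 0)
 *		error = sobind(so, nam, curproc);
 *	if (error == 0)
 *		error = solisten(so, 5);
 *
 * solisten() clamps the requested backlog to somaxconn before storing
 * it in so_qlimit.
 */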
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket *so2;
	int s, error;

	error = 0;
	s = splsoftnet();		/* conservative */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, netcls,
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct proc *)0);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}

/*
 * Must be called at splsoftnet...
 */
int
soabort(struct socket *so)
{

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int s, error;

	error = 0;
	s = splsoftnet();
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
	else
		error = ECONNABORTED;

	splx(s);
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	struct proc *p;
	int s, error;

	p = curproc;		/* XXX */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s, error;

	s = splsoftnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
	    (struct proc *)0);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int s, error;

	s = splsoftnet();
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct proc *)0);
 bad:
	splx(s);
#ifdef SOSEND_LOAN
	sodopendfree(so);
#endif
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{
	struct proc *p;
	struct mbuf **mp, *m;
	long space, len, resid, clen, mlen;
	int error, s, dontroute, atomic;

#ifdef SOSEND_LOAN
	sodopendfree(so);
#endif

	p = curproc;		/* XXX */
	clen = 0;
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					mlen = MLEN;
				}
#ifdef SOSEND_LOAN
				if (uio->uio_iov->iov_len >= SOCK_LOAN_THRESH &&
				    space >= SOCK_LOAN_THRESH &&
				    (len = sosend_loan(so, uio, m,
				     space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
#endif /* SOSEND_LOAN */
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					MCLGET(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
#ifdef SOSEND_LOAN
 have_data:
#endif
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, p);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
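/*
 * Usage note (a sketch; see the sendto(2)/sendmsg(2) paths in
 * uipc_syscalls.c for the real callers): sosend() is normally reached
 * indirectly through the so_send hook installed by socreate():
 *
 *	error = (*so->so_send)(so, addr, &auio, NULL, control, flags);
 *
 * As the comment above sosend() warns, a caller that gets back
 * EINTR/ERESTART must inspect the uio residual for a partial transfer
 * before deciding how much data was actually queued.
 */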
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, len, error, s, offset, moff, type, orig_resid;
	struct protosw *pr;
	struct mbuf *nextrecord;

#ifdef SOSEND_LOAN
	sodopendfree(so);
#endif

	pr = so->so_proto;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0,
		    (struct proc *)0);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
#ifdef notyet /* XXXX */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
#endif
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0,
			    (struct proc *)0);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr;

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR)
		sorflush(so);
	if (how == SHUT_WR || how == SHUT_RDWR)
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	return (0);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb, asb;
	struct protosw *pr;
	int s;

	sb = &so->so_rcv;
	pr = so->so_proto;
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	memset((caddr_t)sb, 0, sizeof(*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}
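/*
 * Illustrative kernel-internal use of sosetopt() below (a sketch only;
 * in the real setsockopt(2) path the value mbuf is built from the user
 * buffer in uipc_syscalls.c).  Option values travel in small mbufs
 * rather than user buffers, and sosetopt() consumes the mbuf whether
 * it succeeds or fails:
 *
 *	struct mbuf *m;
 *	int error;
 *
 *	m = m_get(M_WAIT, MT_SOOPTS);
 *	m->m_len = sizeof(int);
 *	*mtod(m, int *) = 64 * 1024;
 *	error = sosetopt(so, SOL_SOCKET, SO_SNDBUF, m);
 */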
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int error;
	struct mbuf *m;

	error = 0;
	m = m0;
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof(struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof(*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
 bad:
	if (m)
		(void) m_free(m);
	return (error);
}
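/*
 * Note on sogetopt() below: on success it hands the caller a freshly
 * allocated MT_SOOPTS mbuf holding the option value via *mp; the
 * caller is responsible for m_free()ing it once the value has been
 * copied out.
 */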
int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof(struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	struct proc *p;

	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
		psignal(p, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
}
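/*
 * Note on sohasoutofband() above (descriptive only): protocols call it
 * when urgent data arrives.  It delivers SIGURG to the process group or
 * process recorded in so_pgid (normally established with FIOSETOWN or
 * SIOCSPGRP) and wakes any select(2)ers on the receive buffer via
 * selwakeup().
 */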