/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket.c    8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 */

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>                   /* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <vm/vm_zone.h>
#include <vm/pmap.h>
#include <net/netmsg2.h>
#include <net/netisr2.h>

#include <sys/thread2.h>
#include <sys/socketvar2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

#ifdef INET
extern int tcp_sosend_agglim;
extern int tcp_sosend_async;
extern int tcp_sosend_jcluster;
extern int udp_sosend_async;
extern int udp_sosend_prepend;

static int       do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif /* INET */

static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static int      filt_solisten(struct knote *kn, long hint);

static int      soclose_sync(struct socket *so, int fflag);
static void     soclose_fast(struct socket *so);

static struct filterops solisten_filtops =
        { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
        { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
        { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
static struct filterops soexcept_filtops =
        { FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };

MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");

static int use_soclose_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
    &use_soclose_fast, 0, "Fast socket close");

int use_soaccept_pred_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
    &use_soaccept_pred_fast, 0, "Fast socket accept prediction");

int use_sendfile_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
    &use_sendfile_async, 0, "sendfile uses asynchronous pru_send");

int use_soconnect_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
    &use_soconnect_async, 0, "soconnect uses asynchronous pru_connect");

/*
 * Socket operation routines.
 * These routines are called by the routines in sys_socket.c or from a
 * system process, and implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure and initialize it.
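 * The socket is returned referenced (so_refs = 1) and in the SS_NOFDREF
 * state; socreate() clears SS_NOFDREF once the socket is ready to be
 * attached to a file descriptor.
 *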
 * Note that it would probably be better to allocate the socket
 * and the PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, struct protosw *pr)
{
        struct socket *so;
        unsigned waitmask;

        waitmask = waitok ? M_WAITOK : M_NOWAIT;
        so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
        if (so) {
                /* XXX race condition for reentrant kernel */
                so->so_proto = pr;
                TAILQ_INIT(&so->so_aiojobq);
                TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
                TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
                lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
                lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
                spin_init(&so->so_rcvd_spin, "soalloc");
                netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
                            MSGF_DROPABLE | MSGF_PRIORITY,
                            so->so_proto->pr_usrreqs->pru_rcvd);
                so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
                so->so_state = SS_NOFDREF;
                so->so_refs = 1;
        }
        return so;
}

int
socreate(int dom, struct socket **aso, int type,
        int proto, struct thread *td)
{
        struct proc *p = td->td_proc;
        struct protosw *prp;
        struct socket *so;
        struct pru_attach_info ai;
        int error;

        if (proto)
                prp = pffindproto(dom, proto, type);
        else
                prp = pffindtype(dom, type);

        if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
                return (EPROTONOSUPPORT);

        if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
            prp->pr_domain->dom_family != PF_LOCAL &&
            prp->pr_domain->dom_family != PF_INET &&
            prp->pr_domain->dom_family != PF_INET6 &&
            prp->pr_domain->dom_family != PF_ROUTE) {
                return (EPROTONOSUPPORT);
        }

        if (prp->pr_type != type)
                return (EPROTOTYPE);
        so = soalloc(p != NULL, prp);
        if (so == NULL)
                return (ENOBUFS);

        /*
         * Callers of socreate() presumably will connect up a descriptor
         * and call soclose() if they cannot.  This represents our so_refs
         * (which should be 1) from soalloc().
         */
        soclrstate(so, SS_NOFDREF);

        /*
         * Set a default port for protocol processing.  No action will occur
         * on the socket on this port until an inpcb is attached to it and
         * is able to match incoming packets, or until the socket becomes
         * available to userland.
         *
         * We normally default the socket to the protocol thread on cpu 0
         * if the protocol does not provide its own method to initialize
         * the default port.
         *
         * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
         * thread and all pr_*()/pru_*() calls are executed synchronously.
         */
        if (prp->pr_flags & PR_SYNC_PORT)
                so->so_port = &netisr_sync_port;
        else if (prp->pr_initport != NULL)
                so->so_port = prp->pr_initport();
        else
                so->so_port = netisr_cpuport(0);

        TAILQ_INIT(&so->so_incomp);
        TAILQ_INIT(&so->so_comp);
        so->so_type = type;
        so->so_cred = crhold(p->p_ucred);
        ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
        ai.p_ucred = p->p_ucred;
        ai.fd_rdir = p->p_fd->fd_rdir;

        /*
         * Auto-sizing of socket buffers is managed by the protocols and
         * the appropriate flags must be set in the pru_attach function.
         */
        error = so_pru_attach(so, proto, &ai);
        if (error) {
                sosetstate(so, SS_NOFDREF);
                sofree(so);     /* from soalloc */
                return error;
        }

        /*
         * NOTE: Returns referenced socket.
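         *
         * A minimal caller sketch (illustrative only; the real socket(2)
         * path in uipc_syscalls.c also allocates and installs the file
         * descriptor):
         *
         *      error = socreate(AF_INET, &so, SOCK_STREAM, 0, td);
         *      if (error)
         *              return (error);
         *      ...
         *      soclose(so, 0);         releases the reference on failure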
         */
        *aso = so;
        return (0);
}

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
        int error;

        error = so_pru_bind(so, nam, td);
        return (error);
}

static void
sodealloc(struct socket *so)
{
        if (so->so_rcv.ssb_hiwat)
                (void)chgsbsize(so->so_cred->cr_uidinfo,
                    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
        if (so->so_snd.ssb_hiwat)
                (void)chgsbsize(so->so_cred->cr_uidinfo,
                    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
        /* remove accept filter if present */
        if (so->so_accf != NULL)
                do_setopt_accept_filter(so, NULL);
#endif /* INET */
        crfree(so->so_cred);
        if (so->so_faddr != NULL)
                kfree(so->so_faddr, M_SONAME);
        kfree(so, M_SOCKET);
}

int
solisten(struct socket *so, int backlog, struct thread *td)
{
        if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
                return (EINVAL);

        lwkt_gettoken(&so->so_rcv.ssb_token);
        if (TAILQ_EMPTY(&so->so_comp))
                so->so_options |= SO_ACCEPTCONN;
        lwkt_reltoken(&so->so_rcv.ssb_token);
        if (backlog < 0 || backlog > somaxconn)
                backlog = somaxconn;
        so->so_qlimit = backlog;
        return so_pru_listen(so, td);
}

/*
 * Destroy a disconnected socket.  This routine is a NOP if entities
 * still have a reference on the socket:
 *
 *      so_pcb -        The protocol stack still has a reference
 *      SS_NOFDREF -    There is no longer a file pointer reference
 */
void
sofree(struct socket *so)
{
        struct socket *head;

        /*
         * This is a bit hackish at the moment.  We need to interlock
         * any accept queue we are on before we potentially lose the
         * last reference to avoid races against a re-reference from
         * someone operating on the queue.
         */
        while ((head = so->so_head) != NULL) {
                lwkt_getpooltoken(head);
                if (so->so_head == head)
                        break;
                lwkt_relpooltoken(head);
        }

        /*
         * Arbitrage the last free.
         */
        KKASSERT(so->so_refs > 0);
        if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
                if (head)
                        lwkt_relpooltoken(head);
                return;
        }

        KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
        KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);

        /*
         * We're done, remove ourselves from the accept queue we are
         * on, if we are on one.
         */
        if (head != NULL) {
                if (so->so_state & SS_INCOMP) {
                        TAILQ_REMOVE(&head->so_incomp, so, so_list);
                        head->so_incqlen--;
                } else if (so->so_state & SS_COMP) {
                        /*
                         * We must not decommission a socket that's
                         * on the accept(2) queue.  If we do, then
                         * accept(2) may hang after select(2) indicated
                         * that the listening socket was ready.
                         */
                        lwkt_relpooltoken(head);
                        return;
                } else {
                        panic("sofree: not queued");
                }
                soclrstate(so, SS_INCOMP);
                so->so_head = NULL;
                lwkt_relpooltoken(head);
        }
        ssb_release(&so->so_snd, so);
        sorflush(so);
        sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
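 *
 * The close is performed synchronously (soclose_sync()) if fast close is
 * disabled, if the protocol uses a synchronous port (PR_SYNC_PORT), or if
 * the socket is connected with SO_LINGER set; otherwise the teardown is
 * handed to the protocol thread via soclose_fast().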
 */
int
soclose(struct socket *so, int fflag)
{
        int error;

        funsetown(&so->so_sigio);
        sosetstate(so, SS_ISCLOSING);
        if (!use_soclose_fast ||
            (so->so_proto->pr_flags & PR_SYNC_PORT) ||
            ((so->so_state & SS_ISCONNECTED) &&
             (so->so_options & SO_LINGER))) {
                error = soclose_sync(so, fflag);
        } else {
                soclose_fast(so);
                error = 0;
        }
        return error;
}

void
sodiscard(struct socket *so)
{
        lwkt_getpooltoken(so);
        if (so->so_options & SO_ACCEPTCONN) {
                struct socket *sp;

                while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
                        TAILQ_REMOVE(&so->so_incomp, sp, so_list);
                        so->so_incqlen--;
                        soclrstate(sp, SS_INCOMP);
                        soabort_async(sp, TRUE);
                }
                while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
                        TAILQ_REMOVE(&so->so_comp, sp, so_list);
                        so->so_qlen--;
                        soclrstate(sp, SS_COMP);
                        soabort_async(sp, TRUE);
                }
        }
        lwkt_relpooltoken(so);

        if (so->so_state & SS_NOFDREF)
                panic("soclose: NOFDREF");
        sosetstate(so, SS_NOFDREF);     /* take ref */
}

void
soinherit(struct socket *so, struct socket *so_inh)
{
        TAILQ_HEAD(, socket) comp, incomp;
        struct socket *sp;
        int qlen, incqlen;

        KASSERT(so->so_options & SO_ACCEPTCONN,
            ("so does not accept connections"));
        KASSERT(so_inh->so_options & SO_ACCEPTCONN,
            ("so_inh does not accept connections"));

        TAILQ_INIT(&comp);
        TAILQ_INIT(&incomp);

        lwkt_getpooltoken(so);
        lwkt_getpooltoken(so_inh);

        /*
         * Save the completed queue and the incomplete queue.
         */
        TAILQ_CONCAT(&comp, &so->so_comp, so_list);
        qlen = so->so_qlen;
        so->so_qlen = 0;

        TAILQ_CONCAT(&incomp, &so->so_incomp, so_list);
        incqlen = so->so_incqlen;
        so->so_incqlen = 0;

        /*
         * Append the saved completed queue and incomplete queue to the
         * socket that inherits them.
         *
         * XXX
         * This may temporarily break the inheriting socket's
         * so_qlimit.
         */
        TAILQ_FOREACH(sp, &comp, so_list) {
                sp->so_head = so_inh;
                crfree(sp->so_cred);
                sp->so_cred = crhold(so_inh->so_cred);
        }

        TAILQ_FOREACH(sp, &incomp, so_list) {
                sp->so_head = so_inh;
                crfree(sp->so_cred);
                sp->so_cred = crhold(so_inh->so_cred);
        }

        TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list);
        so_inh->so_qlen += qlen;

        TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list);
        so_inh->so_incqlen += incqlen;

        lwkt_relpooltoken(so_inh);
        lwkt_relpooltoken(so);

        if (qlen) {
                /*
                 * "New" connections have arrived
                 */
                sorwakeup(so_inh);
                wakeup(&so_inh->so_timeo);
        }
}

static int
soclose_sync(struct socket *so, int fflag)
{
        int error = 0;

        if (so->so_pcb == NULL)
                goto discard;
        if (so->so_state & SS_ISCONNECTED) {
                if ((so->so_state & SS_ISDISCONNECTING) == 0) {
                        error = sodisconnect(so);
                        if (error)
                                goto drop;
                }
                if (so->so_options & SO_LINGER) {
                        if ((so->so_state & SS_ISDISCONNECTING) &&
                            (fflag & FNONBLOCK))
                                goto drop;
                        while (so->so_state & SS_ISCONNECTED) {
                                error = tsleep(&so->so_timeo, PCATCH,
                                               "soclos", so->so_linger * hz);
                                if (error)
                                        break;
                        }
                }
        }
drop:
        if (so->so_pcb) {
                int error2;

                error2 = so_pru_detach(so);
                if (error2 == EJUSTRETURN) {
                        /*
                         * Protocol will call sodiscard()
                         * and sofree() for us.
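                         * (EJUSTRETURN is the protocol's way of saying it
                         * has taken ownership of the remaining teardown.)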
                         */
                        return error;
                }
                if (error == 0)
                        error = error2;
        }
discard:
        sodiscard(so);
        so_pru_sync(so);        /* unpend async sending */
        sofree(so);             /* dispose of ref */

        return (error);
}

static void
soclose_sofree_async_handler(netmsg_t msg)
{
        sofree(msg->base.nm_so);
}

static void
soclose_sofree_async(struct socket *so)
{
        struct netmsg_base *base = &so->so_clomsg;

        netmsg_init(base, so, &netisr_apanic_rport, 0,
            soclose_sofree_async_handler);
        lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_disconn_async_handler(netmsg_t msg)
{
        struct socket *so = msg->base.nm_so;

        if ((so->so_state & SS_ISCONNECTED) &&
            (so->so_state & SS_ISDISCONNECTING) == 0)
                so_pru_disconnect_direct(so);

        if (so->so_pcb) {
                int error;

                error = so_pru_detach_direct(so);
                if (error == EJUSTRETURN) {
                        /*
                         * Protocol will call sodiscard()
                         * and sofree() for us.
                         */
                        return;
                }
        }

        sodiscard(so);
        sofree(so);
}

static void
soclose_disconn_async(struct socket *so)
{
        struct netmsg_base *base = &so->so_clomsg;

        netmsg_init(base, so, &netisr_apanic_rport, 0,
            soclose_disconn_async_handler);
        lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_detach_async_handler(netmsg_t msg)
{
        struct socket *so = msg->base.nm_so;

        if (so->so_pcb) {
                int error;

                error = so_pru_detach_direct(so);
                if (error == EJUSTRETURN) {
                        /*
                         * Protocol will call sodiscard()
                         * and sofree() for us.
                         */
                        return;
                }
        }

        sodiscard(so);
        sofree(so);
}

static void
soclose_detach_async(struct socket *so)
{
        struct netmsg_base *base = &so->so_clomsg;

        netmsg_init(base, so, &netisr_apanic_rport, 0,
            soclose_detach_async_handler);
        lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_fast(struct socket *so)
{
        if (so->so_pcb == NULL)
                goto discard;

        if ((so->so_state & SS_ISCONNECTED) &&
            (so->so_state & SS_ISDISCONNECTING) == 0) {
                soclose_disconn_async(so);
                return;
        }

        if (so->so_pcb) {
                soclose_detach_async(so);
                return;
        }

discard:
        sodiscard(so);
        soclose_sofree_async(so);
}

/*
 * Abort and destroy a socket.  Only one abort can be in progress
 * at any given moment.
 */
void
soabort_async(struct socket *so, boolean_t clr_head)
{
        /*
         * Keep a reference before clearing the so_head
         * to avoid racing socket close in netisr.
         */
        soreference(so);
        if (clr_head)
                so->so_head = NULL;
        so_pru_abort_async(so);
}

void
soabort_oncpu(struct socket *so)
{
        soreference(so);
        so_pru_abort_direct(so);
}

/*
 * so is passed in ref'd; the reference becomes owned by
 * the cleared SS_NOFDREF flag.
 */
void
soaccept_generic(struct socket *so)
{
        if ((so->so_state & SS_NOFDREF) == 0)
                panic("soaccept: !NOFDREF");
        soclrstate(so, SS_NOFDREF);     /* owned by lack of SS_NOFDREF */
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
        int error;

        soaccept_generic(so);
        error = so_pru_accept(so, nam);
        return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
          boolean_t sync)
{
        int error;

        if (so->so_options & SO_ACCEPTCONN)
                return (EOPNOTSUPP);
        /*
         * If the protocol is connection-based, we can only connect once.
         * Otherwise, if connected, try to disconnect first.  This allows
         * the user to disconnect by connecting to, e.g., a null address.
         */
        if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
            ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
             (error = sodisconnect(so)))) {
                error = EISCONN;
        } else {
                /*
                 * Prevent an accumulated error from a previous connection
                 * from biting us.
                 */
                so->so_error = 0;
                if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
                        error = so_pru_connect_async(so, nam, td);
                else
                        error = so_pru_connect(so, nam, td);
        }
        return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
        int error;

        error = so_pru_connect2(so1, so2);
        return (error);
}

int
sodisconnect(struct socket *so)
{
        int error;

        if ((so->so_state & SS_ISCONNECTED) == 0) {
                error = ENOTCONN;
                goto bad;
        }
        if (so->so_state & SS_ISDISCONNECTING) {
                error = EALREADY;
                goto bad;
        }
        error = so_pru_disconnect(so);
bad:
        return (error);
}

#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If the send must go all at once and the message is larger than the
 * send buffering, then hard error.
 * Lock against other senders.
 * If the send must go all at once and there is not enough room now, then
 * inform the user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if non-NULL, otherwise by the
 * mbuf chain "top" (which must be NULL if uio is not).  Data provided in
 * an mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
       struct mbuf *top, struct mbuf *control, int flags,
       struct thread *td)
{
        struct mbuf **mp;
        struct mbuf *m;
        size_t resid;
        int space, len;
        int clen = 0, error, dontroute, mlen;
        int atomic = sosendallatonce(so) || top;
        int pru_flags;

        if (uio) {
                resid = uio->uio_resid;
        } else {
                resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
                len = 0;
                for (m = top; m; m = m->m_next)
                        len += m->m_len;
                KKASSERT(top->m_pkthdr.len == len);
#endif
        }

        /*
         * WARNING!  resid is unsigned, space and len are signed.  space
         *           can wind up negative if the sockbuf is overcommitted.
         *
         * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
         * type sockets since that's an error.
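         *
         * A worked example of the hazard: if the sockbuf is overcommitted,
         * ssb_space() can return, say, -512.  Without the explicit casts
         * below, comparing that against the unsigned resid would convert
         * -512 to a huge positive value and falsely indicate free space.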
         */
        if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
                error = EINVAL;
                goto out;
        }

        dontroute =
            (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
            (so->so_proto->pr_flags & PR_ATOMIC);
        if (td->td_lwp != NULL)
                td->td_lwp->lwp_ru.ru_msgsnd++;
        if (control)
                clen = control->m_len;
#define gotoerr(errcode)        { error = errcode; goto release; }

restart:
        error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
        if (error)
                goto out;

        do {
                if (so->so_state & SS_CANTSENDMORE)
                        gotoerr(EPIPE);
                if (so->so_error) {
                        error = so->so_error;
                        so->so_error = 0;
                        goto release;
                }
                if ((so->so_state & SS_ISCONNECTED) == 0) {
                        /*
                         * `sendto' and `sendmsg' are allowed on a connection-
                         * based socket if it supports implied connect.
                         * Return ENOTCONN if not connected and no address is
                         * supplied.
                         */
                        if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
                            (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                                if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                                    !(resid == 0 && clen != 0))
                                        gotoerr(ENOTCONN);
                        } else if (addr == NULL)
                                gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
                                   ENOTCONN : EDESTADDRREQ);
                }
                if ((atomic && resid > so->so_snd.ssb_hiwat) ||
                    clen > so->so_snd.ssb_hiwat) {
                        gotoerr(EMSGSIZE);
                }
                space = ssb_space(&so->so_snd);
                if (flags & MSG_OOB)
                        space += 1024;
                if ((space < 0 || (size_t)space < resid + clen) && uio &&
                    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
                        if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
                                gotoerr(EWOULDBLOCK);
                        ssb_unlock(&so->so_snd);
                        error = ssb_wait(&so->so_snd);
                        if (error)
                                goto out;
                        goto restart;
                }
                mp = &top;
                space -= clen;
                do {
                        if (uio == NULL) {
                                /*
                                 * Data is prepackaged in "top".
                                 */
                                resid = 0;
                                if (flags & MSG_EOR)
                                        top->m_flags |= M_EOR;
                        } else do {
                                if (resid > INT_MAX)
                                        resid = INT_MAX;
                                m = m_getl((int)resid, M_WAITOK, MT_DATA,
                                           top == NULL ? M_PKTHDR : 0, &mlen);
                                if (top == NULL) {
                                        m->m_pkthdr.len = 0;
                                        m->m_pkthdr.rcvif = NULL;
                                }
                                len = imin((int)szmin(mlen, resid), space);
                                if (resid < MINCLSIZE) {
                                        /*
                                         * For datagram protocols, leave room
                                         * for protocol headers in first mbuf.
                                         */
                                        if (atomic && top == NULL && len < mlen)
                                                MH_ALIGN(m, len);
                                }
                                space -= len;
                                error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
                                resid = uio->uio_resid;
                                m->m_len = len;
                                *mp = m;
                                top->m_pkthdr.len += len;
                                if (error)
                                        goto release;
                                mp = &m->m_next;
                                if (resid == 0) {
                                        if (flags & MSG_EOR)
                                                top->m_flags |= M_EOR;
                                        break;
                                }
                        } while (space > 0 && atomic);
                        if (dontroute)
                                so->so_options |= SO_DONTROUTE;
                        if (flags & MSG_OOB) {
                                pru_flags = PRUS_OOB;
                        } else if ((flags & MSG_EOF) &&
                                   (so->so_proto->pr_flags & PR_IMPLOPCL) &&
                                   (resid == 0)) {
                                /*
                                 * If the user set MSG_EOF, the protocol
                                 * understands this flag, and there is nothing
                                 * left to send, then use PRU_SEND_EOF instead
                                 * of PRU_SEND.
                                 */
                                pru_flags = PRUS_EOF;
                        } else if (resid > 0 && space > 0) {
                                /* If there is more to send, set PRUS_MORETOCOME */
                                pru_flags = PRUS_MORETOCOME;
                        } else {
                                pru_flags = 0;
                        }
                        /*
                         * XXX all the SS_CANTSENDMORE checks previously
                         * done could be out of date.  We could have received
                         * a reset packet in an interrupt or maybe we slept
                         * while doing page faults in uiomove() etc.  We could
                         * probably recheck again inside the splnet()
                         * protection here, but there are probably other
                         * places that this also happens.  We must rethink
                         * this.
                         */
                        error = so_pru_send(so, pru_flags, top, addr, control, td);
                        if (dontroute)
                                so->so_options &= ~SO_DONTROUTE;
                        clen = 0;
                        control = NULL;
                        top = NULL;
                        mp = &top;
                        if (error)
                                goto release;
                } while (resid && space > 0);
        } while (resid);

release:
        ssb_unlock(&so->so_snd);
out:
        if (top)
                m_freem(top);
        if (control)
                m_freem(control);
        return (error);
}

#ifdef INET
/*
 * A specialization of sosend() for UDP based on protocol-specific knowledge:
 *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
 *      sosendallatonce() returns true,
 *      the "atomic" variable is true,
 *      and sosendudp() blocks until space is available for the entire send.
 *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
 *      PR_IMPLOPCL flags set.
 *   UDP has no out-of-band data.
 *   UDP has no control data.
 *   UDP does not support MSG_EOR.
 */
int
sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
          struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        size_t resid;
        int error, pru_flags = 0;
        int space;

        if (td->td_lwp != NULL)
                td->td_lwp->lwp_ru.ru_msgsnd++;
        if (control)
                m_freem(control);

        KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
        resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;

restart:
        error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
        if (error)
                goto out;

        if (so->so_state & SS_CANTSENDMORE)
                gotoerr(EPIPE);
        if (so->so_error) {
                error = so->so_error;
                so->so_error = 0;
                goto release;
        }
        if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
                gotoerr(EDESTADDRREQ);
        if (resid > so->so_snd.ssb_hiwat)
                gotoerr(EMSGSIZE);
        space = ssb_space(&so->so_snd);
        if (uio && (space < 0 || (size_t)space < resid)) {
                if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
                        gotoerr(EWOULDBLOCK);
                ssb_unlock(&so->so_snd);
                error = ssb_wait(&so->so_snd);
                if (error)
                        goto out;
                goto restart;
        }

        if (uio) {
                int hdrlen = max_hdr;

                /*
                 * We try to optimize out the additional mbuf
                 * allocations in M_PREPEND() on the output path, e.g.
                 * - udp_output(), when it tries to prepend protocol
                 *   headers.
                 * - The link layer output function, when it tries to
                 *   prepend the link layer header.
                 *
                 * This probably will not benefit any data that will
                 * be fragmented, so this optimization is only performed
                 * when the size of the data plus the max size of the
                 * protocol+link headers fits into one mbuf cluster.
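                 *
                 * For example, with a 2KB mbuf cluster (MCLBYTES is
                 * typically 2048) and max_hdr bytes reserved up front,
                 * a datagram of up to MCLBYTES - max_hdr bytes is copied
                 * into a single cluster and udp_output() can later
                 * prepend its headers without allocating another mbuf.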
                 */
                if (uio->uio_resid > MCLBYTES - hdrlen ||
                    !udp_sosend_prepend) {
                        top = m_uiomove(uio);
                        if (top == NULL)
                                goto release;
                } else {
                        int nsize;

                        top = m_getl(uio->uio_resid + hdrlen, M_WAITOK,
                            MT_DATA, M_PKTHDR, &nsize);
                        KASSERT(nsize >= uio->uio_resid + hdrlen,
                            ("sosendudp invalid nsize %d, "
                             "resid %zu, hdrlen %d",
                             nsize, uio->uio_resid, hdrlen));

                        top->m_len = uio->uio_resid;
                        top->m_pkthdr.len = uio->uio_resid;
                        top->m_data += hdrlen;

                        error = uiomove(mtod(top, caddr_t), top->m_len, uio);
                        if (error)
                                goto out;
                }
        }

        if (flags & MSG_DONTROUTE)
                pru_flags |= PRUS_DONTROUTE;

        if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
                so_pru_send_async(so, pru_flags, top, addr, NULL, td);
                error = 0;
        } else {
                error = so_pru_send(so, pru_flags, top, addr, NULL, td);
        }
        top = NULL;     /* sent or freed in lower layer */

release:
        ssb_unlock(&so->so_snd);
out:
        if (top)
                m_freem(top);
        return (error);
}

int
sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
          struct mbuf *top, struct mbuf *control, int flags,
          struct thread *td)
{
        struct mbuf **mp;
        struct mbuf *m;
        size_t resid;
        int space, len;
        int error, mlen;
        int allatonce;
        int pru_flags;

        if (uio) {
                KKASSERT(top == NULL);
                allatonce = 0;
                resid = uio->uio_resid;
        } else {
                allatonce = 1;
                resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
                len = 0;
                for (m = top; m; m = m->m_next)
                        len += m->m_len;
                KKASSERT(top->m_pkthdr.len == len);
#endif
        }

        /*
         * WARNING!  resid is unsigned, space and len are signed.  space
         *           can wind up negative if the sockbuf is overcommitted.
         *
         * Also check to make sure that MSG_EOR isn't used on TCP.
         */
        if (flags & MSG_EOR) {
                error = EINVAL;
                goto out;
        }

        if (control) {
                /* TCP doesn't do control messages (rights, creds, etc) */
                if (control->m_len) {
                        error = EINVAL;
                        goto out;
                }
                m_freem(control);       /* empty control, just free it */
                control = NULL;
        }

        if (td->td_lwp != NULL)
                td->td_lwp->lwp_ru.ru_msgsnd++;

#define gotoerr(errcode)        { error = errcode; goto release; }

restart:
        error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
        if (error)
                goto out;

        do {
                if (so->so_state & SS_CANTSENDMORE)
                        gotoerr(EPIPE);
                if (so->so_error) {
                        error = so->so_error;
                        so->so_error = 0;
                        goto release;
                }
                if ((so->so_state & SS_ISCONNECTED) == 0 &&
                    (so->so_state & SS_ISCONFIRMING) == 0)
                        gotoerr(ENOTCONN);
                if (allatonce && resid > so->so_snd.ssb_hiwat)
                        gotoerr(EMSGSIZE);

                space = ssb_space_prealloc(&so->so_snd);
                if (flags & MSG_OOB)
                        space += 1024;
                if ((space < 0 || (size_t)space < resid) && !allatonce &&
                    space < so->so_snd.ssb_lowat) {
                        if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
                                gotoerr(EWOULDBLOCK);
                        ssb_unlock(&so->so_snd);
                        error = ssb_wait(&so->so_snd);
                        if (error)
                                goto out;
                        goto restart;
                }
                mp = &top;
                do {
                        int cnt = 0, async = 0;

                        if (uio == NULL) {
                                /*
                                 * Data is prepackaged in "top".
                                 */
                                resid = 0;
                        } else do {
                                if (resid > INT_MAX)
                                        resid = INT_MAX;
                                if (tcp_sosend_jcluster) {
                                        m = m_getlj((int)resid, M_WAITOK, MT_DATA,
                                                    top == NULL ? M_PKTHDR : 0, &mlen);
                                } else {
                                        m = m_getl((int)resid, M_WAITOK, MT_DATA,
                                                   top == NULL ? M_PKTHDR : 0, &mlen);
                                }
                                if (top == NULL) {
                                        m->m_pkthdr.len = 0;
                                        m->m_pkthdr.rcvif = NULL;
                                }
                                len = imin((int)szmin(mlen, resid), space);
                                space -= len;
                                error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
                                resid = uio->uio_resid;
                                m->m_len = len;
                                *mp = m;
                                top->m_pkthdr.len += len;
                                if (error)
                                        goto release;
                                mp = &m->m_next;
                                if (resid == 0)
                                        break;
                                ++cnt;
                        } while (space > 0 && cnt < tcp_sosend_agglim);

                        if (tcp_sosend_async)
                                async = 1;

                        if (flags & MSG_OOB) {
                                pru_flags = PRUS_OOB;
                                async = 0;
                        } else if ((flags & MSG_EOF) && resid == 0) {
                                pru_flags = PRUS_EOF;
                        } else if (resid > 0 && space > 0) {
                                /* If there is more to send, set PRUS_MORETOCOME */
                                pru_flags = PRUS_MORETOCOME;
                                async = 1;
                        } else {
                                pru_flags = 0;
                        }

                        if (flags & MSG_SYNC)
                                async = 0;

                        /*
                         * XXX all the SS_CANTSENDMORE checks previously
                         * done could be out of date.  We could have received
                         * a reset packet in an interrupt or maybe we slept
                         * while doing page faults in uiomove() etc.  We could
                         * probably recheck again inside the splnet()
                         * protection here, but there are probably other
                         * places that this also happens.  We must rethink
                         * this.
                         */
                        for (m = top; m; m = m->m_next)
                                ssb_preallocstream(&so->so_snd, m);
                        if (!async) {
                                error = so_pru_send(so, pru_flags, top,
                                    NULL, NULL, td);
                        } else {
                                so_pru_send_async(so, pru_flags, top,
                                    NULL, NULL, td);
                                error = 0;
                        }

                        top = NULL;
                        mp = &top;
                        if (error)
                                goto release;
                } while (resid && space > 0);
        } while (resid);

release:
        ssb_unlock(&so->so_snd);
out:
        if (top)
                m_freem(top);
        if (control)
                m_freem(control);
        return (error);
}
#endif

/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the signalsockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 *
 * Although the signalsockbuf is locked, new data may still be appended.
 * A token inside the ssb_lock deals with MP issues and still allows
 * the network to access the socket if we block in a uio.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
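 *
 * A minimal caller sketch (illustrative only; a read(2) on a socket
 * reaches this code through the so_pru_soreceive() dispatch):
 *
 *      auio.uio_resid = len;           how much the caller wants
 *      error = soreceive(so, NULL, &auio, NULL, NULL, NULL);
 *      nread = len - auio.uio_resid;   short counts are normal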
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
          struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
        struct mbuf *m, *n;
        struct mbuf *free_chain = NULL;
        int flags, len, error, offset;
        struct protosw *pr = so->so_proto;
        int moff, type = 0;
        size_t resid, orig_resid;

        if (uio)
                resid = uio->uio_resid;
        else
                resid = (size_t)(sio->sb_climit - sio->sb_cc);
        orig_resid = resid;

        if (psa)
                *psa = NULL;
        if (controlp)
                *controlp = NULL;
        if (flagsp)
                flags = *flagsp &~ MSG_EOR;
        else
                flags = 0;
        if (flags & MSG_OOB) {
                m = m_get(M_WAITOK, MT_DATA);
                if (m == NULL)
                        return (ENOBUFS);
                error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
                if (error)
                        goto bad;
                if (sio) {
                        do {
                                sbappend(sio, m);
                                KKASSERT(resid >= (size_t)m->m_len);
                                resid -= (size_t)m->m_len;
                        } while (resid > 0 && m);
                } else {
                        do {
                                uio->uio_resid = resid;
                                error = uiomove(mtod(m, caddr_t),
                                                (int)szmin(resid, m->m_len),
                                                uio);
                                resid = uio->uio_resid;
                                m = m_free(m);
                        } while (uio->uio_resid && error == 0 && m);
                }
bad:
                if (m)
                        m_freem(m);
                return (error);
        }
        if ((so->so_state & SS_ISCONFIRMING) && resid)
                so_pru_rcvd(so, 0);

        /*
         * The token interlocks against the protocol thread while
         * ssb_lock is a blocking lock against other userland entities.
         */
        lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
        error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
        if (error)
                goto done;

        m = so->so_rcv.ssb_mb;
        /*
         * If we have less data than requested, block awaiting more
         * (subject to any timeout) if:
         *   1. the current count is less than the low water mark, or
         *   2. MSG_WAITALL is set, and it is possible to do the entire
         *      receive operation at once if we block (resid <= hiwat),
         * and
         *   3. MSG_DONTWAIT is not set.
         * If MSG_WAITALL is set but resid is larger than the receive buffer,
         * we have to do the receive in sections, and thus risk returning
         * a short count if a timeout or signal occurs after we start.
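         *
         * For example, with the default low water mark of 1, a read(2)
         * asking for 100 bytes returns as soon as at least one byte is
         * queued; only MSG_WAITALL (absent a signal or timeout) insists
         * on the full 100 bytes.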
         */
        if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
            (size_t)so->so_rcv.ssb_cc < resid) &&
            (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
            ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
            m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
                KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
                if (so->so_error) {
                        if (m)
                                goto dontblock;
                        error = so->so_error;
                        if ((flags & MSG_PEEK) == 0)
                                so->so_error = 0;
                        goto release;
                }
                if (so->so_state & SS_CANTRCVMORE) {
                        if (m)
                                goto dontblock;
                        else
                                goto release;
                }
                for (; m; m = m->m_next) {
                        if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
                                m = so->so_rcv.ssb_mb;
                                goto dontblock;
                        }
                }
                if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
                    (pr->pr_flags & PR_CONNREQUIRED)) {
                        error = ENOTCONN;
                        goto release;
                }
                if (resid == 0)
                        goto release;
                if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
                        error = EWOULDBLOCK;
                        goto release;
                }
                ssb_unlock(&so->so_rcv);
                error = ssb_wait(&so->so_rcv);
                if (error)
                        goto done;
                goto restart;
        }
dontblock:
        if (uio && uio->uio_td && uio->uio_td->td_proc)
                uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

        /*
         * note: m should be == sb_mb here.  Cache the next record while
         * cleaning up.  Note that calling m_free*() will break out of the
         * critical section.
         */
        KKASSERT(m == so->so_rcv.ssb_mb);

        /*
         * Skip any address mbufs prepending the record.
         */
        if (pr->pr_flags & PR_ADDR) {
                KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
                orig_resid = 0;
                if (psa)
                        *psa = dup_sockaddr(mtod(m, struct sockaddr *));
                if (flags & MSG_PEEK)
                        m = m->m_next;
                else
                        m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
        }

        /*
         * Skip any control mbufs prepending the record.
         */
        while (m && m->m_type == MT_CONTROL && error == 0) {
                if (flags & MSG_PEEK) {
                        if (controlp)
                                *controlp = m_copy(m, 0, m->m_len);
                        m = m->m_next;  /* XXX race */
                } else {
                        if (controlp) {
                                n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
                                if (pr->pr_domain->dom_externalize &&
                                    mtod(m, struct cmsghdr *)->cmsg_type ==
                                    SCM_RIGHTS)
                                        error = (*pr->pr_domain->dom_externalize)(m);
                                *controlp = m;
                                m = n;
                        } else {
                                m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
                        }
                }
                if (controlp && *controlp) {
                        orig_resid = 0;
                        controlp = &(*controlp)->m_next;
                }
        }

        /*
         * flag OOB data.
         */
        if (m) {
                type = m->m_type;
                if (type == MT_OOBDATA)
                        flags |= MSG_OOB;
        }

        /*
         * Copy to the UIO or mbuf return chain (*mp).
         */
        moff = 0;
        offset = 0;
        while (m && resid > 0 && error == 0) {
                if (m->m_type == MT_OOBDATA) {
                        if (type != MT_OOBDATA)
                                break;
                } else if (type == MT_OOBDATA)
                        break;
                else
                        KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
                                ("receive 3"));
                soclrstate(so, SS_RCVATMARK);
                len = (resid > INT_MAX) ? INT_MAX : resid;
                if (so->so_oobmark && len > so->so_oobmark - offset)
                        len = so->so_oobmark - offset;
                if (len > m->m_len - moff)
                        len = m->m_len - moff;

                /*
                 * Copy out to the UIO or pass the mbufs back to the SIO.
                 * The SIO is dealt with when we eat the mbuf, but deal
                 * with the resid here either way.
                 */
                if (uio) {
                        uio->uio_resid = resid;
                        error = uiomove(mtod(m, caddr_t) + moff, len, uio);
                        resid = uio->uio_resid;
                        if (error)
                                goto release;
                } else {
                        resid -= (size_t)len;
                }

                /*
                 * Eat the entire mbuf or just a piece of it
                 */
                if (len == m->m_len - moff) {
                        if (m->m_flags & M_EOR)
                                flags |= MSG_EOR;
                        if (flags & MSG_PEEK) {
                                m = m->m_next;
                                moff = 0;
                        } else {
                                if (sio) {
                                        n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
                                        sbappend(sio, m);
                                        m = n;
                                } else {
                                        m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
                                }
                        }
                } else {
                        if (flags & MSG_PEEK) {
                                moff += len;
                        } else {
                                if (sio) {
                                        n = m_copym(m, 0, len, M_WAITOK);
                                        if (n)
                                                sbappend(sio, n);
                                }
                                m->m_data += len;
                                m->m_len -= len;
                                so->so_rcv.ssb_cc -= len;
                        }
                }
                if (so->so_oobmark) {
                        if ((flags & MSG_PEEK) == 0) {
                                so->so_oobmark -= len;
                                if (so->so_oobmark == 0) {
                                        sosetstate(so, SS_RCVATMARK);
                                        break;
                                }
                        } else {
                                offset += len;
                                if (offset == so->so_oobmark)
                                        break;
                        }
                }
                if (flags & MSG_EOR)
                        break;
                /*
                 * If the MSG_WAITALL flag is set (for non-atomic socket),
                 * we must not quit until resid == 0 or an error
                 * termination.  If a signal/timeout occurs, return
                 * with a short count but without error.
                 * Keep signalsockbuf locked against other readers.
                 */
                while ((flags & MSG_WAITALL) && m == NULL &&
                       resid > 0 && !sosendallatonce(so) &&
                       so->so_rcv.ssb_mb == NULL) {
                        if (so->so_error || so->so_state & SS_CANTRCVMORE)
                                break;
                        /*
                         * The window might have closed to zero, make
                         * sure we send an ack now that we've drained
                         * the buffer or we might end up blocking until
                         * the idle takes over (5 seconds).
                         */
                        if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
                                so_pru_rcvd(so, flags);
                        error = ssb_wait(&so->so_rcv);
                        if (error) {
                                ssb_unlock(&so->so_rcv);
                                error = 0;
                                goto done;
                        }
                        m = so->so_rcv.ssb_mb;
                }
        }

        /*
         * If an atomic read was requested but unread data still remains
         * in the record, set MSG_TRUNC.
         */
        if (m && pr->pr_flags & PR_ATOMIC)
                flags |= MSG_TRUNC;

        /*
         * Cleanup.  If an atomic read was requested drop any unread data.
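         * (For an atomic/datagram protocol this is what makes a recv(2)
         * into a short buffer return with MSG_TRUNC set and the unread
         * tail of the record discarded, e.g. reading 10 bytes of a
         * 100-byte datagram drops the remaining 90.)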
         */
        if ((flags & MSG_PEEK) == 0) {
                if (m && (pr->pr_flags & PR_ATOMIC))
                        sbdroprecord(&so->so_rcv.sb);
                if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
                        so_pru_rcvd(so, flags);
        }

        if (orig_resid == resid && orig_resid &&
            (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
                ssb_unlock(&so->so_rcv);
                goto restart;
        }

        if (flagsp)
                *flagsp |= flags;
release:
        ssb_unlock(&so->so_rcv);
done:
        lwkt_reltoken(&so->so_rcv.ssb_token);
        if (free_chain)
                m_freem(free_chain);
        return (error);
}

int
sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
          struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
        struct mbuf *m, *n;
        struct mbuf *free_chain = NULL;
        int flags, len, error, offset;
        struct protosw *pr = so->so_proto;
        int moff;
        int didoob;
        size_t resid, orig_resid, restmp;

        if (uio)
                resid = uio->uio_resid;
        else
                resid = (size_t)(sio->sb_climit - sio->sb_cc);
        orig_resid = resid;

        if (psa)
                *psa = NULL;
        if (controlp)
                *controlp = NULL;
        if (flagsp)
                flags = *flagsp &~ MSG_EOR;
        else
                flags = 0;
        if (flags & MSG_OOB) {
                m = m_get(M_WAITOK, MT_DATA);
                if (m == NULL)
                        return (ENOBUFS);
                error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
                if (error)
                        goto bad;
                if (sio) {
                        do {
                                sbappend(sio, m);
                                KKASSERT(resid >= (size_t)m->m_len);
                                resid -= (size_t)m->m_len;
                        } while (resid > 0 && m);
                } else {
                        do {
                                uio->uio_resid = resid;
                                error = uiomove(mtod(m, caddr_t),
                                                (int)szmin(resid, m->m_len),
                                                uio);
                                resid = uio->uio_resid;
                                m = m_free(m);
                        } while (uio->uio_resid && error == 0 && m);
                }
bad:
                if (m)
                        m_freem(m);
                return (error);
        }

        /*
         * The token interlocks against the protocol thread while
         * ssb_lock is a blocking lock against other userland entities.
         *
         * Lock a limited number of mbufs (not all, so sbcompress() still
         * works well).  The token is used as an interlock for sbwait() so
         * release it afterwards.
         */
restart:
        error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
        if (error)
                goto done;

        lwkt_gettoken(&so->so_rcv.ssb_token);
        m = so->so_rcv.ssb_mb;

        /*
         * If we have less data than requested, block awaiting more
         * (subject to any timeout) if:
         *   1. the current count is less than the low water mark, or
         *   2. MSG_WAITALL is set, and it is possible to do the entire
         *      receive operation at once if we block (resid <= hiwat),
         * and
         *   3. MSG_DONTWAIT is not set.
         * If MSG_WAITALL is set but resid is larger than the receive buffer,
         * we have to do the receive in sections, and thus risk returning
         * a short count if a timeout or signal occurs after we start.
         */
        if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
            (size_t)so->so_rcv.ssb_cc < resid) &&
            (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
            ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
                KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
                if (so->so_error) {
                        if (m)
                                goto dontblock;
                        lwkt_reltoken(&so->so_rcv.ssb_token);
                        error = so->so_error;
                        if ((flags & MSG_PEEK) == 0)
                                so->so_error = 0;
                        goto release;
                }
                if (so->so_state & SS_CANTRCVMORE) {
                        if (m)
                                goto dontblock;
                        lwkt_reltoken(&so->so_rcv.ssb_token);
                        goto release;
                }
                if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
                    (pr->pr_flags & PR_CONNREQUIRED)) {
                        lwkt_reltoken(&so->so_rcv.ssb_token);
                        error = ENOTCONN;
                        goto release;
                }
                if (resid == 0) {
                        lwkt_reltoken(&so->so_rcv.ssb_token);
                        goto release;
                }
                if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
                        lwkt_reltoken(&so->so_rcv.ssb_token);
                        error = EWOULDBLOCK;
                        goto release;
                }
                ssb_unlock(&so->so_rcv);
                error = ssb_wait(&so->so_rcv);
                lwkt_reltoken(&so->so_rcv.ssb_token);
                if (error)
                        goto done;
                goto restart;
        }

        /*
         * Token still held
         */
dontblock:
        n = m;
        restmp = 0;
        while (n && restmp < resid) {
                n->m_flags |= M_SOLOCKED;
                restmp += n->m_len;
                if (n->m_next == NULL)
                        n = n->m_nextpkt;
                else
                        n = n->m_next;
        }

        /*
         * Release token for loop
         */
        lwkt_reltoken(&so->so_rcv.ssb_token);
        if (uio && uio->uio_td && uio->uio_td->td_proc)
                uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

        /*
         * note: m should be == sb_mb here.  Cache the next record while
         * cleaning up.  Note that calling m_free*() will break out of the
         * critical section.
         */
        KKASSERT(m == so->so_rcv.ssb_mb);

        /*
         * Copy to the UIO or mbuf return chain (*mp).
         *
         * NOTE: Token is not held for loop
         */
        moff = 0;
        offset = 0;
        didoob = 0;

        while (m && (m->m_flags & M_SOLOCKED) && resid > 0 && error == 0) {
                KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
                        ("receive 3"));

                soclrstate(so, SS_RCVATMARK);
                len = (resid > INT_MAX) ? INT_MAX : resid;
                if (so->so_oobmark && len > so->so_oobmark - offset)
                        len = so->so_oobmark - offset;
                if (len > m->m_len - moff)
                        len = m->m_len - moff;

                /*
                 * Copy out to the UIO or pass the mbufs back to the SIO.
                 * The SIO is dealt with when we eat the mbuf, but deal
                 * with the resid here either way.
                 */
                if (uio) {
                        uio->uio_resid = resid;
                        error = uiomove(mtod(m, caddr_t) + moff, len, uio);
                        resid = uio->uio_resid;
                        if (error)
                                goto release;
                } else {
                        resid -= (size_t)len;
                }

                /*
                 * Eat the entire mbuf or just a piece of it
                 */
                offset += len;
                if (len == m->m_len - moff) {
                        m = m->m_next;
                        moff = 0;
                } else {
                        moff += len;
                }

                /*
                 * Check oobmark
                 */
                if (so->so_oobmark && offset == so->so_oobmark) {
                        didoob = 1;
                        break;
                }
        }

        /*
         * Synchronize sockbuf with data we read.
         *
         * NOTE: (m) is junk on entry (it could be left over from the
         *       previous loop).
         */
        if ((flags & MSG_PEEK) == 0) {
                lwkt_gettoken(&so->so_rcv.ssb_token);
                m = so->so_rcv.ssb_mb;
                while (m && offset >= m->m_len) {
                        if (so->so_oobmark) {
                                so->so_oobmark -= m->m_len;
                                if (so->so_oobmark == 0) {
                                        sosetstate(so, SS_RCVATMARK);
                                        didoob = 1;
                                }
                        }
                        offset -= m->m_len;
                        if (sio) {
                                n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
                                sbappend(sio, m);
                                m = n;
                        } else {
                                m = sbunlinkmbuf(&so->so_rcv.sb,
                                                 m, &free_chain);
                        }
                }
                if (offset) {
                        KKASSERT(m);
                        if (sio) {
                                n = m_copym(m, 0, offset, M_WAITOK);
                                if (n)
                                        sbappend(sio, n);
                        }
                        m->m_data += offset;
                        m->m_len -= offset;
                        so->so_rcv.ssb_cc -= offset;
                        if (so->so_oobmark) {
                                so->so_oobmark -= offset;
                                if (so->so_oobmark == 0) {
                                        sosetstate(so, SS_RCVATMARK);
                                        didoob = 1;
                                }
                        }
                        offset = 0;
                }
                lwkt_reltoken(&so->so_rcv.ssb_token);
        }

        /*
         * If the MSG_WAITALL flag is set (for non-atomic socket),
         * we must not quit until resid == 0 or an error termination.
         *
         * If a signal/timeout occurs, return with a short count but without
         * error.
         *
         * Keep signalsockbuf locked against other readers.
         *
         * XXX if MSG_PEEK we currently do quit.
         */
        if ((flags & MSG_WAITALL) && !(flags & MSG_PEEK) &&
            didoob == 0 && resid > 0 &&
            !sosendallatonce(so)) {
                lwkt_gettoken(&so->so_rcv.ssb_token);
                error = 0;
                while ((m = so->so_rcv.ssb_mb) == NULL) {
                        if (so->so_error || (so->so_state & SS_CANTRCVMORE)) {
                                error = so->so_error;
                                break;
                        }
                        /*
                         * The window might have closed to zero, make
                         * sure we send an ack now that we've drained
                         * the buffer or we might end up blocking until
                         * the idle takes over (5 seconds).
                         */
                        if (so->so_pcb)
                                so_pru_rcvd_async(so);
                        if (so->so_rcv.ssb_mb == NULL)
                                error = ssb_wait(&so->so_rcv);
                        if (error) {
                                lwkt_reltoken(&so->so_rcv.ssb_token);
                                ssb_unlock(&so->so_rcv);
                                error = 0;
                                goto done;
                        }
                }
                if (m && error == 0)
                        goto dontblock;
                lwkt_reltoken(&so->so_rcv.ssb_token);
        }

        /*
         * Token not held here.
         *
         * Cleanup.  If an atomic read was requested drop any unread data XXX
         */
        if ((flags & MSG_PEEK) == 0) {
                if (so->so_pcb)
                        so_pru_rcvd_async(so);
        }

        if (orig_resid == resid && orig_resid &&
            (so->so_state & SS_CANTRCVMORE) == 0) {
                ssb_unlock(&so->so_rcv);
                goto restart;
        }

        if (flagsp)
                *flagsp |= flags;
release:
        ssb_unlock(&so->so_rcv);
done:
        if (free_chain)
                m_freem(free_chain);
        return (error);
}

/*
 * Shut a socket down.  Note that we do not get a frontend lock as we
 * want to be able to shut the socket down even if another thread is
 * blocked in a read(), thus waking it up.
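 *
 * E.g. shutdown(s, SHUT_RD) flushes the receive buffer via sorflush(),
 * so a thread blocked in recv(2) wakes up and sees end-of-file.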
 */
int
soshutdown(struct socket *so, int how)
{
        if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
                return (EINVAL);

        if (how != SHUT_WR) {
                /*ssb_lock(&so->so_rcv, M_WAITOK);*/
                sorflush(so);
                /*ssb_unlock(&so->so_rcv);*/
        }
        if (how != SHUT_RD)
                return (so_pru_shutdown(so));
        return (0);
}

void
sorflush(struct socket *so)
{
        struct signalsockbuf *ssb = &so->so_rcv;
        struct protosw *pr = so->so_proto;
        struct signalsockbuf asb;

        atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);

        lwkt_gettoken(&ssb->ssb_token);
        socantrcvmore(so);
        asb = *ssb;

        /*
         * Can't just blow up the ssb structure here
         */
        bzero(&ssb->sb, sizeof(ssb->sb));
        ssb->ssb_timeo = 0;
        ssb->ssb_lowat = 0;
        ssb->ssb_hiwat = 0;
        ssb->ssb_mbmax = 0;
        atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);

        if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
                (*pr->pr_domain->dom_dispose)(asb.ssb_mb);
        ssb_release(&asb, so);

        lwkt_reltoken(&ssb->ssb_token);
}

#ifdef INET
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
        struct accept_filter_arg *afap = NULL;
        struct accept_filter *afp;
        struct so_accf *af = so->so_accf;
        int error = 0;

        /* do not set/remove accept filters on non-listening sockets */
        if ((so->so_options & SO_ACCEPTCONN) == 0) {
                error = EINVAL;
                goto out;
        }

        /* removing the filter */
        if (sopt == NULL) {
                if (af != NULL) {
                        if (af->so_accept_filter != NULL &&
                            af->so_accept_filter->accf_destroy != NULL) {
                                af->so_accept_filter->accf_destroy(so);
                        }
                        if (af->so_accept_filter_str != NULL) {
                                kfree(af->so_accept_filter_str, M_ACCF);
                        }
                        kfree(af, M_ACCF);
                        so->so_accf = NULL;
                }
                so->so_options &= ~SO_ACCEPTFILTER;
                return (0);
        }
        /* adding a filter */
        /* must remove previous filter first */
        if (af != NULL) {
                error = EINVAL;
                goto out;
        }
        /* don't put large objects on the kernel stack */
        afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
        error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
        afap->af_name[sizeof(afap->af_name)-1] = '\0';
        afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
        if (error)
                goto out;
        afp = accept_filt_get(afap->af_name);
        if (afp == NULL) {
                error = ENOENT;
                goto out;
        }
        af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
        if (afp->accf_create != NULL) {
                if (afap->af_name[0] != '\0') {
                        int len = strlen(afap->af_name) + 1;

                        af->so_accept_filter_str = kmalloc(len, M_ACCF,
                                                           M_WAITOK);
                        strcpy(af->so_accept_filter_str, afap->af_name);
                }
                af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
                if (af->so_accept_filter_arg == NULL) {
                        kfree(af->so_accept_filter_str, M_ACCF);
                        kfree(af, M_ACCF);
                        so->so_accf = NULL;
                        error = EINVAL;
                        goto out;
                }
        }
        af->so_accept_filter = afp;
        so->so_accf = af;
        so->so_options |= SO_ACCEPTFILTER;
out:
        if (afap != NULL)
                kfree(afap, M_TEMP);
        return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
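 *
 * A typical use, as in the SO_LINGER case in sosetopt() below, is a
 * fixed-size copyin:
 *
 *      struct linger l;
 *      error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
 *
 * which fails with EINVAL if the user supplied fewer than sizeof l bytes.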
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	return soopt_to_kbuf(sopt, buf, len, minlen);
}

int
soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}
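/*
 * Illustrative sketch (not compiled): the usual pattern by which a
 * protocol-level pr_ctloutput() handler pulls a fixed-size integer
 * option with sooptcopyin().  The handler and parameter names here
 * are hypothetical.
 */
#if 0
static int
example_set_intopt(struct sockopt *sopt, int *optp)
{
	int optval, error;

	/*
	 * Extra bytes beyond sizeof(int) are ignored; shorter input
	 * fails with EINVAL.
	 */
	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	*optp = optval;
	return (0);
}
#endif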
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;
	struct signalsockbuf *sotmp;

	error = 0;
	sopt->sopt_dir = SOPT_SET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif /* INET */
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so,
				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
						&so->so_snd : &so->so_rcv;
				atomic_clear_int(&sotmp->ssb_flags,
						 SSB_AUTOSIZE);
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.ssb_lowat =
				    (optval > so->so_snd.ssb_hiwat) ?
				     so->so_snd.ssb_hiwat : optval;
				atomic_clear_int(&so->so_snd.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.ssb_lowat =
				    (optval > so->so_rcv.ssb_hiwat) ?
				     so->so_rcv.ssb_hiwat : optval;
				atomic_clear_int(&so->so_rcv.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			/*
			 * Convert to ticks, e.g. with hz=100 (ustick=10000us)
			 * a timeout of {2s, 500000us} becomes 200 + 50 = 250.
			 */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.ssb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.ssb_timeo = val;
				break;
			}
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) so_pr_ctloutput(so, sopt);
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	soopt_from_kbuf(sopt, buf, len);
	return 0;
}

void
soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
{
	size_t	valsize;

	if (len == 0) {
		sopt->sopt_valsize = 0;
		return;
	}

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
	 */
	valsize = szmin(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		bcopy(buf, sopt->sopt_val, valsize);
	}
}
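/*
 * Illustrative sketch (not compiled): the matching get-side pattern in
 * a pr_ctloutput() handler, pushing a value back with sooptcopyout().
 * The handler name is hypothetical.
 */
#if 0
static int
example_get_intopt(struct sockopt *sopt, int curval)
{
	/* copies min(sizeof(curval), sopt->sopt_valsize) bytes out */
	return (sooptcopyout(sopt, &curval, sizeof(curval)));
}
#endif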
2289 */ 2290 valsize = szmin(len, sopt->sopt_valsize); 2291 sopt->sopt_valsize = valsize; 2292 if (sopt->sopt_val != 0) { 2293 bcopy(buf, sopt->sopt_val, valsize); 2294 } 2295 } 2296 2297 int 2298 sogetopt(struct socket *so, struct sockopt *sopt) 2299 { 2300 int error, optval; 2301 long optval_l; 2302 struct linger l; 2303 struct timeval tv; 2304 #ifdef INET 2305 struct accept_filter_arg *afap; 2306 #endif 2307 2308 error = 0; 2309 sopt->sopt_dir = SOPT_GET; 2310 if (sopt->sopt_level != SOL_SOCKET) { 2311 if (so->so_proto && so->so_proto->pr_ctloutput) { 2312 return (so_pr_ctloutput(so, sopt)); 2313 } else 2314 return (ENOPROTOOPT); 2315 } else { 2316 switch (sopt->sopt_name) { 2317 #ifdef INET 2318 case SO_ACCEPTFILTER: 2319 if ((so->so_options & SO_ACCEPTCONN) == 0) 2320 return (EINVAL); 2321 afap = kmalloc(sizeof(*afap), M_TEMP, 2322 M_WAITOK | M_ZERO); 2323 if ((so->so_options & SO_ACCEPTFILTER) != 0) { 2324 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); 2325 if (so->so_accf->so_accept_filter_str != NULL) 2326 strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); 2327 } 2328 error = sooptcopyout(sopt, afap, sizeof(*afap)); 2329 kfree(afap, M_TEMP); 2330 break; 2331 #endif /* INET */ 2332 2333 case SO_LINGER: 2334 l.l_onoff = so->so_options & SO_LINGER; 2335 l.l_linger = so->so_linger; 2336 error = sooptcopyout(sopt, &l, sizeof l); 2337 break; 2338 2339 case SO_USELOOPBACK: 2340 case SO_DONTROUTE: 2341 case SO_DEBUG: 2342 case SO_KEEPALIVE: 2343 case SO_REUSEADDR: 2344 case SO_REUSEPORT: 2345 case SO_BROADCAST: 2346 case SO_OOBINLINE: 2347 case SO_TIMESTAMP: 2348 case SO_NOSIGPIPE: 2349 optval = so->so_options & sopt->sopt_name; 2350 integer: 2351 error = sooptcopyout(sopt, &optval, sizeof optval); 2352 break; 2353 2354 case SO_TYPE: 2355 optval = so->so_type; 2356 goto integer; 2357 2358 case SO_ERROR: 2359 optval = so->so_error; 2360 so->so_error = 0; 2361 goto integer; 2362 2363 case SO_SNDBUF: 2364 optval = so->so_snd.ssb_hiwat; 2365 goto integer; 2366 2367 case SO_RCVBUF: 2368 optval = so->so_rcv.ssb_hiwat; 2369 goto integer; 2370 2371 case SO_SNDLOWAT: 2372 optval = so->so_snd.ssb_lowat; 2373 goto integer; 2374 2375 case SO_RCVLOWAT: 2376 optval = so->so_rcv.ssb_lowat; 2377 goto integer; 2378 2379 case SO_SNDTIMEO: 2380 case SO_RCVTIMEO: 2381 optval = (sopt->sopt_name == SO_SNDTIMEO ? 2382 so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo); 2383 2384 tv.tv_sec = optval / hz; 2385 tv.tv_usec = (optval % hz) * ustick; 2386 error = sooptcopyout(sopt, &tv, sizeof tv); 2387 break; 2388 2389 case SO_SNDSPACE: 2390 optval_l = ssb_space(&so->so_snd); 2391 error = sooptcopyout(sopt, &optval_l, sizeof(optval_l)); 2392 break; 2393 2394 case SO_CPUHINT: 2395 optval = -1; /* no hint */ 2396 goto integer; 2397 2398 default: 2399 error = ENOPROTOOPT; 2400 break; 2401 } 2402 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) 2403 so_pr_ctloutput(so, sopt); 2404 return (error); 2405 } 2406 } 2407 2408 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 2409 int 2410 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 2411 { 2412 struct mbuf *m, *m_prev; 2413 int sopt_size = sopt->sopt_valsize, msize; 2414 2415 m = m_getl(sopt_size, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA, 2416 0, &msize); 2417 if (m == NULL) 2418 return (ENOBUFS); 2419 m->m_len = min(msize, sopt_size); 2420 sopt_size -= m->m_len; 2421 *mp = m; 2422 m_prev = m; 2423 2424 while (sopt_size > 0) { 2425 m = m_getl(sopt_size, sopt->sopt_td ? 
/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	soopt_to_mbuf(sopt, m);
	return 0;
}

void
soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t valsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return;
	val = sopt->sopt_val;
	valsize = sopt->sopt_valsize;
	while (m != NULL && valsize >= m->m_len) {
		bcopy(val, mtod(m, char *), m->m_len);
		valsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		m = m->m_next;
	}
	/* the chain should have been allocated large enough at ip6_sooptmcopyin() */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	return soopt_from_mbuf(sopt, m);
}

int
soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;
	size_t maxsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return 0;
	val = sopt->sopt_val;
	maxsize = sopt->sopt_valsize;
	while (m != NULL && maxsize >= m->m_len) {
		bcopy(mtod(m, char *), val, m->m_len);
		maxsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* user-land should have supplied a large enough soopt buffer */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

void
sohasoutofband(struct socket *so)
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
}

int
sokqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	struct signalsockbuf *ssb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		ssb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		ssb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		ssb = &so->so_rcv;
		break;
	default:
		return (EOPNOTSUPP);
	}

	knote_insert(&ssb->ssb_kq.ki_note, kn);
	atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
	return (0);
}
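/*
 * Illustrative sketch (not compiled): how userland attaches one of the
 * filters below via kqueue, here waiting until at least 128 bytes are
 * readable (NOTE_LOWAT overrides the socket's receive low-water mark).
 * The kq and sock variables are hypothetical.
 */
#if 0
	struct kevent kev;
	int kq = kqueue();

	EV_SET(&kev, sock, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
		err(1, "kevent");
#endif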
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			return (1);
		}
		return (0);
	}
	kn->kn_data = so->so_rcv.ssb_cc;

	if (so->so_state & SS_CANTRCVMORE) {
		/*
		 * Only set NODATA if all data has been exhausted.
		 */
		if (kn->kn_data == 0)
			kn->kn_flags |= EV_NODATA;
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
		!TAILQ_EMPTY(&so->so_comp));
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
		atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = ssb_space(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.ssb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}