/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 */

#include "opt_inet.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <vm/vm_zone.h>
#include <vm/pmap.h>
#include <net/netmsg2.h>
#include <net/netisr2.h>

#include <sys/thread2.h>
#include <sys/socketvar2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

#ifdef INET
extern int tcp_sosend_agglim;
extern int tcp_sosend_async;
extern int tcp_sosend_jcluster;
extern int udp_sosend_async;
extern int udp_sosend_prepend;

static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif /* INET */

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static void	sodiscard(struct socket *so);
static int	soclose_sync(struct socket *so, int fflag);
static void	soclose_fast(struct socket *so);

static struct filterops solisten_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
static struct filterops soexcept_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };

MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");

static int use_soclose_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
    &use_soclose_fast, 0, "Fast socket close");

int use_soaccept_pred_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
    &use_soaccept_pred_fast, 0, "Fast socket accept prediction");

int use_sendfile_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
    &use_sendfile_async, 0, "sendfile uses asynchronous pru_send");

int use_soconnect_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
    &use_soconnect_async, 0, "soconnect uses asynchronous pru_connect");

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*
 * Get a socket structure, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, struct protosw *pr)
{
	struct socket *so;
	unsigned waitmask;

	waitmask = waitok ? M_WAITOK : M_NOWAIT;
	so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
	if (so) {
		/* XXX race condition for reentrant kernel */
		so->so_proto = pr;
		TAILQ_INIT(&so->so_aiojobq);
		TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
		TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
		lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
		lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
		spin_init(&so->so_rcvd_spin, "soalloc");
		netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
		    MSGF_DROPABLE | MSGF_PRIORITY,
		    so->so_proto->pr_usrreqs->pru_rcvd);
		so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
		so->so_state = SS_NOFDREF;
		so->so_refs = 1;
	}
	return so;
}

int
socreate(int dom, struct socket **aso, int type,
    int proto, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct protosw *prp;
	struct socket *so;
	struct pru_attach_info ai;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);

	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_INET6 &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != NULL, prp);
	if (so == NULL)
		return (ENOBUFS);

	/*
	 * Callers of socreate() presumably will connect up a descriptor
	 * and call soclose() if they cannot.  This represents our so_refs
	 * (which should be 1) from soalloc().
	 */
	soclrstate(so, SS_NOFDREF);

	/*
	 * Set a default port for protocol processing.  No action will occur
	 * on the socket on this port until an inpcb is attached to it and
	 * is able to match incoming packets, or until the socket becomes
	 * available to userland.
	 *
	 * We normally default the socket to the protocol thread on cpu 0,
	 * if the protocol does not provide its own method to initialize the
	 * default port.
	 *
	 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
	 * thread and all pr_*()/pru_*() calls are executed synchronously.
	 */
	if (prp->pr_flags & PR_SYNC_PORT)
		so->so_port = &netisr_sync_port;
	else if (prp->pr_initport != NULL)
		so->so_port = prp->pr_initport();
	else
		so->so_port = netisr_cpuport(0);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(p->p_ucred);
	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
	ai.p_ucred = p->p_ucred;
	ai.fd_rdir = p->p_fd->fd_rdir;

	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	error = so_pru_attach(so, proto, &ai);
	if (error) {
		sosetstate(so, SS_NOFDREF);
		sofree(so);	/* from soalloc */
		return error;
	}

	/*
	 * NOTE: Returns referenced socket.
	 */
	*aso = so;
	return (0);
}
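/*
 * Usage sketch (not from the original file; attach_descriptor() is a
 * hypothetical stand-in for the descriptor plumbing that normally
 * lives in sys_socket.c / uipc_syscalls.c), showing the reference
 * discipline described in socreate() above:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP, td);
 *	if (error)
 *		return (error);
 *	error = attach_descriptor(so);		// hypothetical helper
 *	if (error)
 *		soclose(so, 0);	// disposes of the so_refs from soalloc()
 */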
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	error = so_pru_bind(so, nam, td);
	return (error);
}

static void
sodealloc(struct socket *so)
{
	if (so->so_rcv.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if present */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif /* INET */
	crfree(so->so_cred);
	if (so->so_faddr != NULL)
		kfree(so->so_faddr, M_SONAME);
	kfree(so, M_SOCKET);
}

int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;
#ifdef SCTP
	short oldopt, oldqlimit;
#endif /* SCTP */

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
		return (EINVAL);

#ifdef SCTP
	oldopt = so->so_options;
	oldqlimit = so->so_qlimit;
#endif /* SCTP */

	lwkt_gettoken(&so->so_rcv.ssb_token);
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	/*
	 * SCTP needs to tweak both the inbound backlog parameter AND
	 * the so_options (in the UDP model a socket both connects and
	 * accepts inbound connections ... implicitly).
	 */
	error = so_pru_listen(so, td);
	if (error) {
#ifdef SCTP
		/* Restore the params */
		so->so_options = oldopt;
		so->so_qlimit = oldqlimit;
#endif /* SCTP */
		return (error);
	}
	return (0);
}
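/*
 * Worked example of the backlog clamp in solisten() above, assuming
 * the default SOMAXCONN of 128: listen(s, 10) yields so_qlimit = 10,
 * while both listen(s, -1) and listen(s, 4096) are clamped to
 * so_qlimit = 128 (raise kern.ipc.somaxconn to allow more).
 */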
/*
 * Destroy a disconnected socket.  This routine is a NOP if entities
 * still have a reference on the socket:
 *
 *	so_pcb -	The protocol stack still has a reference
 *	SS_NOFDREF -	There is no longer a file pointer reference
 */
void
sofree(struct socket *so)
{
	struct socket *head;

	/*
	 * This is a bit hackish at the moment.  We need to interlock
	 * any accept queue we are on before we potentially lose the
	 * last reference to avoid races against a re-reference from
	 * someone operating on the queue.
	 */
	while ((head = so->so_head) != NULL) {
		lwkt_getpooltoken(head);
		if (so->so_head == head)
			break;
		lwkt_relpooltoken(head);
	}

	/*
	 * Arbitrage the last free.
	 */
	KKASSERT(so->so_refs > 0);
	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
		if (head)
			lwkt_relpooltoken(head);
		return;
	}

	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);

	/*
	 * We're done, remove ourselves from the accept queue we are
	 * on, if we are on one.
	 */
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			lwkt_relpooltoken(head);
			return;
		} else {
			panic("sofree: not queued");
		}
		soclrstate(so, SS_INCOMP);
		so->so_head = NULL;
		lwkt_relpooltoken(head);
	}
	ssb_release(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int fflag)
{
	int error;

	funsetown(&so->so_sigio);
	if (!use_soclose_fast ||
	    (so->so_proto->pr_flags & PR_SYNC_PORT) ||
	    ((so->so_state & SS_ISCONNECTED) &&
	     (so->so_options & SO_LINGER))) {
		error = soclose_sync(so, fflag);
	} else {
		soclose_fast(so);
		error = 0;
	}
	return error;
}
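/*
 * Decision sketch for the close path above (assuming the default
 * kern.ipc.soclose_fast=1): a unix-domain socket (PR_SYNC_PORT), or a
 * connected socket with SO_LINGER set, is torn down synchronously in
 * soclose_sync() and may block; anything else is handed to
 * soclose_fast(), which queues the teardown to the protocol thread
 * and returns immediately with error 0.
 */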
static void
sodiscard(struct socket *so)
{
	lwkt_getpooltoken(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;

		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			soclrstate(sp, SS_INCOMP);
			sp->so_head = NULL;
			so->so_incqlen--;
			soabort_async(sp);
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			soclrstate(sp, SS_COMP);
			sp->so_head = NULL;
			so->so_qlen--;
			soabort_async(sp);
		}
	}
	lwkt_relpooltoken(so);

	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	sosetstate(so, SS_NOFDREF);	/* take ref */
}

void
soinherit(struct socket *so, struct socket *so_inh)
{
	TAILQ_HEAD(, socket) comp, incomp;
	struct socket *sp;
	int qlen, incqlen;

	KASSERT(so->so_options & SO_ACCEPTCONN,
	    ("so does not accept connection"));
	KASSERT(so_inh->so_options & SO_ACCEPTCONN,
	    ("so_inh does not accept connection"));

	TAILQ_INIT(&comp);
	TAILQ_INIT(&incomp);

	lwkt_getpooltoken(so);
	lwkt_getpooltoken(so_inh);

	/*
	 * Save the completed and incomplete queues.
	 */
	TAILQ_CONCAT(&comp, &so->so_comp, so_list);
	qlen = so->so_qlen;
	so->so_qlen = 0;

	TAILQ_CONCAT(&incomp, &so->so_incomp, so_list);
	incqlen = so->so_incqlen;
	so->so_incqlen = 0;

	/*
	 * Append the saved completed and incomplete queues to the
	 * socket that inherits them.
	 *
	 * XXX
	 * This may temporarily break the inheriting socket's
	 * so_qlimit.
	 */
	TAILQ_FOREACH(sp, &comp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_FOREACH(sp, &incomp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list);
	so_inh->so_qlen += qlen;

	TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list);
	so_inh->so_incqlen += incqlen;

	lwkt_relpooltoken(so_inh);
	lwkt_relpooltoken(so);

	if (qlen) {
		/*
		 * "New" connections have arrived
		 */
		sorwakeup(so_inh);
		wakeup(&so_inh->so_timeo);
	}
}

static int
soclose_sync(struct socket *so, int fflag)
{
	int error = 0;

	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (fflag & FNONBLOCK))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo, PCATCH,
				    "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;

		error2 = so_pru_detach(so);
		if (error == 0)
			error = error2;
	}
discard:
	sodiscard(so);
	so_pru_sync(so);	/* unpend async sending */
	sofree(so);		/* dispose of ref */

	return (error);
}

static void
soclose_sofree_async_handler(netmsg_t msg)
{
	sofree(msg->base.nm_so);
}

static void
soclose_sofree_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_sofree_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_disconn_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0)
		so_pru_disconnect_direct(so);

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_disconn_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_disconn_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_detach_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_detach_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_detach_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_fast(struct socket *so)
{
	if (so->so_pcb == NULL)
		goto discard;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0) {
		soclose_disconn_async(so);
		return;
	}

	if (so->so_pcb) {
		soclose_detach_async(so);
		return;
	}

discard:
	sodiscard(so);
	soclose_sofree_async(so);
}
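/*
 * Sketch of the message flow above (a summary of the code, not a new
 * mechanism): each soclose_*_async() helper initializes the socket's
 * embedded so_clomsg with netmsg_init() and sends it to so->so_port,
 * the socket's protocol thread; the paired handler then runs the
 * *_direct() teardown calls from within that thread, so soclose_fast()
 * never blocks the closing thread.  The dispatch is:
 *
 *	pcb == NULL			-> sodiscard() + soclose_sofree_async()
 *	connected, not disconnecting	-> soclose_disconn_async()
 *	otherwise (pcb still attached)	-> soclose_detach_async()
 */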
/*
 * Abort and destroy a socket.  Only one abort can be in progress
 * at any given moment.
 */
void
soabort(struct socket *so)
{
	soreference(so);
	so_pru_abort(so);
}

void
soabort_async(struct socket *so)
{
	soreference(so);
	so_pru_abort_async(so);
}

void
soabort_oncpu(struct socket *so)
{
	soreference(so);
	so_pru_abort_direct(so);
}

/*
 * so is passed in ref'd, which becomes owned by
 * the cleared SS_NOFDREF flag.
 */
void
soaccept_generic(struct socket *so)
{
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	soclrstate(so, SS_NOFDREF);	/* owned by lack of SS_NOFDREF */
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	soaccept_generic(so);
	error = so_pru_accept(so, nam);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
    boolean_t sync)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	     (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection
		 * from biting us.
		 */
		so->so_error = 0;
		if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
			error = so_pru_connect_async(so, nam, td);
		else
			error = so_pru_connect(so, nam, td);
	}
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	error = so_pru_connect2(so1, so2);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = so_pru_disconnect(so);
bad:
	return (error);
}
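/*
 * Illustrative userland sketch of the "disconnect by connecting to a
 * null address" behavior noted in soconnect() above (assumption: the
 * datagram protocol accepts an AF_UNSPEC address, as BSD UDP
 * traditionally does):
 *
 *	struct sockaddr sa;
 *
 *	bzero(&sa, sizeof(sa));
 *	sa.sa_len = sizeof(sa);
 *	sa.sa_family = AF_UNSPEC;
 *	connect(s, &sa, sizeof(sa));	// dissolves the association
 *
 * For PR_CONNREQUIRED protocols such as TCP the same call instead
 * fails with EISCONN.
 */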
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags,
    struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int clen = 0, error, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
	int pru_flags;

	if (uio) {
		resid = uio->uio_resid;
	} else {
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 * can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					gotoerr(ENOTCONN);
			} else if (addr == NULL) {
				gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
			}
		}
		if ((atomic && resid > so->so_snd.ssb_hiwat) ||
		    clen > so->so_snd.ssb_hiwat) {
			gotoerr(EMSGSIZE);
		}
		space = ssb_space(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid + clen) && uio &&
		    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				m = m_getl((int)resid, MB_WAIT, MT_DATA,
				    top == NULL ? M_PKTHDR : 0, &mlen);
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				if (resid < MINCLSIZE) {
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == NULL && len < mlen)
						MH_ALIGN(m, len);
				}
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
			} else if ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid == 0)) {
				/*
				 * If the user set MSG_EOF, the protocol
				 * understands this flag, and there is nothing
				 * left to send, then use PRU_SEND_EOF instead
				 * of PRU_SEND.
				 */
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
			} else {
				pru_flags = 0;
			}
			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = so_pru_send(so, pru_flags, top, addr, control, td);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
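/*
 * Behavioral sketch of the space checks above for an atomic
 * (PR_ATOMIC) socket: a single send larger than the high-water mark
 * can never succeed, so it fails immediately with EMSGSIZE instead of
 * blocking; a send that merely does not fit right now blocks in
 * ssb_wait() (or fails with EWOULDBLOCK under MSG_DONTWAIT) and is
 * retried from "restart:" once space frees up.
 */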
#ifdef INET
/*
 * A specialization of sosend() for UDP based on protocol-specific knowledge:
 *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
 *	sosendallatonce() returns true,
 *	the "atomic" variable is true,
 *	and sosendudp() blocks until space is available for the entire send.
 *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
 *	PR_IMPLOPCL flags set.
 *   UDP has no out-of-band data.
 *   UDP has no control data.
 *   UDP does not support MSG_EOR.
 */
int
sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	size_t resid;
	int error, pru_flags = 0;
	int space;

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		m_freem(control);

	KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
	resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	if (so->so_state & SS_CANTSENDMORE)
		gotoerr(EPIPE);
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		goto release;
	}
	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
		gotoerr(EDESTADDRREQ);
	if (resid > so->so_snd.ssb_hiwat)
		gotoerr(EMSGSIZE);
	space = ssb_space(&so->so_snd);
	if (uio && (space < 0 || (size_t)space < resid)) {
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
			gotoerr(EWOULDBLOCK);
		ssb_unlock(&so->so_snd);
		error = ssb_wait(&so->so_snd);
		if (error)
			goto out;
		goto restart;
	}

	if (uio) {
		int hdrlen = max_hdr;

		/*
		 * We try to optimize out the additional mbuf
		 * allocations in M_PREPEND() on output path, e.g.
		 * - udp_output(), when it tries to prepend protocol
		 *   headers.
		 * - Link layer output function, when it tries to
		 *   prepend link layer header.
		 *
		 * This probably will not benefit any data that will
		 * be fragmented, so this optimization is only performed
		 * when the size of data and max size of protocol+link
		 * headers fit into one mbuf cluster.
		 */
		if (uio->uio_resid > MCLBYTES - hdrlen ||
		    !udp_sosend_prepend) {
			top = m_uiomove(uio);
			if (top == NULL)
				goto release;
		} else {
			int nsize;

			top = m_getl(uio->uio_resid + hdrlen, MB_WAIT,
			    MT_DATA, M_PKTHDR, &nsize);
			KASSERT(nsize >= uio->uio_resid + hdrlen,
			    ("sosendudp invalid nsize %d, "
			     "resid %zu, hdrlen %d",
			     nsize, uio->uio_resid, hdrlen));

			top->m_len = uio->uio_resid;
			top->m_pkthdr.len = uio->uio_resid;
			top->m_data += hdrlen;

			error = uiomove(mtod(top, caddr_t), top->m_len, uio);
			if (error)
				goto out;
		}
	}

	if (flags & MSG_DONTROUTE)
		pru_flags |= PRUS_DONTROUTE;

	if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
		so_pru_send_async(so, pru_flags, top, addr, NULL, td);
		error = 0;
	} else {
		error = so_pru_send(so, pru_flags, top, addr, NULL, td);
	}
	top = NULL;	/* sent or freed in lower layer */

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	return (error);
}
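/*
 * Worked example for the prepend optimization above (numbers are
 * illustrative assumptions): with MCLBYTES = 2048 and max_hdr on the
 * order of 100 bytes, a 1400-byte datagram is copied into a single
 * cluster with m_data advanced by hdrlen, so udp_output() and the
 * link layer can later prepend their headers in place rather than
 * allocating a fresh mbuf in M_PREPEND().  A 2000-byte datagram would
 * not fit alongside the reserved headers and takes the plain
 * m_uiomove() path instead.
 */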
int
sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags,
    struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int error, mlen;
	int allatonce;
	int pru_flags;

	if (uio) {
		KKASSERT(top == NULL);
		allatonce = 0;
		resid = uio->uio_resid;
	} else {
		allatonce = 1;
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 * can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on TCP.
	 */
	if (flags & MSG_EOR) {
		error = EINVAL;
		goto out;
	}

	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
		control = NULL;
	}

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;

#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 &&
		    (so->so_state & SS_ISCONFIRMING) == 0)
			gotoerr(ENOTCONN);
		if (allatonce && resid > so->so_snd.ssb_hiwat)
			gotoerr(EMSGSIZE);

		space = ssb_space_prealloc(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid) && !allatonce &&
		    space < so->so_snd.ssb_lowat) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		do {
			int cnt = 0, async = 0;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				if (tcp_sosend_jcluster) {
					m = m_getlj((int)resid, MB_WAIT, MT_DATA,
					    top == NULL ? M_PKTHDR : 0, &mlen);
				} else {
					m = m_getl((int)resid, MB_WAIT, MT_DATA,
					    top == NULL ? M_PKTHDR : 0, &mlen);
				}
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0)
					break;
				++cnt;
			} while (space > 0 && cnt < tcp_sosend_agglim);

			if (tcp_sosend_async)
				async = 1;

			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
				async = 0;
			} else if ((flags & MSG_EOF) && resid == 0) {
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
				async = 1;
			} else {
				pru_flags = 0;
			}

			if (flags & MSG_SYNC)
				async = 0;

			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			for (m = top; m; m = m->m_next)
				ssb_preallocstream(&so->so_snd, m);
			if (!async) {
				error = so_pru_send(so, pru_flags, top,
				    NULL, NULL, td);
			} else {
				so_pru_send_async(so, pru_flags, top,
				    NULL, NULL, td);
				error = 0;
			}

			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
#endif
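/*
 * Sketch of the asynchronous TCP send pipelining above (a summary of
 * the code, default sysctls assumed): when tcp_sosend_async is
 * enabled, each aggregated chunk of up to tcp_sosend_agglim mbufs is
 * pushed to the protocol thread with so_pru_send_async() and
 * PRUS_MORETOCOME while more data remains, letting uiomove() of the
 * next chunk overlap protocol processing of the previous one.
 * MSG_SYNC (or MSG_OOB for that send) forces the synchronous
 * so_pru_send() path instead.
 */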
/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the signalsockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 *
 * Although the signalsockbuf is locked, new data may still be appended.
 * A token inside the ssb_lock deals with MP issues and still allows
 * the network to access the socket if we block in a uio.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
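/*
 * Record layout sketch for the description above (a hypothetical UDP
 * datagram with one piece of ancillary data, on a PR_ADDR protocol):
 *
 *	MT_SONAME (sender address)
 *	  -> MT_CONTROL (ancillary data, optional)
 *	    -> MT_DATA -> MT_DATA ... (payload)
 *
 * Records are chained through m_nextpkt; the mbufs within one record
 * are chained through m_next, which is the order soreceive() peels
 * them off below.
 */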
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
				    (int)szmin(resid, m->m_len),
				    uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if ((so->so_state & SS_ISCONFIRMING) && resid)
		so_pru_rcvd(so, 0);

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.ssb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out critical
	 * section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Skip any address mbufs prepending the record.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *));
		if (flags & MSG_PEEK)
			m = m->m_next;
		else
			m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
	}

	/*
	 * Skip any control mbufs prepending the record.
	 */
#ifdef SCTP
	if (pr->pr_flags & PR_ADDR_OPT) {
		/*
		 * For SCTP we may be getting a
		 * whole message OR a partial delivery.
		 */
		if (m && m->m_type == MT_SONAME) {
			orig_resid = 0;
			if (psa)
				*psa = dup_sockaddr(mtod(m, struct sockaddr *));
			if (flags & MSG_PEEK)
				m = m->m_next;
			else
				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
		}
	}
#endif /* SCTP */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;	/* XXX race */
		} else {
			if (controlp) {
				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				m = n;
			} else {
				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
			}
		}
		if (controlp && *controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * flag OOB data.
	 */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else {
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("receive 3"));
		}
		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
#ifdef SCTP
			if (m->m_flags & M_NOTIFICATION)
				flags |= MSG_NOTIFICATION;
#endif /* SCTP */
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, MB_WAIT);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		    resid > 0 && !sosendallatonce(so) &&
		    so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
				so_pru_rcvd(so, flags);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * If an atomic read was requested but unread data still remains
	 * in the record, set MSG_TRUNC.
	 */
	if (m && pr->pr_flags & PR_ATOMIC)
		flags |= MSG_TRUNC;

	/*
	 * Cleanup.  If an atomic read was requested drop any unread data.
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (m && (pr->pr_flags & PR_ATOMIC))
			sbdroprecord(&so->so_rcv.sb);
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
			so_pru_rcvd(so, flags);
	}

	if (orig_resid == resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}
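/*
 * Behavioral sketch of soreceive() above from a userland viewpoint
 * (illustrative): on a stream socket, recv(s, buf, 1000, MSG_WAITALL)
 * loops internally until 1000 bytes have arrived, the peer closes, or
 * an error occurs; if a signal or timeout interrupts the wait, the
 * call returns the short count with error 0, exactly as the
 * MSG_WAITALL comment above describes.  On a PR_ATOMIC (datagram)
 * socket a too-small buffer instead consumes the whole record and
 * sets MSG_TRUNC.
 */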
int
sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff;
	int didoob;
	size_t resid, orig_resid, restmp;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
				    (int)szmin(resid, m->m_len),
				    uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 *
	 * Lock a limited number of mbufs (not all, so sbcompress() still
	 * works well).  The token is used as an interlock for sbwait() so
	 * release it afterwards.
	 */
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	lwkt_gettoken(&so->so_rcv.ssb_token);
	m = so->so_rcv.ssb_mb;

	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			lwkt_reltoken(&so->so_rcv.ssb_token);
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			lwkt_reltoken(&so->so_rcv.ssb_token);
			goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			lwkt_reltoken(&so->so_rcv.ssb_token);
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0) {
			lwkt_reltoken(&so->so_rcv.ssb_token);
			goto release;
		}
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			lwkt_reltoken(&so->so_rcv.ssb_token);
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		lwkt_reltoken(&so->so_rcv.ssb_token);
		if (error)
			goto done;
		goto restart;
	}

	/*
	 * Token still held
	 */
dontblock:
	n = m;
	restmp = 0;
	while (n && restmp < resid) {
		n->m_flags |= M_SOLOCKED;
		restmp += n->m_len;
		if (n->m_next == NULL)
			n = n->m_nextpkt;
		else
			n = n->m_next;
	}

	/*
	 * Release token for loop
	 */
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out critical
	 * section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 *
	 * NOTE: Token is not held for loop
	 */
	moff = 0;
	offset = 0;
	didoob = 0;

	while (m && (m->m_flags & M_SOLOCKED) && resid > 0 && error == 0) {
		KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
		    ("receive 3"));

		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		offset += len;
		if (len == m->m_len - moff) {
			m = m->m_next;
			moff = 0;
		} else {
			moff += len;
		}

		/*
		 * Check oobmark
		 */
		if (so->so_oobmark && offset == so->so_oobmark) {
			didoob = 1;
			break;
		}
	}

	/*
	 * Synchronize sockbuf with data we read.
	 *
	 * NOTE: (m) is junk on entry (it could be left over from the
	 *	 previous loop).
	 */
	if ((flags & MSG_PEEK) == 0) {
		lwkt_gettoken(&so->so_rcv.ssb_token);
		m = so->so_rcv.ssb_mb;
		while (m && offset >= m->m_len) {
			if (so->so_oobmark) {
				so->so_oobmark -= m->m_len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					didoob = 1;
				}
			}
			offset -= m->m_len;
			if (sio) {
				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
				sbappend(sio, m);
				m = n;
			} else {
				m = sbunlinkmbuf(&so->so_rcv.sb,
				    m, &free_chain);
			}
		}
		if (offset) {
			KKASSERT(m);
			if (sio) {
				n = m_copym(m, 0, offset, MB_WAIT);
				if (n)
					sbappend(sio, n);
			}
			m->m_data += offset;
			m->m_len -= offset;
			so->so_rcv.ssb_cc -= offset;
			if (so->so_oobmark) {
				so->so_oobmark -= offset;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					didoob = 1;
				}
			}
			offset = 0;
		}
		lwkt_reltoken(&so->so_rcv.ssb_token);
	}

	/*
	 * If the MSG_WAITALL flag is set (for non-atomic socket),
	 * we must not quit until resid == 0 or an error termination.
	 *
	 * If a signal/timeout occurs, return with a short count but without
	 * error.
	 *
	 * Keep signalsockbuf locked against other readers.
	 *
	 * XXX if MSG_PEEK we currently do quit.
	 */
	if ((flags & MSG_WAITALL) && !(flags & MSG_PEEK) &&
	    didoob == 0 && resid > 0 &&
	    !sosendallatonce(so)) {
		lwkt_gettoken(&so->so_rcv.ssb_token);
		error = 0;
		while ((m = so->so_rcv.ssb_mb) == NULL) {
			if (so->so_error || (so->so_state & SS_CANTRCVMORE)) {
				error = so->so_error;
				break;
			}
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (so->so_pcb)
				so_pru_rcvd_async(so);
			if (so->so_rcv.ssb_mb == NULL)
				error = ssb_wait(&so->so_rcv);
			if (error) {
				lwkt_reltoken(&so->so_rcv.ssb_token);
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
		}
		if (m && error == 0)
			goto dontblock;
		lwkt_reltoken(&so->so_rcv.ssb_token);
	}

	/*
	 * Token not held here.
	 *
	 * Cleanup.  If an atomic read was requested drop any unread data XXX
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (so->so_pcb)
			so_pru_rcvd_async(so);
	}

	if (orig_resid == resid && orig_resid &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	if (free_chain)
		m_freem(free_chain);
	return (error);
}
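/*
 * Design note on the two-phase receive above (a summary of the code,
 * not a new mechanism): under the token, sorecvtcp() first marks up
 * to resid bytes worth of mbufs M_SOLOCKED; it then drops the token
 * and copies those mbufs out via uiomove() while the protocol thread
 * remains free to append new data; finally it reacquires the token
 * and unlinks or trims exactly the bytes consumed.  This keeps the
 * expensive copy outside the interlock.
 */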
/*
 * Shut a socket down.  Note that we do not get a frontend lock as we
 * want to be able to shut the socket down even if another thread is
 * blocked in a read(), thus waking it up.
 */
int
soshutdown(struct socket *so, int how)
{
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR) {
		/*ssb_lock(&so->so_rcv, M_WAITOK);*/
		sorflush(so);
		/*ssb_unlock(&so->so_rcv);*/
	}
	if (how != SHUT_RD)
		return (so_pru_shutdown(so));
	return (0);
}

void
sorflush(struct socket *so)
{
	struct signalsockbuf *ssb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct signalsockbuf asb;

	atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);

	lwkt_gettoken(&ssb->ssb_token);
	socantrcvmore(so);
	asb = *ssb;

	/*
	 * Can't just blow up the ssb structure here
	 */
	bzero(&ssb->sb, sizeof(ssb->sb));
	ssb->ssb_timeo = 0;
	ssb->ssb_lowat = 0;
	ssb->ssb_hiwat = 0;
	ssb->ssb_mbmax = 0;
	atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.ssb_mb);
	ssb_release(&asb, so);

	lwkt_reltoken(&ssb->ssb_token);
}

#ifdef INET
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				kfree(af->so_accept_filter_str, M_ACCF);
			}
			kfree(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			af->so_accept_filter_str = kmalloc(len, M_ACCF,
			    M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			kfree(af->so_accept_filter_str, M_ACCF);
			kfree(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		kfree(afap, M_TEMP);
	return (error);
}
#endif /* INET */
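/*
 * Usage sketch for do_setopt_accept_filter() (illustrative; assumes
 * an accept filter module that registers under the name "dataready",
 * such as accf_data, is present):
 *
 *	struct accept_filter_arg afa;
 *
 *	bzero(&afa, sizeof(afa));
 *	strcpy(afa.af_name, "dataready");
 *	setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa));
 *
 * The socket must already be listening (SO_ACCEPTCONN), otherwise the
 * routine above returns EINVAL; passing a NULL sopt removes a filter.
 */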
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	return soopt_to_kbuf(sopt, buf, len, minlen);
}

int
soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}
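
/*
 * A typical pr_ctloutput()-side caller pattern for the two routines
 * above (an illustrative sketch, not lifted from any one protocol):
 * passing sizeof(optval) as both len and minlen means a short user
 * buffer fails with EINVAL while an oversized one is quietly clipped:
 *
 *	int optval;
 *	int error;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 */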
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;
	struct signalsockbuf *sotmp;

	error = 0;
	sopt->sopt_dir = SOPT_SET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif /* INET */
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so,
				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
					&so->so_snd : &so->so_rcv;
				atomic_clear_int(&sotmp->ssb_flags,
						 SSB_AUTOSIZE);
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.ssb_lowat =
				    (optval > so->so_snd.ssb_hiwat) ?
				    so->so_snd.ssb_hiwat : optval;
				atomic_clear_int(&so->so_snd.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.ssb_lowat =
				    (optval > so->so_rcv.ssb_hiwat) ?
				    so->so_rcv.ssb_hiwat : optval;
				atomic_clear_int(&so->so_rcv.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.ssb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.ssb_timeo = val;
				break;
			}
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) so_pr_ctloutput(so, sopt);
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	soopt_from_kbuf(sopt, buf, len);
	return 0;
}

void
soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
{
	size_t	valsize;

	if (len == 0) {
		sopt->sopt_valsize = 0;
		return;
	}

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
	 */
	valsize = szmin(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		bcopy(buf, sopt->sopt_val, valsize);
	}
}
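
/*
 * The truncation semantics above are visible from userland; e.g. (an
 * illustrative sketch) fetching the int-sized SO_TYPE option through a
 * one-byte buffer copies out just the first byte and reports the short
 * length back through *optlen:
 *
 *	char tiny;
 *	socklen_t optlen = sizeof(tiny);
 *
 *	getsockopt(s, SOL_SOCKET, SO_TYPE, &tiny, &optlen);
 *
 * after which optlen is 1, not sizeof(int).
 */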
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	long	optval_l;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif

	error = 0;
	sopt->sopt_dir = SOPT_GET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			afap = kmalloc(sizeof(*afap), M_TEMP,
				       M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			kfree(afap, M_TEMP);
			break;
#endif /* INET */

		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.ssb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.ssb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.ssb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.ssb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * ustick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_SNDSPACE:
			optval_l = ssb_space(&so->so_snd);
			error = sooptcopyout(sopt, &optval_l, sizeof(optval_l));
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize, msize;

	m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA,
		   0, &msize);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = min(msize, sopt_size);
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT,
			   MT_DATA, 0, &msize);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		m->m_len = min(msize, sopt_size);
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
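
/*
 * The mbuf-based compatibility helpers below are paired by their
 * callers roughly as follows (an illustrative sketch only; the option
 * code is assumed to consume the chain afterwards):
 *
 *	struct mbuf *m;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 *
 * soopt_getm() sizes the chain from sopt->sopt_valsize, which is why
 * soopt_to_mbuf() may legitimately panic on a chain that is too short.
 */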
/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	soopt_to_mbuf(sopt, m);
	return 0;
}

void
soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t valsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return;
	val = sopt->sopt_val;
	valsize = sopt->sopt_valsize;
	while (m != NULL && valsize >= m->m_len) {
		bcopy(val, mtod(m, char *), m->m_len);
		valsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)	/* the chain should have been sized by soopt_getm() */
		panic("ip6_sooptmcopyin");
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	return soopt_from_mbuf(sopt, m);
}

int
soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;
	size_t maxsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return 0;
	val = sopt->sopt_val;
	maxsize = sopt->sopt_valsize;
	while (m != NULL && maxsize >= m->m_len) {
		bcopy(mtod(m, char *), val, m->m_len);
		maxsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* the caller should have supplied a large enough buffer */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

void
sohasoutofband(struct socket *so)
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
}

int
sokqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	struct signalsockbuf *ssb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		ssb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		ssb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		ssb = &so->so_rcv;
		break;
	default:
		return (EOPNOTSUPP);
	}

	knote_insert(&ssb->ssb_kq.ki_note, kn);
	atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	if (kn->kn_sfflags & NOTE_OOB) {
		if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) {
			kn->kn_fflags |= NOTE_OOB;
			return (1);
		}
		return (0);
	}
	kn->kn_data = so->so_rcv.ssb_cc;

	if (so->so_state & SS_CANTRCVMORE) {
		/*
		 * Only set NODATA if all data has been exhausted.
		 */
		if (kn->kn_data == 0)
			kn->kn_flags |= EV_NODATA;
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
		!TAILQ_EMPTY(&so->so_comp));
}
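
/*
 * Userland view of the read filter above (an illustrative sketch):
 * registering EVFILT_READ with NOTE_LOWAT in fflags and a byte count
 * in data makes filt_soread() compare against kn_sdata rather than the
 * socket's receive low-water mark:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */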
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
		atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = ssb_space(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.ssb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}
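
/*
 * For listen sockets sokqfilter() substitutes filt_solisten() for the
 * plain read filter, so a kevent-driven accept loop needs no special
 * registration (an illustrative sketch; kev.data reflects kn_data,
 * i.e. the completed-connection queue length):
 *
 *	EV_SET(&kev, lfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	if (kevent(kq, NULL, 0, &kev, 1, NULL) > 0)
 *		fd = accept(lfd, NULL, NULL);
 */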