/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 */

#include "opt_inet.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <vm/vm_zone.h>
#include <vm/pmap.h>
#include <net/netmsg2.h>
#include <net/netisr2.h>

#include <sys/thread2.h>
#include <sys/socketvar2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

#ifdef INET
extern int tcp_sosend_agglim;
extern int tcp_sosend_async;
extern int tcp_sosend_jcluster;
extern int udp_sosend_async;
extern int udp_sosend_prepend;

static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif /* INET */

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static void	sodiscard(struct socket *so);
static int	soclose_sync(struct socket *so, int fflag);
static void	soclose_fast(struct socket *so);

static struct filterops solisten_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
static struct filterops soexcept_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };

MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");

static int use_soclose_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
    &use_soclose_fast, 0, "Fast socket close");

int use_soaccept_pred_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
    &use_soaccept_pred_fast, 0, "Fast socket accept prediction");

int use_sendfile_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
    &use_sendfile_async, 0, "sendfile uses asynchronous pru_send");

int use_soconnect_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
    &use_soconnect_async, 0, "soconnect uses asynchronous pru_connect");

/*
 * Socket operation routines.
 * These routines are called by the routines in sys_socket.c or from a
 * system process, and implement the semantics of socket operations by
 * switching out to the protocol-specific routines.
 */

/*
 * Get a socket structure, and initialize it.
 * Note that it would probably be better to allocate the socket and the
 * PCB at the same time, but I'm not convinced that all the protocols can
 * be easily modified to do this.
 */
struct socket *
soalloc(int waitok, struct protosw *pr)
{
	struct socket *so;
	unsigned waitmask;

	waitmask = waitok ? M_WAITOK : M_NOWAIT;
	so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
	if (so) {
		/* XXX race condition for reentrant kernel */
		so->so_proto = pr;
		TAILQ_INIT(&so->so_aiojobq);
		TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
		TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
		lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
		lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
		spin_init(&so->so_rcvd_spin);
		netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
		    MSGF_DROPABLE | MSGF_PRIORITY,
		    so->so_proto->pr_usrreqs->pru_rcvd);
		so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
		so->so_state = SS_NOFDREF;
		so->so_refs = 1;
	}
	return so;
}

int
socreate(int dom, struct socket **aso, int type,
	int proto, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct protosw *prp;
	struct socket *so;
	struct pru_attach_info ai;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);

	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_INET6 &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != NULL, prp);
	if (so == NULL)
		return (ENOBUFS);

	/*
	 * Callers of socreate() presumably will connect up a descriptor
	 * and call soclose() if they cannot.  This represents our so_refs
	 * (which should be 1) from soalloc().
	 */
	soclrstate(so, SS_NOFDREF);

	/*
	 * Set a default port for protocol processing.  No action will occur
	 * on the socket on this port until an inpcb is attached to it and
	 * is able to match incoming packets, or until the socket becomes
	 * available to userland.
	 *
	 * We normally default the socket to the protocol thread on cpu 0,
	 * if the protocol does not provide its own method to initialize the
	 * default port.
	 *
	 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
	 * thread and all pr_*()/pru_*() calls are executed synchronously.
	 */
	if (prp->pr_flags & PR_SYNC_PORT)
		so->so_port = &netisr_sync_port;
	else if (prp->pr_initport != NULL)
		so->so_port = prp->pr_initport();
	else
		so->so_port = netisr_cpuport(0);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(p->p_ucred);
	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
	ai.p_ucred = p->p_ucred;
	ai.fd_rdir = p->p_fd->fd_rdir;

	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	error = so_pru_attach(so, proto, &ai);
	if (error) {
		sosetstate(so, SS_NOFDREF);
		sofree(so);	/* from soalloc */
		return error;
	}

	/*
	 * NOTE: Returns referenced socket.
	 */
	*aso = so;
	return (0);
}
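
/*
 * Illustrative sketch (hypothetical, not compiled): how an in-kernel
 * caller is expected to honor the socreate() contract described above.
 * socreate() hands back a referenced socket; if the caller cannot hook
 * it up to a file descriptor it must dispose of that reference with
 * soclose().  The function and names here are invented for illustration,
 * not DragonFly's actual syscall code.
 */
#if 0
static int
example_create_tcp_socket(struct thread *td, struct socket **sop)
{
	struct socket *so;
	int error;

	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP, td);
	if (error)
		return (error);
	/* ... attach the socket to a file descriptor here; on failure: */
	if (0 /* hypothetical descriptor setup failure */) {
		soclose(so, 0);		/* drop the socreate() reference */
		return (ENFILE);
	}
	*sop = so;
	return (0);
}
#endif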

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	error = so_pru_bind(so, nam, td);
	return (error);
}

static void
sodealloc(struct socket *so)
{
	if (so->so_rcv.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if present */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif /* INET */
	crfree(so->so_cred);
	if (so->so_faddr != NULL)
		kfree(so->so_faddr, M_SONAME);
	kfree(so, M_SOCKET);
}

int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;
#ifdef SCTP
	short oldopt, oldqlimit;
#endif /* SCTP */

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
		return (EINVAL);

#ifdef SCTP
	oldopt = so->so_options;
	oldqlimit = so->so_qlimit;
#endif /* SCTP */

	lwkt_gettoken(&so->so_rcv.ssb_token);
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	/*
	 * SCTP needs to tweak both the inbound backlog parameter AND
	 * the so_options (in the UDP model a socket both connects and
	 * accepts inbound connections ... implicitly).
	 */
	error = so_pru_listen(so, td);
	if (error) {
#ifdef SCTP
		/* Restore the params */
		so->so_options = oldopt;
		so->so_qlimit = oldqlimit;
#endif /* SCTP */
		return (error);
	}
	return (0);
}

/*
 * Destroy a disconnected socket.  This routine is a NOP if entities
 * still have a reference on the socket:
 *
 *	so_pcb -	The protocol stack still has a reference
 *	SS_NOFDREF -	There is no longer a file pointer reference
 */
void
sofree(struct socket *so)
{
	struct socket *head;

	/*
	 * This is a bit hackish at the moment.  We need to interlock
	 * any accept queue we are on before we potentially lose the
	 * last reference to avoid races against a re-reference from
	 * someone operating on the queue.
	 */
	while ((head = so->so_head) != NULL) {
		lwkt_getpooltoken(head);
		if (so->so_head == head)
			break;
		lwkt_relpooltoken(head);
	}

	/*
	 * Arbitrate the last free.
	 */
	KKASSERT(so->so_refs > 0);
	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
		if (head)
			lwkt_relpooltoken(head);
		return;
	}

	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);

	/*
	 * We're done, remove ourselves from the accept queue we are
	 * on, if we are on one.
	 */
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			lwkt_relpooltoken(head);
			return;
		} else {
			panic("sofree: not queued");
		}
		soclrstate(so, SS_INCOMP);
		so->so_head = NULL;
		lwkt_relpooltoken(head);
	}
	ssb_release(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int fflag)
{
	int error;

	funsetown(&so->so_sigio);
	if (!use_soclose_fast ||
	    (so->so_proto->pr_flags & PR_SYNC_PORT) ||
	    ((so->so_state & SS_ISCONNECTED) &&
	     (so->so_options & SO_LINGER))) {
		error = soclose_sync(so, fflag);
	} else {
		soclose_fast(so);
		error = 0;
	}
	return error;
}

static void
sodiscard(struct socket *so)
{
	lwkt_getpooltoken(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;

		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			soclrstate(sp, SS_INCOMP);
			sp->so_head = NULL;
			so->so_incqlen--;
			soabort_async(sp);
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			soclrstate(sp, SS_COMP);
			sp->so_head = NULL;
			so->so_qlen--;
			soabort_async(sp);
		}
	}
	lwkt_relpooltoken(so);

	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	sosetstate(so, SS_NOFDREF);	/* take ref */
}
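
/*
 * Illustrative sketch (hypothetical, not compiled): soclose() above falls
 * back to the synchronous path when SO_LINGER is set on a connected
 * socket, so close() can sleep waiting for the disconnect.  A hedged
 * userland sketch of arming a lingering close; the function name is
 * invented for illustration:
 */
#if 0
static void
example_lingering_close(int s)
{
	struct linger lg;

	lg.l_onoff = 1;
	lg.l_linger = 5;	/* let close() block for up to 5 seconds */
	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
		perror("SO_LINGER");
	close(s);		/* may now sleep in soclose_sync() */
}
#endif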

void
soinherit(struct socket *so, struct socket *so_inh)
{
	TAILQ_HEAD(, socket) comp, incomp;
	struct socket *sp;
	int qlen, incqlen;

	KASSERT(so->so_options & SO_ACCEPTCONN,
	    ("so does not accept connections"));
	KASSERT(so_inh->so_options & SO_ACCEPTCONN,
	    ("so_inh does not accept connections"));

	TAILQ_INIT(&comp);
	TAILQ_INIT(&incomp);

	lwkt_getpooltoken(so);
	lwkt_getpooltoken(so_inh);

	/*
	 * Save the completed queue and the incomplete queue.
	 */
	TAILQ_CONCAT(&comp, &so->so_comp, so_list);
	qlen = so->so_qlen;
	so->so_qlen = 0;

	TAILQ_CONCAT(&incomp, &so->so_incomp, so_list);
	incqlen = so->so_incqlen;
	so->so_incqlen = 0;

	/*
	 * Append the saved completed queue and incomplete queue to the
	 * socket that inherits them.
	 *
	 * XXX
	 * This may temporarily break the inheriting socket's so_qlimit.
	 */
	TAILQ_FOREACH(sp, &comp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_FOREACH(sp, &incomp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list);
	so_inh->so_qlen += qlen;

	TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list);
	so_inh->so_incqlen += incqlen;

	lwkt_relpooltoken(so_inh);
	lwkt_relpooltoken(so);

	if (qlen) {
		/*
		 * "New" connections have arrived
		 */
		sorwakeup(so_inh);
		wakeup(&so_inh->so_timeo);
	}
}

static int
soclose_sync(struct socket *so, int fflag)
{
	int error = 0;

	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (fflag & FNONBLOCK))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo, PCATCH,
				    "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;

		error2 = so_pru_detach(so);
		if (error == 0)
			error = error2;
	}
discard:
	sodiscard(so);
	so_pru_sync(so);	/* unpend async sending */
	sofree(so);		/* dispose of ref */

	return (error);
}

static void
soclose_sofree_async_handler(netmsg_t msg)
{
	sofree(msg->base.nm_so);
}

static void
soclose_sofree_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_sofree_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_disconn_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0)
		so_pru_disconnect_direct(so);

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_disconn_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_disconn_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_detach_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_detach_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_detach_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_fast(struct socket *so)
{
	if (so->so_pcb == NULL)
		goto discard;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0) {
		soclose_disconn_async(so);
		return;
	}

	if (so->so_pcb) {
		soclose_detach_async(so);
		return;
	}

discard:
	sodiscard(so);
	soclose_sofree_async(so);
}
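
/*
 * Illustrative sketch (hypothetical, not compiled): the three
 * soclose_*_async() helpers above all follow the same LWKT pattern --
 * initialize a netmsg with a reply port that is never expected to be
 * used (netisr_apanic_rport), point it at a handler, and send it to the
 * socket's protocol thread via so->so_port so the work runs there.
 * A hedged sketch of that skeleton with invented names:
 */
#if 0
static void
example_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	/* runs in the socket's protocol thread */
	sodiscard(so);
	sofree(so);
}

static void
example_dispatch_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0, example_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}
#endif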

/*
 * Abort and destroy a socket.  Only one abort can be in progress
 * at any given moment.
 */
void
soabort(struct socket *so)
{
	soreference(so);
	so_pru_abort(so);
}

void
soabort_async(struct socket *so)
{
	soreference(so);
	so_pru_abort_async(so);
}

void
soabort_oncpu(struct socket *so)
{
	soreference(so);
	so_pru_abort_direct(so);
}

/*
 * so is passed in ref'd, which becomes owned by
 * the cleared SS_NOFDREF flag.
 */
void
soaccept_generic(struct socket *so)
{
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	soclrstate(so, SS_NOFDREF);	/* owned by lack of SS_NOFDREF */
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	soaccept_generic(so);
	error = so_pru_accept(so, nam);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
    boolean_t sync)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If the protocol is connection-based, we can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * the user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	     (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from a previous connection
		 * from biting us.
		 */
		so->so_error = 0;
		if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
			error = so_pru_connect_async(so, nam, td);
		else
			error = so_pru_connect(so, nam, td);
	}
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	error = so_pru_connect2(so1, so2);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = so_pru_disconnect(so);
bad:
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
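
/*
 * Illustrative sketch (hypothetical, not compiled): the "connect to a
 * null address to disconnect" behavior handled by soconnect() above is
 * what lets userland dissolve a datagram socket's peer association.
 * Hedged userland sketch; the function name is invented:
 */
#if 0
static void
example_dissolve_udp_association(int s)
{
	struct sockaddr sa;

	bzero(&sa, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	sa.sa_len = sizeof(sa);		/* BSD sockaddrs carry a length */
	if (connect(s, &sa, sizeof(sa)) < 0)
		perror("connect(AF_UNSPEC)");
}
#endif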

/*
 * Send on a socket.
 *
 * If the send must go all at once and the message is larger than the
 * send buffering, then hard error.  Lock against other senders.  If the
 * send must go all at once and there is not enough room now, inform the
 * user that this would block and do nothing.  Otherwise, if nonblocking,
 * send as much as possible.  The data to be sent is described by "uio"
 * if non-NULL, otherwise by the mbuf chain "top" (which must be NULL if
 * uio is not).  Data provided in an mbuf chain must be small enough to
 * send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for
 * short counts if EINTR/ERESTART are returned.  Data and control buffers
 * are freed on return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
       struct mbuf *top, struct mbuf *control, int flags,
       struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int clen = 0, error, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
	int pru_flags;

	if (uio) {
		resid = uio->uio_resid;
	} else {
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 * can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					gotoerr(ENOTCONN);
			} else if (addr == NULL)
				gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
					ENOTCONN : EDESTADDRREQ);
		}
		if ((atomic && resid > so->so_snd.ssb_hiwat) ||
		    clen > so->so_snd.ssb_hiwat) {
			gotoerr(EMSGSIZE);
		}
		space = ssb_space(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid + clen) && uio &&
		    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				m = m_getl((int)resid, MB_WAIT, MT_DATA,
					   top == NULL ? M_PKTHDR : 0, &mlen);
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				if (resid < MINCLSIZE) {
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == NULL && len < mlen)
						MH_ALIGN(m, len);
				}
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len,
						uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
			} else if ((flags & MSG_EOF) &&
				   (so->so_proto->pr_flags & PR_IMPLOPCL) &&
				   (resid == 0)) {
				/*
				 * If the user set MSG_EOF, the protocol
				 * understands this flag, and there is nothing
				 * left to send, then use PRU_SEND_EOF instead
				 * of PRU_SEND.
				 */
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
			} else {
				pru_flags = 0;
			}
			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = so_pru_send(so, pru_flags, top, addr, control,
					    td);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
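
/*
 * Illustrative sketch (hypothetical, not compiled): as the comment above
 * sosend() says, callers must check for short counts when EINTR/ERESTART
 * come back.  At the syscall boundary this shows up as short writes, so
 * robust userland code loops.  Hedged sketch with invented names:
 */
#if 0
static ssize_t
example_send_all(int s, const void *buf, size_t len)
{
	const char *p = buf;
	size_t left = len;

	while (left > 0) {
		ssize_t n = send(s, p, left, 0);

		if (n < 0) {
			if (errno == EINTR)
				continue;	/* interrupted, retry */
			return (-1);
		}
		p += n;			/* short count: advance and retry */
		left -= n;
	}
	return ((ssize_t)len);
}
#endif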

#ifdef INET
/*
 * A specialization of sosend() for UDP based on protocol-specific
 * knowledge:
 *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
 *	sosendallatonce() returns true,
 *	the "atomic" variable is true,
 *	and sosendudp() blocks until space is available for the entire send.
 *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
 *	PR_IMPLOPCL flags set.
 *   UDP has no out-of-band data.
 *   UDP has no control data.
 *   UDP does not support MSG_EOR.
 */
int
sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
	  struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	size_t resid;
	int error, pru_flags = 0;
	int space;

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		m_freem(control);

	KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
	resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	if (so->so_state & SS_CANTSENDMORE)
		gotoerr(EPIPE);
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		goto release;
	}
	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
		gotoerr(EDESTADDRREQ);
	if (resid > so->so_snd.ssb_hiwat)
		gotoerr(EMSGSIZE);
	space = ssb_space(&so->so_snd);
	if (uio && (space < 0 || (size_t)space < resid)) {
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
			gotoerr(EWOULDBLOCK);
		ssb_unlock(&so->so_snd);
		error = ssb_wait(&so->so_snd);
		if (error)
			goto out;
		goto restart;
	}

	if (uio) {
		int hdrlen = max_hdr;

		/*
		 * We try to optimize out the additional mbuf allocations
		 * in M_PREPEND() on the output path, e.g.
		 * - udp_output(), when it tries to prepend protocol
		 *   headers.
		 * - The link layer output function, when it tries to
		 *   prepend the link layer header.
		 *
		 * This probably will not benefit any data that will be
		 * fragmented, so this optimization is only performed when
		 * the size of the data plus the max size of the
		 * protocol+link headers fits into one mbuf cluster.
		 */
		if (uio->uio_resid > MCLBYTES - hdrlen ||
		    !udp_sosend_prepend) {
			top = m_uiomove(uio);
			if (top == NULL)
				goto release;
		} else {
			int nsize;

			top = m_getl(uio->uio_resid + hdrlen, MB_WAIT,
				     MT_DATA, M_PKTHDR, &nsize);
			KASSERT(nsize >= uio->uio_resid + hdrlen,
			    ("sosendudp invalid nsize %d, "
			     "resid %zu, hdrlen %d",
			     nsize, uio->uio_resid, hdrlen));

			top->m_len = uio->uio_resid;
			top->m_pkthdr.len = uio->uio_resid;
			top->m_data += hdrlen;

			error = uiomove(mtod(top, caddr_t), top->m_len, uio);
			if (error)
				goto out;
		}
	}

	if (flags & MSG_DONTROUTE)
		pru_flags |= PRUS_DONTROUTE;

	if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
		so_pru_send_async(so, pru_flags, top, addr, NULL, td);
		error = 0;
	} else {
		error = so_pru_send(so, pru_flags, top, addr, NULL, td);
	}
	top = NULL;	/* sent or freed in lower layer */

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	return (error);
}
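
/*
 * Illustrative sketch (hypothetical, not compiled): the prepend
 * optimization above simply reserves max_hdr bytes of headroom when the
 * datagram fits in one cluster, so that a later M_PREPEND() in the
 * output path can extend the mbuf in place instead of allocating a new
 * one.  Hedged sketch with an invented consumer:
 */
#if 0
static struct mbuf *
example_prepend_header(struct mbuf *m)
{
	/*
	 * Because sosendudp() left max_hdr bytes of leading space, this
	 * typically just moves m->m_data backwards rather than allocating
	 * a second mbuf for the header.
	 */
	M_PREPEND(m, sizeof(struct udpiphdr), MB_DONTWAIT);
	return (m);	/* NULL if an allocation was needed and failed */
}
#endif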

int
sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
	  struct mbuf *top, struct mbuf *control, int flags,
	  struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int error, mlen;
	int allatonce;
	int pru_flags;

	if (uio) {
		KKASSERT(top == NULL);
		allatonce = 0;
		resid = uio->uio_resid;
	} else {
		allatonce = 1;
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 * can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on TCP.
	 */
	if (flags & MSG_EOR) {
		error = EINVAL;
		goto out;
	}

	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
		control = NULL;
	}

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;

#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 &&
		    (so->so_state & SS_ISCONFIRMING) == 0)
			gotoerr(ENOTCONN);
		if (allatonce && resid > so->so_snd.ssb_hiwat)
			gotoerr(EMSGSIZE);

		space = ssb_space_prealloc(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid) && !allatonce &&
		    space < so->so_snd.ssb_lowat) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		do {
			int cnt = 0, async = 0;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				if (tcp_sosend_jcluster) {
					m = m_getlj((int)resid, MB_WAIT, MT_DATA,
						    top == NULL ? M_PKTHDR : 0,
						    &mlen);
				} else {
					m = m_getl((int)resid, MB_WAIT, MT_DATA,
						   top == NULL ? M_PKTHDR : 0,
						   &mlen);
				}
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len,
						uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0)
					break;
				++cnt;
			} while (space > 0 && cnt < tcp_sosend_agglim);

			if (tcp_sosend_async)
				async = 1;

			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
				async = 0;
			} else if ((flags & MSG_EOF) && resid == 0) {
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
				async = 1;
			} else {
				pru_flags = 0;
			}

			if (flags & MSG_SYNC)
				async = 0;

			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			for (m = top; m; m = m->m_next)
				ssb_preallocstream(&so->so_snd, m);
			if (!async) {
				error = so_pru_send(so, pru_flags, top,
				    NULL, NULL, td);
			} else {
				so_pru_send_async(so, pru_flags, top,
				    NULL, NULL, td);
				error = 0;
			}

			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
#endif

/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the signalsockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 *
 * Although the signalsockbuf is locked, new data may still be appended.
 * A token inside the ssb_lock deals with MP issues and still allows
 * the network to access the socket if we block in a uio.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
						(int)szmin(resid, m->m_len),
						uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if ((so->so_state & SS_ISCONFIRMING) && resid)
		so_pru_rcvd(so, 0);

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.ssb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out of the
	 * critical section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Skip any address mbufs prepending the record.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *));
		if (flags & MSG_PEEK)
			m = m->m_next;
		else
			m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
	}

	/*
	 * Skip any control mbufs prepending the record.
	 */
#ifdef SCTP
	if (pr->pr_flags & PR_ADDR_OPT) {
		/*
		 * For SCTP we may be getting a whole message OR a partial
		 * delivery.
		 */
		if (m && m->m_type == MT_SONAME) {
			orig_resid = 0;
			if (psa)
				*psa = dup_sockaddr(mtod(m, struct sockaddr *));
			if (flags & MSG_PEEK)
				m = m->m_next;
			else
				m = sbunlinkmbuf(&so->so_rcv.sb, m,
						 &free_chain);
		}
	}
#endif /* SCTP */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;	/* XXX race */
		} else {
			if (controlp) {
				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				m = n;
			} else {
				m = sbunlinkmbuf(&so->so_rcv.sb, m,
						 &free_chain);
			}
		}
		if (controlp && *controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * flag OOB data.
	 */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else {
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("receive 3"));
		}
		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
#ifdef SCTP
			if (m->m_flags & M_NOTIFICATION)
				flags |= MSG_NOTIFICATION;
#endif /* SCTP */
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb,
							 m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb,
							 m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, MB_WAIT);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep the signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		       resid > 0 && !sosendallatonce(so) &&
		       so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
				so_pru_rcvd(so, flags);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * If an atomic read was requested but unread data still remains
	 * in the record, set MSG_TRUNC.
	 */
	if (m && pr->pr_flags & PR_ATOMIC)
		flags |= MSG_TRUNC;

	/*
	 * Cleanup.  If an atomic read was requested, drop any unread data.
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (m && (pr->pr_flags & PR_ATOMIC))
			sbdroprecord(&so->so_rcv.sb);
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
			so_pru_rcvd(so, flags);
	}

	if (orig_resid == resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}
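
/*
 * Illustrative sketch (hypothetical, not compiled): the MSG_WAITALL
 * handling above is what gives userland the "fill the whole buffer or
 * return short only on error/EOF/signal" semantic on stream sockets.
 * Hedged userland sketch with invented names:
 */
#if 0
static int
example_read_header(int s, void *hdr, size_t hdrlen)
{
	ssize_t n;

	/*
	 * soreceive() loops internally for MSG_WAITALL, so a short count
	 * here means error, EOF, or a signal/timeout interrupted the wait.
	 */
	n = recv(s, hdr, hdrlen, MSG_WAITALL);
	return (n == (ssize_t)hdrlen ? 0 : -1);
}
#endif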

int
sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
						(int)szmin(resid, m->m_len),
						uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) &&
	     resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out of the
	 * critical section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
		    ("receive 3"));

		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb,
							 m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb,
							 m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, MB_WAIT);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep the signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		       resid > 0 && !sosendallatonce(so) &&
		       so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (so->so_pcb)
				so_pru_rcvd_async(so);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * Cleanup.  If an atomic read was requested, drop any unread data.
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (so->so_pcb)
			so_pru_rcvd_async(so);
	}

	if (orig_resid == resid && orig_resid &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}

/*
 * Shut a socket down.  Note that we do not get a frontend lock as we
 * want to be able to shut the socket down even if another thread is
 * blocked in a read(), thus waking it up.
 */
int
soshutdown(struct socket *so, int how)
{
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR) {
		/*ssb_lock(&so->so_rcv, M_WAITOK);*/
		sorflush(so);
		/*ssb_unlock(&so->so_rcv);*/
	}
	if (how != SHUT_RD)
		return (so_pru_shutdown(so));
	return (0);
}

void
sorflush(struct socket *so)
{
	struct signalsockbuf *ssb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct signalsockbuf asb;

	atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);

	lwkt_gettoken(&ssb->ssb_token);
	socantrcvmore(so);
	asb = *ssb;

	/*
	 * Can't just blow up the ssb structure here
	 */
	bzero(&ssb->sb, sizeof(ssb->sb));
	ssb->ssb_timeo = 0;
	ssb->ssb_lowat = 0;
	ssb->ssb_hiwat = 0;
	ssb->ssb_mbmax = 0;
	atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.ssb_mb);
	ssb_release(&asb, so);

	lwkt_reltoken(&ssb->ssb_token);
}
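
/*
 * Illustrative sketch (hypothetical, not compiled):
 * do_setopt_accept_filter() below services the SO_ACCEPTFILTER socket
 * option.  Hedged userland sketch of installing an accept filter on a
 * listening socket; "dataready" is the accf_data filter's registered
 * name, and the function name is invented:
 */
#if 0
static int
example_set_accept_filter(int s)
{
	struct accept_filter_arg afa;

	bzero(&afa, sizeof(afa));
	strcpy(afa.af_name, "dataready");	/* filter module to attach */
	return (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER,
	    &afa, sizeof(afa)));
}
#endif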

#ifdef INET
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non-listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				kfree(af->so_accept_filter_str, M_ACCF);
			}
			kfree(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			af->so_accept_filter_str = kmalloc(len, M_ACCF,
							   M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			kfree(af->so_accept_filter_str, M_ACCF);
			kfree(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		kfree(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	return soopt_to_kbuf(sopt, buf, len, minlen);
}

int
soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}
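
/*
 * Illustrative sketch (hypothetical, not compiled): how a protocol-level
 * ctloutput routine typically consumes the copyin helper above, shown
 * with an invented integer option and function name:
 */
#if 0
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int optval, error;

	/* require at least, and copy at most, sizeof(int) */
	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	if (optval < 0)
		return (EINVAL);
	/* ... apply optval to the protocol control block here ... */
	return (0);
}
#endif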
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	return soopt_to_kbuf(sopt, buf, len, minlen);
}

int
soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t valsize;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}
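/*
 * Example of the soopt_to_kbuf() length rules, as a protocol ctloutput
 * handler might use them (hypothetical helper, for illustration only):
 * with len == minlen == sizeof(int), a 2-byte option value fails with
 * EINVAL, while an 8-byte value is silently clipped to 4 bytes and
 * sopt_valsize is rewritten to reflect what was actually taken.
 */
#if 0
static int
example_fetch_int_option(struct sockopt *sopt, int *v)
{
	/* exactly an int is required: reject short, clip long */
	return (sooptcopyin(sopt, v, sizeof(*v), sizeof(*v)));
}
#endif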
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
	struct signalsockbuf *sotmp;

	error = 0;
	sopt->sopt_dir = SOPT_SET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif /* INET */
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so,
				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
					&so->so_snd : &so->so_rcv;
				atomic_clear_int(&sotmp->ssb_flags,
						 SSB_AUTOSIZE);
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.ssb_lowat =
				    (optval > so->so_snd.ssb_hiwat) ?
				    so->so_snd.ssb_hiwat : optval;
				atomic_clear_int(&so->so_snd.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.ssb_lowat =
				    (optval > so->so_rcv.ssb_hiwat) ?
				    so->so_rcv.ssb_hiwat : optval;
				atomic_clear_int(&so->so_rcv.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.ssb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.ssb_timeo = val;
				break;
			}
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) so_pr_ctloutput(so, sopt);
		}
	}
bad:
	return (error);
}
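/*
 * Worked example of the SO_SNDTIMEO/SO_RCVTIMEO conversion above,
 * assuming hz = 100 (so ustick = 1000000 / hz = 10000 microseconds
 * per tick):
 *
 *	tv  = { .tv_sec = 2, .tv_usec = 500000 }
 *	val = 2 * 100 + 500000 / 10000 = 250 ticks
 *
 * A non-zero timeout that would round down to 0 ticks is bumped to 1
 * tick so that it does not silently turn into "wait forever".
 */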
/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	soopt_from_kbuf(sopt, buf, len);
	return 0;
}

void
soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
{
	size_t valsize;

	if (len == 0) {
		sopt->sopt_valsize = 0;
		return;
	}

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = szmin(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		bcopy(buf, sopt->sopt_val, valsize);
	}
}
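/*
 * Example of the "always return a value, possibly truncated" rule in
 * soopt_from_kbuf(): if the kernel has an 8-byte answer (len == 8) but
 * the caller supplied sopt_valsize == 4, only szmin(8, 4) == 4 bytes
 * are copied out and sopt_valsize is rewritten to 4, so getsockopt(2)
 * reports the truncated length rather than the length that was
 * actually available.
 */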
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	long optval_l;
	struct linger l;
	struct timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif

	error = 0;
	sopt->sopt_dir = SOPT_GET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			afap = kmalloc(sizeof(*afap), M_TEMP,
				       M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name,
				    so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg,
					    so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			kfree(afap, M_TEMP);
			break;
#endif /* INET */

		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.ssb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.ssb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.ssb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.ssb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * ustick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_SNDSPACE:
			optval_l = ssb_space(&so->so_snd);
			error = sooptcopyout(sopt, &optval_l,
					     sizeof(optval_l));
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize, msize;

	m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA,
		   0, &msize);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = min(msize, sopt_size);
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT,
			   MT_DATA, 0, &msize);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		m->m_len = min(msize, sopt_size);
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	soopt_to_mbuf(sopt, m);
	return 0;
}

void
soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t valsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return;
	val = sopt->sopt_val;
	valsize = sopt->sopt_valsize;
	while (m != NULL && valsize >= m->m_len) {
		bcopy(val, mtod(m, char *), m->m_len);
		valsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		m = m->m_next;
	}
	/* the chain should have been sized large enough at ip6_sooptmcopyin() */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	return soopt_from_mbuf(sopt, m);
}

int
soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;
	size_t maxsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return 0;
	val = sopt->sopt_val;
	maxsize = sopt->sopt_valsize;
	while (m != NULL && maxsize >= m->m_len) {
		bcopy(mtod(m, char *), val, m->m_len);
		maxsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* the user should have supplied a large enough buffer */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}
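/*
 * Illustrative sketch (not compiled) of how the mbuf-based sockopt
 * helpers above chain together, in the style of the ip6_ctloutput()
 * callers they were written for.  The wrapper name is hypothetical,
 * for illustration only.
 */
#if 0
static int
example_opt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	/* size an mbuf chain from sopt->sopt_valsize ... */
	error = soopt_getm(sopt, mp);
	if (error)
		return (error);
	/* ... then copy the option data into it */
	return (soopt_mcopyin(sopt, *mp));
}
#endif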
void
sohasoutofband(struct socket *so)
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
}

int
sokqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	struct signalsockbuf *ssb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		ssb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		ssb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		ssb = &so->so_rcv;
		break;
	default:
		return (EOPNOTSUPP);
	}

	knote_insert(&ssb->ssb_kq.ki_note, kn);
	atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	if (kn->kn_sfflags & NOTE_OOB) {
		if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) {
			kn->kn_fflags |= NOTE_OOB;
			return (1);
		}
		return (0);
	}
	kn->kn_data = so->so_rcv.ssb_cc;

	if (so->so_state & SS_CANTRCVMORE) {
		/*
		 * Only set NODATA if all data has been exhausted.
		 */
		if (kn->kn_data == 0)
			kn->kn_flags |= EV_NODATA;
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
		!TAILQ_EMPTY(&so->so_comp));
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
		atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = ssb_space(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.ssb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}
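/*
 * Illustrative sketch (not compiled): the userland view of the socket
 * kevent filters above.  EVFILT_READ on a listen socket selects
 * solisten_filtops (kn_data = completed-connection backlog), while on a
 * connected socket it selects soread_filtops (kn_data = bytes buffered);
 * NOTE_LOWAT overrides the ssb_lowat watermark.  The helper name is
 * hypothetical, for illustration only.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
example_wait_readable(int kq, int fd, int lowat)
{
	struct kevent kev;

	/* fire only once at least 'lowat' bytes are buffered */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	return (kevent(kq, &kev, 1, &kev, 1, NULL));
}
#endif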