/*
 * Copyright (c) 2004 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 */

#include "opt_inet.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <vm/vm_zone.h>
#include <vm/pmap.h>
#include <net/netmsg2.h>
#include <net/netisr2.h>

#include <sys/thread2.h>
#include <sys/socketvar2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

#ifdef INET
extern int tcp_sosend_agglim;
extern int tcp_sosend_async;
extern int udp_sosend_async;
extern int udp_sosend_prepend;

static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif /* INET */

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static void	sodiscard(struct socket *so);
static int	soclose_sync(struct socket *so, int fflag);
static void	soclose_fast(struct socket *so);

static struct filterops solisten_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sowdetach, filt_sowrite };
static struct filterops soexcept_filtops =
	{ FILTEROP_ISFD|FILTEROP_MPSAFE, NULL, filt_sordetach, filt_soread };

MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");

static int use_soclose_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soclose_fast, CTLFLAG_RW,
    &use_soclose_fast, 0, "Fast socket close");

int use_soaccept_pred_fast = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soaccept_pred_fast, CTLFLAG_RW,
    &use_soaccept_pred_fast, 0, "Fast socket accept prediction");

int use_sendfile_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sendfile_async, CTLFLAG_RW,
    &use_sendfile_async, 0, "sendfile uses asynchronous pru_send");

int use_soconnect_async = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soconnect_async, CTLFLAG_RW,
    &use_soconnect_async, 0, "soconnect uses asynchronous pru_connect");

int use_rand_initport = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, rand_initport, CTLFLAG_RW,
    &use_rand_initport, 0, "socket uses random initial msgport");
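
/*
 * Sketch (not part of this file): the knobs above are run-time sysctls
 * under kern.ipc, tunable from userland, e.g.:
 *
 *	sysctl kern.ipc.somaxconn=1024
 *	sysctl kern.ipc.soclose_fast=0
 */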

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, struct protosw *pr)
{
	struct socket *so;
	unsigned waitmask;

	waitmask = waitok ? M_WAITOK : M_NOWAIT;
	so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask);
	if (so) {
		/* XXX race condition for reentrant kernel */
		so->so_proto = pr;
		TAILQ_INIT(&so->so_aiojobq);
		TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist);
		TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist);
		lwkt_token_init(&so->so_rcv.ssb_token, "rcvtok");
		lwkt_token_init(&so->so_snd.ssb_token, "sndtok");
		spin_init(&so->so_rcvd_spin);
		netmsg_init(&so->so_rcvd_msg.base, so, &netisr_adone_rport,
			    MSGF_DROPABLE | MSGF_PRIORITY,
			    so->so_proto->pr_usrreqs->pru_rcvd);
		so->so_rcvd_msg.nm_pru_flags |= PRUR_ASYNC;
		so->so_state = SS_NOFDREF;
		so->so_refs = 1;
	}
	return so;
}

int
socreate(int dom, struct socket **aso, int type,
	 int proto, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct protosw *prp;
	struct socket *so;
	struct pru_attach_info ai;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);

	if (p->p_ucred->cr_prison && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_INET6 &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != NULL, prp);
	if (so == NULL)
		return (ENOBUFS);

	/*
	 * Callers of socreate() presumably will connect up a descriptor
	 * and call soclose() if they cannot.  This represents our so_refs
	 * (which should be 1) from soalloc().
	 */
	soclrstate(so, SS_NOFDREF);

	/*
	 * Set a default port for protocol processing.  No action will occur
	 * on the socket on this port until an inpcb is attached to it and
	 * is able to match incoming packets, or until the socket becomes
	 * available to userland.
	 *
	 * We normally default the socket to the protocol thread on cpu 0.
	 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol
	 * thread and all pr_*()/pru_*() calls are executed synchronously.
	 */
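	/*
	 * For example (sketch): with use_rand_initport enabled and four
	 * netisr cpus, a PR_RAND_INITPORT protocol initially lands on
	 * netisr_cpuport(mycpuid & ncpus2_mask) instead of funneling every
	 * new socket through the cpu 0 protocol thread.
	 */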
	if (prp->pr_flags & PR_SYNC_PORT) {
		so->so_port = &netisr_sync_port;
	} else if (prp->pr_flags & PR_RAND_INITPORT) {
		if (use_rand_initport)
			so->so_port = netisr_cpuport(mycpuid & ncpus2_mask);
		else
			so->so_port = netisr_cpuport(0);
	} else {
		so->so_port = netisr_cpuport(0);
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(p->p_ucred);
	ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE];
	ai.p_ucred = p->p_ucred;
	ai.fd_rdir = p->p_fd->fd_rdir;

	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	error = so_pru_attach(so, proto, &ai);
	if (error) {
		sosetstate(so, SS_NOFDREF);
		sofree(so);	/* from soalloc */
		return error;
	}

	/*
	 * NOTE: Returns referenced socket.
	 */
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	error = so_pru_bind(so, nam, td);
	return (error);
}

static void
sodealloc(struct socket *so)
{
	if (so->so_rcv.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.ssb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if present */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif /* INET */
	crfree(so->so_cred);
	if (so->so_faddr != NULL)
		kfree(so->so_faddr, M_SONAME);
	kfree(so, M_SOCKET);
}

int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;
#ifdef SCTP
	short oldopt, oldqlimit;
#endif /* SCTP */

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))
		return (EINVAL);

#ifdef SCTP
	oldopt = so->so_options;
	oldqlimit = so->so_qlimit;
#endif /* SCTP */

	lwkt_gettoken(&so->so_rcv.ssb_token);
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	/*
	 * SCTP needs to tweak both the inbound backlog parameter AND
	 * the so_options (in the UDP model it both connects and accepts
	 * inbound connections, implicitly).
	 */
	error = so_pru_listen(so, td);
	if (error) {
#ifdef SCTP
		/* Restore the params */
		so->so_options = oldopt;
		so->so_qlimit = oldqlimit;
#endif /* SCTP */
		return (error);
	}
	return (0);
}
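
/*
 * Sketch (assumed userland mapping): listen(2) reaches solisten() via the
 * protocol switch, so listen(s, -1) or an oversized backlog both end up
 * with so_qlimit clamped to the kern.ipc.somaxconn value above.
 */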

/*
 * Destroy a disconnected socket.  This routine is a NOP if entities
 * still have a reference on the socket:
 *
 *	so_pcb -	The protocol stack still has a reference
 *	SS_NOFDREF -	There is no longer a file pointer reference
 */
void
sofree(struct socket *so)
{
	struct socket *head;

	/*
	 * This is a bit hackish at the moment.  We need to interlock
	 * any accept queue we are on before we potentially lose the
	 * last reference to avoid races against a re-reference from
	 * someone operating on the queue.
	 */
	while ((head = so->so_head) != NULL) {
		lwkt_getpooltoken(head);
		if (so->so_head == head)
			break;
		lwkt_relpooltoken(head);
	}

	/*
	 * Arbitrate the last free.
	 */
	KKASSERT(so->so_refs > 0);
	if (atomic_fetchadd_int(&so->so_refs, -1) != 1) {
		if (head)
			lwkt_relpooltoken(head);
		return;
	}

	KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF));
	KKASSERT((so->so_state & SS_ASSERTINPROG) == 0);

	/*
	 * We're done, remove ourselves from the accept queue we are
	 * on, if we are on one.
	 */
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			lwkt_relpooltoken(head);
			return;
		} else {
			panic("sofree: not queued");
		}
		soclrstate(so, SS_INCOMP);
		so->so_head = NULL;
		lwkt_relpooltoken(head);
	}
	ssb_release(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so, int fflag)
{
	int error;

	funsetown(&so->so_sigio);
	if (!use_soclose_fast ||
	    (so->so_proto->pr_flags & PR_SYNC_PORT) ||
	    ((so->so_state & SS_ISCONNECTED) &&
	     (so->so_options & SO_LINGER))) {
		error = soclose_sync(so, fflag);
	} else {
		soclose_fast(so);
		error = 0;
	}
	return error;
}

static void
sodiscard(struct socket *so)
{
	lwkt_getpooltoken(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;

		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			soclrstate(sp, SS_INCOMP);
			sp->so_head = NULL;
			so->so_incqlen--;
			soaborta(sp);
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			soclrstate(sp, SS_COMP);
			sp->so_head = NULL;
			so->so_qlen--;
			soaborta(sp);
		}
	}
	lwkt_relpooltoken(so);

	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	sosetstate(so, SS_NOFDREF);	/* take ref */
}
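
/*
 * Hand off every connection queued on the listen socket 'so', completed
 * or not, to the listen socket 'so_inh', re-parenting each queued socket
 * and switching it to the inheritor's credentials.
 */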
void
soinherit(struct socket *so, struct socket *so_inh)
{
	TAILQ_HEAD(, socket) comp, incomp;
	struct socket *sp;
	int qlen, incqlen;

	KASSERT(so->so_options & SO_ACCEPTCONN,
	    ("so does not accept connection"));
	KASSERT(so_inh->so_options & SO_ACCEPTCONN,
	    ("so_inh does not accept connection"));

	TAILQ_INIT(&comp);
	TAILQ_INIT(&incomp);

	lwkt_getpooltoken(so);
	lwkt_getpooltoken(so_inh);

	/*
	 * Save the completed queue and the incomplete queue.
	 */
	TAILQ_CONCAT(&comp, &so->so_comp, so_list);
	qlen = so->so_qlen;
	so->so_qlen = 0;

	TAILQ_CONCAT(&incomp, &so->so_incomp, so_list);
	incqlen = so->so_incqlen;
	so->so_incqlen = 0;

	/*
	 * Append the saved completed queue and incomplete queue
	 * to the socket that inherits them.
	 *
	 * XXX
	 * This may temporarily break the inheriting socket's
	 * so_qlimit.
	 */
	TAILQ_FOREACH(sp, &comp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_FOREACH(sp, &incomp, so_list) {
		sp->so_head = so_inh;
		crfree(sp->so_cred);
		sp->so_cred = crhold(so_inh->so_cred);
	}

	TAILQ_CONCAT(&so_inh->so_comp, &comp, so_list);
	so_inh->so_qlen += qlen;

	TAILQ_CONCAT(&so_inh->so_incomp, &incomp, so_list);
	so_inh->so_incqlen += incqlen;

	lwkt_relpooltoken(so_inh);
	lwkt_relpooltoken(so);

	if (qlen) {
		/*
		 * "New" connections have arrived
		 */
		sorwakeup(so_inh);
		wakeup(&so_inh->so_timeo);
	}
}

static int
soclose_sync(struct socket *so, int fflag)
{
	int error = 0;

	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (fflag & FNONBLOCK))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo, PCATCH,
					       "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2;

		error2 = so_pru_detach(so);
		if (error == 0)
			error = error2;
	}
discard:
	sodiscard(so);
	so_pru_sync(so);	/* unpend async sending */
	sofree(so);		/* dispose of ref */

	return (error);
}

static void
soclose_sofree_async_handler(netmsg_t msg)
{
	sofree(msg->base.nm_so);
}

static void
soclose_sofree_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_sofree_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_disconn_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0)
		so_pru_disconnect_direct(so);

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_disconn_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_disconn_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_detach_async_handler(netmsg_t msg)
{
	struct socket *so = msg->base.nm_so;

	if (so->so_pcb)
		so_pru_detach_direct(so);

	sodiscard(so);
	sofree(so);
}

static void
soclose_detach_async(struct socket *so)
{
	struct netmsg_base *base = &so->so_clomsg;

	netmsg_init(base, so, &netisr_apanic_rport, 0,
	    soclose_detach_async_handler);
	lwkt_sendmsg(so->so_port, &base->lmsg);
}

static void
soclose_fast(struct socket *so)
{
	if (so->so_pcb == NULL)
		goto discard;

	if ((so->so_state & SS_ISCONNECTED) &&
	    (so->so_state & SS_ISDISCONNECTING) == 0) {
		soclose_disconn_async(so);
		return;
	}

	if (so->so_pcb) {
		soclose_detach_async(so);
		return;
	}

discard:
	sodiscard(so);
	soclose_sofree_async(so);
}

/*
 * Abort and destroy a socket.  Only one abort can be in progress
 * at any given moment.
 */
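/*
 * Each variant below takes a reference on the socket before dispatching
 * the abort so the socket cannot be ripped out from under the protocol's
 * abort processing; the abort path is expected to consume that reference.
 */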
void
soabort(struct socket *so)
{
	soreference(so);
	so_pru_abort(so);
}

void
soaborta(struct socket *so)
{
	soreference(so);
	so_pru_aborta(so);
}

void
soabort_oncpu(struct socket *so)
{
	soreference(so);
	so_pru_abort_oncpu(so);
}

/*
 * so is passed in ref'd, which becomes owned by
 * the cleared SS_NOFDREF flag.
 */
void
soaccept_generic(struct socket *so)
{
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	soclrstate(so, SS_NOFDREF);	/* owned by lack of SS_NOFDREF */
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	soaccept_generic(so);
	error = so_pru_accept(so, nam);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td,
	  boolean_t sync)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	     (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection
		 * from biting us.
		 */
		so->so_error = 0;
		if (!sync && so->so_proto->pr_usrreqs->pru_preconnect)
			error = so_pru_connect_async(so, nam, td);
		else
			error = so_pru_connect(so, nam, td);
	}
	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	error = so_pru_connect2(so1, so2);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = so_pru_disconnect(so);
bad:
	return (error);
}
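
/*
 * SBLOCKWAIT() selects the sockbuf lock mode below: MSG_DONTWAIT callers
 * get a non-sleeping lock attempt (M_NOWAIT), everyone else may block.
 */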
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
       struct mbuf *top, struct mbuf *control, int flags,
       struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int clen = 0, error, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
	int pru_flags;

	if (uio) {
		resid = uio->uio_resid;
	} else {
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 *	     can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					gotoerr(ENOTCONN);
			} else if (addr == NULL)
				gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		if ((atomic && resid > so->so_snd.ssb_hiwat) ||
		    clen > so->so_snd.ssb_hiwat) {
			gotoerr(EMSGSIZE);
		}
		space = ssb_space(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid + clen) && uio &&
		    (atomic || space < so->so_snd.ssb_lowat || space < clen)) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				m = m_getl((int)resid, MB_WAIT, MT_DATA,
					   top == NULL ? M_PKTHDR : 0, &mlen);
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				if (resid < MINCLSIZE) {
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == NULL && len < mlen)
						MH_ALIGN(m, len);
				}
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
			} else if ((flags & MSG_EOF) &&
				   (so->so_proto->pr_flags & PR_IMPLOPCL) &&
				   (resid == 0)) {
				/*
				 * If the user set MSG_EOF, the protocol
				 * understands this flag, and nothing is left
				 * to send, then use PRU_SEND_EOF instead of
				 * PRU_SEND.
				 */
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
			} else {
				pru_flags = 0;
			}
			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = so_pru_send(so, pru_flags, top, addr, control, td);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

#ifdef INET
/*
 * A specialization of sosend() for UDP based on protocol-specific
 * knowledge:
 *   so->so_proto->pr_flags has the PR_ATOMIC field set.  This means that
 *	sosendallatonce() returns true,
 *	the "atomic" variable is true,
 *	and sosendudp() blocks until space is available for the entire send.
 *   so->so_proto->pr_flags does not have the PR_CONNREQUIRED or
 *	PR_IMPLOPCL flags set.
 *   UDP has no out-of-band data.
 *   UDP has no control data.
 *   UDP does not support MSG_EOR.
 */
int
sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio,
	  struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	size_t resid;
	int error, pru_flags = 0;
	int space;

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;
	if (control)
		m_freem(control);

	KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp"));
	resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len;

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	if (so->so_state & SS_CANTSENDMORE)
		gotoerr(EPIPE);
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		goto release;
	}
	if (!(so->so_state & SS_ISCONNECTED) && addr == NULL)
		gotoerr(EDESTADDRREQ);
	if (resid > so->so_snd.ssb_hiwat)
		gotoerr(EMSGSIZE);
	space = ssb_space(&so->so_snd);
	if (uio && (space < 0 || (size_t)space < resid)) {
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
			gotoerr(EWOULDBLOCK);
		ssb_unlock(&so->so_snd);
		error = ssb_wait(&so->so_snd);
		if (error)
			goto out;
		goto restart;
	}

	if (uio) {
		int hdrlen = max_hdr;

		/*
		 * We try to optimize out the additional mbuf
		 * allocations in M_PREPEND() on output path, e.g.
		 * - udp_output(), when it tries to prepend protocol
		 *   headers.
		 * - Link layer output function, when it tries to
		 *   prepend link layer header.
		 *
		 * This probably will not benefit any data that will
		 * be fragmented, so this optimization is only performed
		 * when the size of data and max size of protocol+link
		 * headers fit into one mbuf cluster.
		 */
		if (uio->uio_resid > MCLBYTES - hdrlen ||
		    !udp_sosend_prepend) {
			top = m_uiomove(uio);
			if (top == NULL)
				goto release;
		} else {
			int nsize;

			top = m_getl(uio->uio_resid + hdrlen, MB_WAIT,
			    MT_DATA, M_PKTHDR, &nsize);
			KASSERT(nsize >= uio->uio_resid + hdrlen,
			    ("sosendudp invalid nsize %d, "
			     "resid %zu, hdrlen %d",
			     nsize, uio->uio_resid, hdrlen));

			top->m_len = uio->uio_resid;
			top->m_pkthdr.len = uio->uio_resid;
			top->m_data += hdrlen;

			error = uiomove(mtod(top, caddr_t), top->m_len, uio);
			if (error)
				goto out;
		}
	}

	if (flags & MSG_DONTROUTE)
		pru_flags |= PRUS_DONTROUTE;

	if (udp_sosend_async && (flags & MSG_SYNC) == 0) {
		so_pru_send_async(so, pru_flags, top, addr, NULL, td);
		error = 0;
	} else {
		error = so_pru_send(so, pru_flags, top, addr, NULL, td);
	}
	top = NULL;	/* sent or freed in lower layer */

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	return (error);
}
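
/*
 * A specialization of sosend() for TCP, mirroring sosendudp() above: no
 * addresses, no control mbufs and no MSG_EOR, and bulk data may be handed
 * to the protocol thread asynchronously (tcp_sosend_async), with mbuf
 * aggregation per send bounded by tcp_sosend_agglim.
 */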
int
sosendtcp(struct socket *so, struct sockaddr *addr, struct uio *uio,
	  struct mbuf *top, struct mbuf *control, int flags,
	  struct thread *td)
{
	struct mbuf **mp;
	struct mbuf *m;
	size_t resid;
	int space, len;
	int error, mlen;
	int allatonce;
	int pru_flags;

	if (uio) {
		KKASSERT(top == NULL);
		allatonce = 0;
		resid = uio->uio_resid;
	} else {
		allatonce = 1;
		resid = (size_t)top->m_pkthdr.len;
#ifdef INVARIANTS
		len = 0;
		for (m = top; m; m = m->m_next)
			len += m->m_len;
		KKASSERT(top->m_pkthdr.len == len);
#endif
	}

	/*
	 * WARNING!  resid is unsigned, space and len are signed.  space
	 *	     can wind up negative if the sockbuf is overcommitted.
	 *
	 * Also check to make sure that MSG_EOR isn't used on TCP.
	 */
	if (flags & MSG_EOR) {
		error = EINVAL;
		goto out;
	}

	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
		control = NULL;
	}

	if (td->td_lwp != NULL)
		td->td_lwp->lwp_ru.ru_msgsnd++;

#define	gotoerr(errcode)	{ error = errcode; goto release; }

restart:
	error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

	do {
		if (so->so_state & SS_CANTSENDMORE)
			gotoerr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 &&
		    (so->so_state & SS_ISCONFIRMING) == 0)
			gotoerr(ENOTCONN);
		if (allatonce && resid > so->so_snd.ssb_hiwat)
			gotoerr(EMSGSIZE);

		space = ssb_space_prealloc(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((space < 0 || (size_t)space < resid) && !allatonce &&
		    space < so->so_snd.ssb_lowat) {
			if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT))
				gotoerr(EWOULDBLOCK);
			ssb_unlock(&so->so_snd);
			error = ssb_wait(&so->so_snd);
			if (error)
				goto out;
			goto restart;
		}
		mp = &top;
		do {
			int cnt = 0, async = 0;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
			} else do {
				if (resid > INT_MAX)
					resid = INT_MAX;
				m = m_getl((int)resid, MB_WAIT, MT_DATA,
					   top == NULL ? M_PKTHDR : 0, &mlen);
				if (top == NULL) {
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = NULL;
				}
				len = imin((int)szmin(mlen, resid), space);
				space -= len;
				error = uiomove(mtod(m, caddr_t), (size_t)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid == 0)
					break;
				++cnt;
			} while (space > 0 && cnt < tcp_sosend_agglim);

			if (tcp_sosend_async)
				async = 1;

			if (flags & MSG_OOB) {
				pru_flags = PRUS_OOB;
				async = 0;
			} else if ((flags & MSG_EOF) && resid == 0) {
				pru_flags = PRUS_EOF;
			} else if (resid > 0 && space > 0) {
				/* If there is more to send, set PRUS_MORETOCOME */
				pru_flags = PRUS_MORETOCOME;
				async = 1;
			} else {
				pru_flags = 0;
			}

			if (flags & MSG_SYNC)
				async = 0;

			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
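			/*
			 * Pre-register the new mbufs with the send buffer,
			 * pairing with the ssb_space_prealloc() accounting
			 * above.
			 */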
			for (m = top; m; m = m->m_next)
				ssb_preallocstream(&so->so_snd, m);
			if (!async) {
				error = so_pru_send(so, pru_flags, top,
				    NULL, NULL, td);
			} else {
				so_pru_send_async(so, pru_flags, top,
				    NULL, NULL, td);
				error = 0;
			}

			top = NULL;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	ssb_unlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
#endif

/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the signalsockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 *
 * Although the signalsockbuf is locked, new data may still be appended.
 * A token inside the ssb_lock deals with MP issues and still allows
 * the network to access the socket if we block in a uio.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
						(int)szmin(resid, m->m_len),
						uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if ((so->so_state & SS_ISCONFIRMING) && resid)
		so_pru_rcvd(so, 0);

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.ssb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out critical
	 * section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Skip any address mbufs prepending the record.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *));
		if (flags & MSG_PEEK)
			m = m->m_next;
		else
			m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
	}

	/*
	 * Skip any control mbufs prepending the record.
	 */
#ifdef SCTP
	if (pr->pr_flags & PR_ADDR_OPT) {
		/*
		 * For SCTP we may be getting a
		 * whole message OR a partial delivery.
		 */
		if (m && m->m_type == MT_SONAME) {
			orig_resid = 0;
			if (psa)
				*psa = dup_sockaddr(mtod(m, struct sockaddr *));
			if (flags & MSG_PEEK)
				m = m->m_next;
			else
				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
		}
	}
#endif /* SCTP */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;	/* XXX race */
		} else {
			if (controlp) {
				n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				m = n;
			} else {
				m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
			}
		}
		if (controlp && *controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * flag OOB data.
	 */
	if (m) {
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		} else {
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
				("receive 3"));
		}
		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
#ifdef SCTP
			if (m->m_flags & M_NOTIFICATION)
				flags |= MSG_NOTIFICATION;
#endif /* SCTP */
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, MB_WAIT);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		       resid > 0 && !sosendallatonce(so) &&
		       so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
				so_pru_rcvd(so, flags);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * If an atomic read was requested but unread data still remains
	 * in the record, set MSG_TRUNC.
	 */
	if (m && pr->pr_flags & PR_ATOMIC)
		flags |= MSG_TRUNC;

	/*
	 * Cleanup.  If an atomic read was requested drop any unread data.
	 */
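	/*
	 * PR_WANTRCVD protocols (e.g. TCP) are also told how much was
	 * consumed so they can update flow control, such as window updates.
	 */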
	if ((flags & MSG_PEEK) == 0) {
		if (m && (pr->pr_flags & PR_ATOMIC))
			sbdroprecord(&so->so_rcv.sb);
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
			so_pru_rcvd(so, flags);
	}

	if (orig_resid == resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}
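
/*
 * A streamlined version of soreceive() for connected TCP sockets: there
 * are no address or control mbufs to skip and no PR_ATOMIC records, and
 * window updates are pushed to the protocol thread asynchronously via
 * so_pru_rcvd_async().
 */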
int
sorecvtcp(struct socket *so, struct sockaddr **psa, struct uio *uio,
	  struct sockbuf *sio, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *n;
	struct mbuf *free_chain = NULL;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	int moff;
	size_t resid, orig_resid;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = (size_t)(sio->sb_climit - sio->sb_cc);
	orig_resid = resid;

	if (psa)
		*psa = NULL;
	if (controlp)
		*controlp = NULL;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(MB_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = so_pru_rcvoob(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		if (sio) {
			do {
				sbappend(sio, m);
				KKASSERT(resid >= (size_t)m->m_len);
				resid -= (size_t)m->m_len;
			} while (resid > 0 && m);
		} else {
			do {
				uio->uio_resid = resid;
				error = uiomove(mtod(m, caddr_t),
						(int)szmin(resid, m->m_len),
						uio);
				resid = uio->uio_resid;
				m = m_free(m);
			} while (uio->uio_resid && error == 0 && m);
		}
bad:
		if (m)
			m_freem(m);
		return (error);
	}

	/*
	 * The token interlocks against the protocol thread while
	 * ssb_lock is a blocking lock against other userland entities.
	 */
	lwkt_gettoken(&so->so_rcv.ssb_token);
restart:
	error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto done;

	m = so->so_rcv.ssb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (size_t)so->so_rcv.ssb_cc < resid) &&
	    (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat ||
	    ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)))) {
		KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (pr->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (resid == 0)
			goto release;
		if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		ssb_unlock(&so->so_rcv);
		error = ssb_wait(&so->so_rcv);
		if (error)
			goto done;
		goto restart;
	}
dontblock:
	if (uio && uio->uio_td && uio->uio_td->td_proc)
		uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++;

	/*
	 * note: m should be == sb_mb here.  Cache the next record while
	 * cleaning up.  Note that calling m_free*() will break out critical
	 * section.
	 */
	KKASSERT(m == so->so_rcv.ssb_mb);

	/*
	 * Copy to the UIO or mbuf return chain (*mp).
	 */
	moff = 0;
	offset = 0;
	while (m && resid > 0 && error == 0) {
		KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			("receive 3"));

		soclrstate(so, SS_RCVATMARK);
		len = (resid > INT_MAX) ? INT_MAX : resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;

		/*
		 * Copy out to the UIO or pass the mbufs back to the SIO.
		 * The SIO is dealt with when we eat the mbuf, but deal
		 * with the resid here either way.
		 */
		if (uio) {
			uio->uio_resid = resid;
			error = uiomove(mtod(m, caddr_t) + moff, len, uio);
			resid = uio->uio_resid;
			if (error)
				goto release;
		} else {
			resid -= (size_t)len;
		}

		/*
		 * Eat the entire mbuf or just a piece of it
		 */
		if (len == m->m_len - moff) {
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				if (sio) {
					n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL);
					sbappend(sio, m);
					m = n;
				} else {
					m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain);
				}
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (sio) {
					n = m_copym(m, 0, len, MB_WAIT);
					if (n)
						sbappend(sio, n);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.ssb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					sosetstate(so, SS_RCVATMARK);
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until resid == 0 or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep signalsockbuf locked against other readers.
		 */
		while ((flags & MSG_WAITALL) && m == NULL &&
		       resid > 0 && !sosendallatonce(so) &&
		       so->so_rcv.ssb_mb == NULL) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * The window might have closed to zero, make
			 * sure we send an ack now that we've drained
			 * the buffer or we might end up blocking until
			 * the idle takes over (5 seconds).
			 */
			if (so->so_pcb)
				so_pru_rcvd_async(so);
			error = ssb_wait(&so->so_rcv);
			if (error) {
				ssb_unlock(&so->so_rcv);
				error = 0;
				goto done;
			}
			m = so->so_rcv.ssb_mb;
		}
	}

	/*
	 * Cleanup.  If an atomic read was requested drop any unread data.
	 */
	if ((flags & MSG_PEEK) == 0) {
		if (so->so_pcb)
			so_pru_rcvd_async(so);
	}

	if (orig_resid == resid && orig_resid &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		ssb_unlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	ssb_unlock(&so->so_rcv);
done:
	lwkt_reltoken(&so->so_rcv.ssb_token);
	if (free_chain)
		m_freem(free_chain);
	return (error);
}

/*
 * Shut a socket down.  Note that we do not get a frontend lock as we
 * want to be able to shut the socket down even if another thread is
 * blocked in a read(), thus waking it up.
 */
int
soshutdown(struct socket *so, int how)
{
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR) {
		/*ssb_lock(&so->so_rcv, M_WAITOK);*/
		sorflush(so);
		/*ssb_unlock(&so->so_rcv);*/
	}
	if (how != SHUT_RD)
		return (so_pru_shutdown(so));
	return (0);
}

void
sorflush(struct socket *so)
{
	struct signalsockbuf *ssb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct signalsockbuf asb;

	atomic_set_int(&ssb->ssb_flags, SSB_NOINTR);

	lwkt_gettoken(&ssb->ssb_token);
	socantrcvmore(so);
	asb = *ssb;

	/*
	 * Can't just blow up the ssb structure here
	 */
	bzero(&ssb->sb, sizeof(ssb->sb));
	ssb->ssb_timeo = 0;
	ssb->ssb_lowat = 0;
	ssb->ssb_hiwat = 0;
	ssb->ssb_mbmax = 0;
	atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.ssb_mb);
	ssb_release(&asb, so);

	lwkt_reltoken(&ssb->ssb_token);
}
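
/*
 * Sketch (assumed userland usage of the accept filter machinery below,
 * filter name from accf_data(9)):
 *
 *	struct accept_filter_arg afa;
 *
 *	bzero(&afa, sizeof(afa));
 *	strcpy(afa.af_name, "dataready");
 *	setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa));
 */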
#ifdef INET
static int
do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				kfree(af->so_accept_filter_str, M_ACCF);
			}
			kfree(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	afap = kmalloc(sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	af = kmalloc(sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			af->so_accept_filter_str = kmalloc(len, M_ACCF,
							   M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			kfree(af->so_accept_filter_str, M_ACCF);
			kfree(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		kfree(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	return soopt_to_kbuf(sopt, buf, len, minlen);
}
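
/*
 * Example (sketch): for a fixed-size option such as SO_LINGER,
 * sooptcopyin(sopt, &l, sizeof(l), sizeof(l)) fails with EINVAL when the
 * caller supplied fewer than sizeof(l) bytes and silently ignores any
 * excess, per the semantics described in soopt_to_kbuf() below.
 */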

/*
 * Set a socket option.  Options at a level other than SOL_SOCKET are
 * passed through to the protocol via so_pr_ctloutput(); SOL_SOCKET
 * options are handled here, and on success the protocol is also given
 * a chance to observe the new value.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;
	struct signalsockbuf *sotmp;

	error = 0;
	sopt->sopt_dir = SOPT_SET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif /* INET */
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so,
				    &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				sotmp = (sopt->sopt_name == SO_SNDBUF) ?
					&so->so_snd : &so->so_rcv;
				atomic_clear_int(&sotmp->ssb_flags,
						 SSB_AUTOSIZE);
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.ssb_lowat =
				    (optval > so->so_snd.ssb_hiwat) ?
				    so->so_snd.ssb_hiwat : optval;
				atomic_clear_int(&so->so_snd.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			case SO_RCVLOWAT:
				so->so_rcv.ssb_lowat =
				    (optval > so->so_rcv.ssb_hiwat) ?
				    so->so_rcv.ssb_hiwat : optval;
				atomic_clear_int(&so->so_rcv.ssb_flags,
						 SSB_AUTOLOWAT);
				break;
			}
			break;
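
		/*
		 * Timeouts arrive as a struct timeval and are stored as
		 * clock ticks.  For example, with hz = 100 (and thus
		 * ustick = 10000 us/tick), { 2, 500000 } becomes
		 * 2 * 100 + 500000 / 10000 = 250 ticks.  A non-zero
		 * timeout that would round down to zero ticks is bumped
		 * to one tick so it is not mistaken for "no timeout".
		 */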
		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.ssb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.ssb_timeo = val;
				break;
			}
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) so_pr_ctloutput(so, sopt);
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	soopt_from_kbuf(sopt, buf, len);
	return 0;
}

void
soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len)
{
	size_t	valsize;

	if (len == 0) {
		sopt->sopt_valsize = 0;
		return;
	}

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(buf));

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = szmin(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		bcopy(buf, sopt->sopt_val, valsize);
	}
}
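
/*
 * An illustrative consequence of the truncation rule above: a caller
 * doing getsockopt(s, SOL_SOCKET, SO_RCVBUF, &val, &len) with len == 2
 * receives only the first two bytes of the int, len is set to 2, and
 * no error is returned.  Callers must supply a full-sized buffer to
 * get a meaningful value.
 */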

/*
 * Get a socket option.  As with sosetopt(), options at a level other
 * than SOL_SOCKET are forwarded to the protocol's pr_ctloutput()
 * routine.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	long	optval_l;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif

	error = 0;
	sopt->sopt_dir = SOPT_GET;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return (so_pr_ctloutput(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			afap = kmalloc(sizeof(*afap), M_TEMP,
				       M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name,
				    so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg,
					    so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			kfree(afap, M_TEMP);
			break;
#endif /* INET */

		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.ssb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.ssb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.ssb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.ssb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * ustick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_SNDSPACE:
			optval_l = ssb_space(&so->so_snd);
			error = sooptcopyout(sopt, &optval_l,
					     sizeof(optval_l));
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize, msize;

	m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA,
		   0, &msize);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = min(msize, sopt_size);
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT,
			   MT_DATA, 0, &msize);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		m->m_len = min(msize, sopt_size);
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
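
/*
 * The mbuf-based helpers below serve ctloutput paths (e.g. IPv6) that
 * still pass option data around as mbuf chains.  A typical sequence
 * (a sketch only; error handling omitted):
 *
 *	struct mbuf *m;
 *
 *	error = soopt_getm(sopt, &m);	  size a chain to sopt_valsize
 *	error = soopt_mcopyin(sopt, m);	  fill the chain from sopt
 *	...protocol consumes or rewrites the chain...
 *	error = soopt_mcopyout(sopt, m);  copy the chain back to sopt
 */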

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	soopt_to_mbuf(sopt, m);
	return 0;
}

void
soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	size_t valsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return;
	val = sopt->sopt_val;
	valsize = sopt->sopt_valsize;
	while (m != NULL && valsize >= m->m_len) {
		bcopy(val, mtod(m, char *), m->m_len);
		valsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)	/* enough mbufs should have been allocated, see soopt_getm() */
		panic("ip6_sooptmcopyin");
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	return soopt_from_mbuf(sopt, m);
}

int
soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;
	size_t maxsize;
	void *val;

	KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val));
	KKASSERT(kva_p(m));
	if (sopt->sopt_val == NULL)
		return 0;
	val = sopt->sopt_val;
	maxsize = sopt->sopt_valsize;
	while (m != NULL && maxsize >= m->m_len) {
		bcopy(mtod(m, char *), val, m->m_len);
		maxsize -= m->m_len;
		val = (caddr_t)val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* the user-supplied sockopt buffer was too small */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

/*
 * Notify the socket's owner that out-of-band data has arrived: deliver
 * SIGURG via the sigio list and post NOTE_OOB to any knotes attached
 * to the receive side.
 */
void
sohasoutofband(struct socket *so)
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB);
}

/*
 * Attach a knote to the appropriate side of the socket.  Listen
 * sockets get the solisten filter for EVFILT_READ; EVFILT_EXCEPT
 * shares the read filter and is used for NOTE_OOB.
 */
int
sokqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;
	struct signalsockbuf *ssb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		ssb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		ssb = &so->so_snd;
		break;
	case EVFILT_EXCEPT:
		kn->kn_fop = &soexcept_filtops;
		ssb = &so->so_rcv;
		break;
	default:
		return (EOPNOTSUPP);
	}

	knote_insert(&ssb->ssb_kq.ki_note, kn);
	atomic_set_int(&ssb->ssb_flags, SSB_KNOTE);
	return (0);
}
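
/*
 * Userland reaches sokqfilter() via kevent(2).  A minimal sketch
 * (error handling omitted) that watches a socket for readability:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, sock, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * On a listen socket the same EVFILT_READ registration is routed to
 * solisten_filtops and fires when a completed connection is ready to
 * be accept()ed.
 */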

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_rcv.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note))
		atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE);
}

/*
 * kqueue read filter: ready when data (or a completed connection) is
 * available, when urgent data is pending (NOTE_OOB), or at EOF.
 */
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	if (kn->kn_sfflags & NOTE_OOB) {
		if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) {
			kn->kn_fflags |= NOTE_OOB;
			return (1);
		}
		return (0);
	}
	kn->kn_data = so->so_rcv.ssb_cc;

	if (so->so_state & SS_CANTRCVMORE) {
		/*
		 * Only set NODATA if all data has been exhausted.
		 */
		if (kn->kn_data == 0)
			kn->kn_flags |= EV_NODATA;
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return ((kn->kn_data >= so->so_rcv.ssb_lowat) ||
		!TAILQ_EMPTY(&so->so_comp));
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	knote_remove(&so->so_snd.ssb_kq.ki_note, kn);
	if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note))
		atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE);
}

/*
 * kqueue write filter: ready when buffer space is available, or at
 * EOF once the send side has been shut down.
 */
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = ssb_space(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.ssb_lowat);
}

/*
 * kqueue listen filter: ready when a completed connection is waiting
 * on the queue; kn_data is set to the completed-connection count.
 */
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}