1 /* 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1990, 1993 36 * The Regents of the University of California. All rights reserved. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 
*
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $
 * $DragonFly: src/sys/kern/uipc_socket.c,v 1.55 2008/09/02 16:17:52 dillon Exp $
 */

#include "opt_inet.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/socketops.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <vm/vm_zone.h>
#include <vm/pmap.h>

#include <sys/thread2.h>
#include <sys/socketvar2.h>

#include <machine/limits.h>

/*
 * Accept-filter option handler; only declared when INET is configured.
 * sodealloc() calls it with a NULL sockopt to remove an installed filter.
 */
#ifdef INET
static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif /* INET */

/*
 * Event filter handlers for sockets.  They are referenced by the
 * filterops tables below.
 */
static void 	filt_sordetach(struct knote *kn);
static int 	filt_soread(struct knote *kn, long hint);
static void 	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

/*
 * Filter operation tables.  The initializers are positional: a flags
 * word (FILTEROP_ISFD), a NULL slot, then what appear to be the detach
 * and event handlers.
 * NOTE(review): field order is inferred from the handler names; confirm
 * against the struct filterops declaration.
 *
 * solisten and soread share the read-side detach handler.  soexcept
 * uses exactly the same handlers as soread.
 */
static struct filterops solisten_filtops = 
	{ FILTEROP_ISFD, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ FILTEROP_ISFD, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops = 
	{ FILTEROP_ISFD, NULL, filt_sowdetach, filt_sowrite };
static struct filterops soexcept_filtops =
	{ FILTEROP_ISFD, NULL, filt_sordetach, filt_soread };

/* Malloc types used for socket structures, socket names and PCBs. */
MALLOC_DEFINE(M_SOCKET, "socket", "socket struct");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

126 static int somaxconn = SOMAXCONN; 127 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, 128 &somaxconn, 0, "Maximum pending socket connection queue size"); 129 130 /* 131 * Socket operation routines. 132 * These routines are called by the routines in 133 * sys_socket.c or from a system process, and 134 * implement the semantics of socket operations by 135 * switching out to the protocol specific routines. 136 */ 137 138 /* 139 * Get a socket structure, and initialize it. 140 * Note that it would probably be better to allocate socket 141 * and PCB at the same time, but I'm not convinced that all 142 * the protocols can be easily modified to do this. 143 */ 144 struct socket * 145 soalloc(int waitok) 146 { 147 struct socket *so; 148 unsigned waitmask; 149 150 waitmask = waitok ? M_WAITOK : M_NOWAIT; 151 so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask); 152 if (so) { 153 /* XXX race condition for reentrant kernel */ 154 TAILQ_INIT(&so->so_aiojobq); 155 TAILQ_INIT(&so->so_rcv.ssb_kq.ki_mlist); 156 TAILQ_INIT(&so->so_snd.ssb_kq.ki_mlist); 157 lwkt_token_init(&so->so_rcv.ssb_token, 1, "rcvtok"); 158 lwkt_token_init(&so->so_snd.ssb_token, 1, "rcvtok"); 159 so->so_state = SS_NOFDREF; 160 so->so_refs = 1; 161 } 162 return so; 163 } 164 165 int 166 socreate(int dom, struct socket **aso, int type, 167 int proto, struct thread *td) 168 { 169 struct proc *p = td->td_proc; 170 struct protosw *prp; 171 struct socket *so; 172 struct pru_attach_info ai; 173 int error; 174 175 if (proto) 176 prp = pffindproto(dom, proto, type); 177 else 178 prp = pffindtype(dom, type); 179 180 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) 181 return (EPROTONOSUPPORT); 182 183 if (p->p_ucred->cr_prison && jail_socket_unixiproute_only && 184 prp->pr_domain->dom_family != PF_LOCAL && 185 prp->pr_domain->dom_family != PF_INET && 186 prp->pr_domain->dom_family != PF_INET6 && 187 prp->pr_domain->dom_family != PF_ROUTE) { 188 return (EPROTONOSUPPORT); 189 } 190 191 if 
(prp->pr_type != type) 192 return (EPROTOTYPE); 193 so = soalloc(p != 0); 194 if (so == NULL) 195 return (ENOBUFS); 196 197 /* 198 * Callers of socreate() presumably will connect up a descriptor 199 * and call soclose() if they cannot. This represents our so_refs 200 * (which should be 1) from soalloc(). 201 */ 202 soclrstate(so, SS_NOFDREF); 203 204 /* 205 * Set a default port for protocol processing. No action will occur 206 * on the socket on this port until an inpcb is attached to it and 207 * is able to match incoming packets, or until the socket becomes 208 * available to userland. 209 * 210 * We normally default the socket to the protocol thread on cpu 0. 211 * If PR_SYNC_PORT is set (unix domain sockets) there is no protocol 212 * thread and all pr_*()/pru_*() calls are executed synchronously. 213 */ 214 if (prp->pr_flags & PR_SYNC_PORT) 215 so->so_port = &netisr_sync_port; 216 else 217 so->so_port = cpu_portfn(0); 218 219 TAILQ_INIT(&so->so_incomp); 220 TAILQ_INIT(&so->so_comp); 221 so->so_type = type; 222 so->so_cred = crhold(p->p_ucred); 223 so->so_proto = prp; 224 ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE]; 225 ai.p_ucred = p->p_ucred; 226 ai.fd_rdir = p->p_fd->fd_rdir; 227 228 /* 229 * Auto-sizing of socket buffers is managed by the protocols and 230 * the appropriate flags must be set in the pru_attach function. 231 */ 232 error = so_pru_attach(so, proto, &ai); 233 if (error) { 234 sosetstate(so, SS_NOFDREF); 235 sofree(so); /* from soalloc */ 236 return error; 237 } 238 239 /* 240 * NOTE: Returns referenced socket. 
241 */ 242 *aso = so; 243 return (0); 244 } 245 246 int 247 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 248 { 249 int error; 250 251 error = so_pru_bind(so, nam, td); 252 return (error); 253 } 254 255 static void 256 sodealloc(struct socket *so) 257 { 258 if (so->so_rcv.ssb_hiwat) 259 (void)chgsbsize(so->so_cred->cr_uidinfo, 260 &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY); 261 if (so->so_snd.ssb_hiwat) 262 (void)chgsbsize(so->so_cred->cr_uidinfo, 263 &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY); 264 #ifdef INET 265 /* remove accept filter if present */ 266 if (so->so_accf != NULL) 267 do_setopt_accept_filter(so, NULL); 268 #endif /* INET */ 269 crfree(so->so_cred); 270 kfree(so, M_SOCKET); 271 } 272 273 int 274 solisten(struct socket *so, int backlog, struct thread *td) 275 { 276 int error; 277 #ifdef SCTP 278 short oldopt, oldqlimit; 279 #endif /* SCTP */ 280 281 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) 282 return (EINVAL); 283 284 #ifdef SCTP 285 oldopt = so->so_options; 286 oldqlimit = so->so_qlimit; 287 #endif /* SCTP */ 288 289 lwkt_gettoken(&so->so_rcv.ssb_token); 290 if (TAILQ_EMPTY(&so->so_comp)) 291 so->so_options |= SO_ACCEPTCONN; 292 lwkt_reltoken(&so->so_rcv.ssb_token); 293 if (backlog < 0 || backlog > somaxconn) 294 backlog = somaxconn; 295 so->so_qlimit = backlog; 296 /* SCTP needs to look at tweak both the inbound backlog parameter AND 297 * the so_options (UDP model both connect's and gets inbound 298 * connections .. implicitly). 299 */ 300 error = so_pru_listen(so, td); 301 if (error) { 302 #ifdef SCTP 303 /* Restore the params */ 304 so->so_options = oldopt; 305 so->so_qlimit = oldqlimit; 306 #endif /* SCTP */ 307 return (error); 308 } 309 return (0); 310 } 311 312 /* 313 * Destroy a disconnected socket. 
This routine is a NOP if entities 314 * still have a reference on the socket: 315 * 316 * so_pcb - The protocol stack still has a reference 317 * SS_NOFDREF - There is no longer a file pointer reference 318 */ 319 void 320 sofree(struct socket *so) 321 { 322 struct socket *head = so->so_head; 323 324 /* 325 * Arbitrage the last free. 326 */ 327 KKASSERT(so->so_refs > 0); 328 if (atomic_fetchadd_int(&so->so_refs, -1) != 1) 329 return; 330 331 KKASSERT(so->so_pcb == NULL && (so->so_state & SS_NOFDREF)); 332 KKASSERT((so->so_state & SS_ASSERTINPROG) == 0); 333 334 /* 335 * We're done, clean up 336 */ 337 if (head != NULL) { 338 lwkt_gettoken(&head->so_rcv.ssb_token); 339 if (so->so_state & SS_INCOMP) { 340 TAILQ_REMOVE(&head->so_incomp, so, so_list); 341 head->so_incqlen--; 342 } else if (so->so_state & SS_COMP) { 343 /* 344 * We must not decommission a socket that's 345 * on the accept(2) queue. If we do, then 346 * accept(2) may hang after select(2) indicated 347 * that the listening socket was ready. 348 */ 349 lwkt_reltoken(&head->so_rcv.ssb_token); 350 return; 351 } else { 352 panic("sofree: not queued"); 353 } 354 soclrstate(so, SS_INCOMP); 355 so->so_head = NULL; 356 lwkt_reltoken(&head->so_rcv.ssb_token); 357 } 358 ssb_release(&so->so_snd, so); 359 sorflush(so); 360 sodealloc(so); 361 } 362 363 /* 364 * Close a socket on last file table reference removal. 365 * Initiate disconnect if connected. 366 * Free socket when disconnect complete. 
367 */ 368 int 369 soclose(struct socket *so, int fflag) 370 { 371 int error = 0; 372 373 funsetown(so->so_sigio); 374 if (so->so_pcb == NULL) 375 goto discard; 376 if (so->so_state & SS_ISCONNECTED) { 377 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 378 error = sodisconnect(so); 379 if (error) 380 goto drop; 381 } 382 if (so->so_options & SO_LINGER) { 383 if ((so->so_state & SS_ISDISCONNECTING) && 384 (fflag & FNONBLOCK)) 385 goto drop; 386 while (so->so_state & SS_ISCONNECTED) { 387 error = tsleep(&so->so_timeo, PCATCH, 388 "soclos", so->so_linger * hz); 389 if (error) 390 break; 391 } 392 } 393 } 394 drop: 395 if (so->so_pcb) { 396 int error2; 397 398 error2 = so_pru_detach(so); 399 if (error == 0) 400 error = error2; 401 } 402 discard: 403 lwkt_gettoken(&so->so_rcv.ssb_token); 404 if (so->so_options & SO_ACCEPTCONN) { 405 struct socket *sp; 406 407 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 408 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 409 soclrstate(sp, SS_INCOMP); 410 sp->so_head = NULL; 411 so->so_incqlen--; 412 soaborta(sp); 413 } 414 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 415 TAILQ_REMOVE(&so->so_comp, sp, so_list); 416 soclrstate(sp, SS_COMP); 417 sp->so_head = NULL; 418 so->so_qlen--; 419 soaborta(sp); 420 } 421 } 422 lwkt_reltoken(&so->so_rcv.ssb_token); 423 if (so->so_state & SS_NOFDREF) 424 panic("soclose: NOFDREF"); 425 sosetstate(so, SS_NOFDREF); /* take ref */ 426 sofree(so); /* dispose of ref */ 427 return (error); 428 } 429 430 /* 431 * Abort and destroy a socket. Only one abort can be in progress 432 * at any given moment. 
433 */ 434 void 435 soabort(struct socket *so) 436 { 437 soreference(so); 438 so_pru_abort(so); 439 } 440 441 void 442 soaborta(struct socket *so) 443 { 444 soreference(so); 445 so_pru_aborta(so); 446 } 447 448 void 449 soabort_oncpu(struct socket *so) 450 { 451 soreference(so); 452 so_pru_abort_oncpu(so); 453 } 454 455 int 456 soaccept(struct socket *so, struct sockaddr **nam) 457 { 458 int error; 459 460 if ((so->so_state & SS_NOFDREF) == 0) 461 panic("soaccept: !NOFDREF"); 462 soreference(so); /* create ref */ 463 soclrstate(so, SS_NOFDREF); /* owned by lack of SS_NOFDREF */ 464 error = so_pru_accept_direct(so, nam); 465 return (error); 466 } 467 468 int 469 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 470 { 471 int error; 472 473 if (so->so_options & SO_ACCEPTCONN) 474 return (EOPNOTSUPP); 475 /* 476 * If protocol is connection-based, can only connect once. 477 * Otherwise, if connected, try to disconnect first. 478 * This allows user to disconnect by connecting to, e.g., 479 * a null address. 480 */ 481 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 482 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 483 (error = sodisconnect(so)))) { 484 error = EISCONN; 485 } else { 486 /* 487 * Prevent accumulated error from previous connection 488 * from biting us. 489 */ 490 so->so_error = 0; 491 error = so_pru_connect(so, nam, td); 492 } 493 return (error); 494 } 495 496 int 497 soconnect2(struct socket *so1, struct socket *so2) 498 { 499 int error; 500 501 error = so_pru_connect2(so1, so2); 502 return (error); 503 } 504 505 int 506 sodisconnect(struct socket *so) 507 { 508 int error; 509 510 if ((so->so_state & SS_ISCONNECTED) == 0) { 511 error = ENOTCONN; 512 goto bad; 513 } 514 if (so->so_state & SS_ISDISCONNECTING) { 515 error = EALREADY; 516 goto bad; 517 } 518 error = so_pru_disconnect(so); 519 bad: 520 return (error); 521 } 522 523 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 524 /* 525 * Send on a socket. 
526 * If send must go all at once and message is larger than 527 * send buffering, then hard error. 528 * Lock against other senders. 529 * If must go all at once and not enough room now, then 530 * inform user that this would block and do nothing. 531 * Otherwise, if nonblocking, send as much as possible. 532 * The data to be sent is described by "uio" if nonzero, 533 * otherwise by the mbuf chain "top" (which must be null 534 * if uio is not). Data provided in mbuf chain must be small 535 * enough to send all at once. 536 * 537 * Returns nonzero on error, timeout or signal; callers 538 * must check for short counts if EINTR/ERESTART are returned. 539 * Data and control buffers are freed on return. 540 */ 541 int 542 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 543 struct mbuf *top, struct mbuf *control, int flags, 544 struct thread *td) 545 { 546 struct mbuf **mp; 547 struct mbuf *m; 548 size_t resid; 549 int space, len; 550 int clen = 0, error, dontroute, mlen; 551 int atomic = sosendallatonce(so) || top; 552 int pru_flags; 553 554 if (uio) { 555 resid = uio->uio_resid; 556 } else { 557 resid = (size_t)top->m_pkthdr.len; 558 #ifdef INVARIANTS 559 len = 0; 560 for (m = top; m; m = m->m_next) 561 len += m->m_len; 562 KKASSERT(top->m_pkthdr.len == len); 563 #endif 564 } 565 566 /* 567 * WARNING! resid is unsigned, space and len are signed. space 568 * can wind up negative if the sockbuf is overcommitted. 569 * 570 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 571 * type sockets since that's an error. 
572 */ 573 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { 574 error = EINVAL; 575 goto out; 576 } 577 578 dontroute = 579 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 580 (so->so_proto->pr_flags & PR_ATOMIC); 581 if (td->td_lwp != NULL) 582 td->td_lwp->lwp_ru.ru_msgsnd++; 583 if (control) 584 clen = control->m_len; 585 #define gotoerr(errcode) { error = errcode; goto release; } 586 587 restart: 588 error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags)); 589 if (error) 590 goto out; 591 592 do { 593 if (so->so_state & SS_CANTSENDMORE) 594 gotoerr(EPIPE); 595 if (so->so_error) { 596 error = so->so_error; 597 so->so_error = 0; 598 goto release; 599 } 600 if ((so->so_state & SS_ISCONNECTED) == 0) { 601 /* 602 * `sendto' and `sendmsg' is allowed on a connection- 603 * based socket if it supports implied connect. 604 * Return ENOTCONN if not connected and no address is 605 * supplied. 606 */ 607 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 608 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 609 if ((so->so_state & SS_ISCONFIRMING) == 0 && 610 !(resid == 0 && clen != 0)) 611 gotoerr(ENOTCONN); 612 } else if (addr == 0) 613 gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 614 ENOTCONN : EDESTADDRREQ); 615 } 616 if ((atomic && resid > so->so_snd.ssb_hiwat) || 617 clen > so->so_snd.ssb_hiwat) { 618 gotoerr(EMSGSIZE); 619 } 620 space = ssb_space(&so->so_snd); 621 if (flags & MSG_OOB) 622 space += 1024; 623 if ((space < 0 || (size_t)space < resid + clen) && uio && 624 (atomic || space < so->so_snd.ssb_lowat || space < clen)) { 625 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) 626 gotoerr(EWOULDBLOCK); 627 ssb_unlock(&so->so_snd); 628 error = ssb_wait(&so->so_snd); 629 if (error) 630 goto out; 631 goto restart; 632 } 633 mp = ⊤ 634 space -= clen; 635 do { 636 if (uio == NULL) { 637 /* 638 * Data is prepackaged in "top". 
639 */ 640 resid = 0; 641 if (flags & MSG_EOR) 642 top->m_flags |= M_EOR; 643 } else do { 644 if (resid > INT_MAX) 645 resid = INT_MAX; 646 m = m_getl((int)resid, MB_WAIT, MT_DATA, 647 top == NULL ? M_PKTHDR : 0, &mlen); 648 if (top == NULL) { 649 m->m_pkthdr.len = 0; 650 m->m_pkthdr.rcvif = NULL; 651 } 652 len = imin((int)szmin(mlen, resid), space); 653 if (resid < MINCLSIZE) { 654 /* 655 * For datagram protocols, leave room 656 * for protocol headers in first mbuf. 657 */ 658 if (atomic && top == 0 && len < mlen) 659 MH_ALIGN(m, len); 660 } 661 space -= len; 662 error = uiomove(mtod(m, caddr_t), (size_t)len, uio); 663 resid = uio->uio_resid; 664 m->m_len = len; 665 *mp = m; 666 top->m_pkthdr.len += len; 667 if (error) 668 goto release; 669 mp = &m->m_next; 670 if (resid == 0) { 671 if (flags & MSG_EOR) 672 top->m_flags |= M_EOR; 673 break; 674 } 675 } while (space > 0 && atomic); 676 if (dontroute) 677 so->so_options |= SO_DONTROUTE; 678 if (flags & MSG_OOB) { 679 pru_flags = PRUS_OOB; 680 } else if ((flags & MSG_EOF) && 681 (so->so_proto->pr_flags & PR_IMPLOPCL) && 682 (resid == 0)) { 683 /* 684 * If the user set MSG_EOF, the protocol 685 * understands this flag and nothing left to 686 * send then use PRU_SEND_EOF instead of PRU_SEND. 687 */ 688 pru_flags = PRUS_EOF; 689 } else if (resid > 0 && space > 0) { 690 /* If there is more to send, set PRUS_MORETOCOME */ 691 pru_flags = PRUS_MORETOCOME; 692 } else { 693 pru_flags = 0; 694 } 695 /* 696 * XXX all the SS_CANTSENDMORE checks previously 697 * done could be out of date. We could have recieved 698 * a reset packet in an interrupt or maybe we slept 699 * while doing page faults in uiomove() etc. We could 700 * probably recheck again inside the splnet() protection 701 * here, but there are probably other places that this 702 * also happens. We must rethink this. 
703 */ 704 error = so_pru_send(so, pru_flags, top, addr, control, td); 705 if (dontroute) 706 so->so_options &= ~SO_DONTROUTE; 707 clen = 0; 708 control = 0; 709 top = NULL; 710 mp = ⊤ 711 if (error) 712 goto release; 713 } while (resid && space > 0); 714 } while (resid); 715 716 release: 717 ssb_unlock(&so->so_snd); 718 out: 719 if (top) 720 m_freem(top); 721 if (control) 722 m_freem(control); 723 return (error); 724 } 725 726 /* 727 * A specialization of sosend() for UDP based on protocol-specific knowledge: 728 * so->so_proto->pr_flags has the PR_ATOMIC field set. This means that 729 * sosendallatonce() returns true, 730 * the "atomic" variable is true, 731 * and sosendudp() blocks until space is available for the entire send. 732 * so->so_proto->pr_flags does not have the PR_CONNREQUIRED or 733 * PR_IMPLOPCL flags set. 734 * UDP has no out-of-band data. 735 * UDP has no control data. 736 * UDP does not support MSG_EOR. 737 */ 738 int 739 sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio, 740 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 741 { 742 boolean_t dontroute; /* temporary SO_DONTROUTE setting */ 743 size_t resid; 744 int error; 745 int space; 746 747 if (td->td_lwp != NULL) 748 td->td_lwp->lwp_ru.ru_msgsnd++; 749 if (control) 750 m_freem(control); 751 752 KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp")); 753 resid = uio ? 
uio->uio_resid : (size_t)top->m_pkthdr.len; 754 755 restart: 756 error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags)); 757 if (error) 758 goto out; 759 760 if (so->so_state & SS_CANTSENDMORE) 761 gotoerr(EPIPE); 762 if (so->so_error) { 763 error = so->so_error; 764 so->so_error = 0; 765 goto release; 766 } 767 if (!(so->so_state & SS_ISCONNECTED) && addr == NULL) 768 gotoerr(EDESTADDRREQ); 769 if (resid > so->so_snd.ssb_hiwat) 770 gotoerr(EMSGSIZE); 771 space = ssb_space(&so->so_snd); 772 if (uio && (space < 0 || (size_t)space < resid)) { 773 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) 774 gotoerr(EWOULDBLOCK); 775 ssb_unlock(&so->so_snd); 776 error = ssb_wait(&so->so_snd); 777 if (error) 778 goto out; 779 goto restart; 780 } 781 782 if (uio) { 783 top = m_uiomove(uio); 784 if (top == NULL) 785 goto release; 786 } 787 788 dontroute = (flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE); 789 if (dontroute) 790 so->so_options |= SO_DONTROUTE; 791 792 error = so_pru_send(so, 0, top, addr, NULL, td); 793 top = NULL; /* sent or freed in lower layer */ 794 795 if (dontroute) 796 so->so_options &= ~SO_DONTROUTE; 797 798 release: 799 ssb_unlock(&so->so_snd); 800 out: 801 if (top) 802 m_freem(top); 803 return (error); 804 } 805 806 /* 807 * Implement receive operations on a socket. 808 * 809 * We depend on the way that records are added to the signalsockbuf 810 * by sbappend*. In particular, each record (mbufs linked through m_next) 811 * must begin with an address if the protocol so specifies, 812 * followed by an optional mbuf or mbufs containing ancillary data, 813 * and then zero or more mbufs of data. 814 * 815 * Although the signalsockbuf is locked, new data may still be appended. 816 * A token inside the ssb_lock deals with MP issues and still allows 817 * the network to access the socket if we block in a uio. 818 * 819 * The caller may receive the data as a single mbuf chain by supplying 820 * an mbuf **mp0 for use in returning the chain. 
The uio is then used 821 * only for the count in uio_resid. 822 */ 823 int 824 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 825 struct sockbuf *sio, struct mbuf **controlp, int *flagsp) 826 { 827 struct mbuf *m, *n; 828 struct mbuf *free_chain = NULL; 829 int flags, len, error, offset; 830 struct protosw *pr = so->so_proto; 831 int moff, type = 0; 832 size_t resid, orig_resid; 833 834 if (uio) 835 resid = uio->uio_resid; 836 else 837 resid = (size_t)(sio->sb_climit - sio->sb_cc); 838 orig_resid = resid; 839 840 if (psa) 841 *psa = NULL; 842 if (controlp) 843 *controlp = NULL; 844 if (flagsp) 845 flags = *flagsp &~ MSG_EOR; 846 else 847 flags = 0; 848 if (flags & MSG_OOB) { 849 m = m_get(MB_WAIT, MT_DATA); 850 if (m == NULL) 851 return (ENOBUFS); 852 error = so_pru_rcvoob(so, m, flags & MSG_PEEK); 853 if (error) 854 goto bad; 855 if (sio) { 856 do { 857 sbappend(sio, m); 858 KKASSERT(resid >= (size_t)m->m_len); 859 resid -= (size_t)m->m_len; 860 } while (resid > 0 && m); 861 } else { 862 do { 863 uio->uio_resid = resid; 864 error = uiomove(mtod(m, caddr_t), 865 (int)szmin(resid, m->m_len), 866 uio); 867 resid = uio->uio_resid; 868 m = m_free(m); 869 } while (uio->uio_resid && error == 0 && m); 870 } 871 bad: 872 if (m) 873 m_freem(m); 874 return (error); 875 } 876 if ((so->so_state & SS_ISCONFIRMING) && resid) 877 so_pru_rcvd(so, 0); 878 879 /* 880 * The token interlocks against the protocol thread while 881 * ssb_lock is a blocking lock against other userland entities. 882 */ 883 lwkt_gettoken(&so->so_rcv.ssb_token); 884 restart: 885 error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags)); 886 if (error) 887 goto done; 888 889 m = so->so_rcv.ssb_mb; 890 /* 891 * If we have less data than requested, block awaiting more 892 * (subject to any timeout) if: 893 * 1. the current count is less than the low water mark, or 894 * 2. MSG_WAITALL is set, and it is possible to do the entire 895 * receive operation at once if we block (resid <= hiwat). 896 * 3. 
MSG_DONTWAIT is not set 897 * If MSG_WAITALL is set but resid is larger than the receive buffer, 898 * we have to do the receive in sections, and thus risk returning 899 * a short count if a timeout or signal occurs after we start. 900 */ 901 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 902 (size_t)so->so_rcv.ssb_cc < resid) && 903 (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat || 904 ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) && 905 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { 906 KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1")); 907 if (so->so_error) { 908 if (m) 909 goto dontblock; 910 error = so->so_error; 911 if ((flags & MSG_PEEK) == 0) 912 so->so_error = 0; 913 goto release; 914 } 915 if (so->so_state & SS_CANTRCVMORE) { 916 if (m) 917 goto dontblock; 918 else 919 goto release; 920 } 921 for (; m; m = m->m_next) { 922 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 923 m = so->so_rcv.ssb_mb; 924 goto dontblock; 925 } 926 } 927 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 928 (pr->pr_flags & PR_CONNREQUIRED)) { 929 error = ENOTCONN; 930 goto release; 931 } 932 if (resid == 0) 933 goto release; 934 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) { 935 error = EWOULDBLOCK; 936 goto release; 937 } 938 ssb_unlock(&so->so_rcv); 939 error = ssb_wait(&so->so_rcv); 940 if (error) 941 goto done; 942 goto restart; 943 } 944 dontblock: 945 if (uio && uio->uio_td && uio->uio_td->td_proc) 946 uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++; 947 948 /* 949 * note: m should be == sb_mb here. Cache the next record while 950 * cleaning up. Note that calling m_free*() will break out critical 951 * section. 952 */ 953 KKASSERT(m == so->so_rcv.ssb_mb); 954 955 /* 956 * Skip any address mbufs prepending the record. 
957 */ 958 if (pr->pr_flags & PR_ADDR) { 959 KASSERT(m->m_type == MT_SONAME, ("receive 1a")); 960 orig_resid = 0; 961 if (psa) 962 *psa = dup_sockaddr(mtod(m, struct sockaddr *)); 963 if (flags & MSG_PEEK) 964 m = m->m_next; 965 else 966 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 967 } 968 969 /* 970 * Skip any control mbufs prepending the record. 971 */ 972 #ifdef SCTP 973 if (pr->pr_flags & PR_ADDR_OPT) { 974 /* 975 * For SCTP we may be getting a 976 * whole message OR a partial delivery. 977 */ 978 if (m && m->m_type == MT_SONAME) { 979 orig_resid = 0; 980 if (psa) 981 *psa = dup_sockaddr(mtod(m, struct sockaddr *)); 982 if (flags & MSG_PEEK) 983 m = m->m_next; 984 else 985 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 986 } 987 } 988 #endif /* SCTP */ 989 while (m && m->m_type == MT_CONTROL && error == 0) { 990 if (flags & MSG_PEEK) { 991 if (controlp) 992 *controlp = m_copy(m, 0, m->m_len); 993 m = m->m_next; /* XXX race */ 994 } else { 995 if (controlp) { 996 n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL); 997 if (pr->pr_domain->dom_externalize && 998 mtod(m, struct cmsghdr *)->cmsg_type == 999 SCM_RIGHTS) 1000 error = (*pr->pr_domain->dom_externalize)(m); 1001 *controlp = m; 1002 m = n; 1003 } else { 1004 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 1005 } 1006 } 1007 if (controlp && *controlp) { 1008 orig_resid = 0; 1009 controlp = &(*controlp)->m_next; 1010 } 1011 } 1012 1013 /* 1014 * flag OOB data. 1015 */ 1016 if (m) { 1017 type = m->m_type; 1018 if (type == MT_OOBDATA) 1019 flags |= MSG_OOB; 1020 } 1021 1022 /* 1023 * Copy to the UIO or mbuf return chain (*mp). 1024 */ 1025 moff = 0; 1026 offset = 0; 1027 while (m && resid > 0 && error == 0) { 1028 if (m->m_type == MT_OOBDATA) { 1029 if (type != MT_OOBDATA) 1030 break; 1031 } else if (type == MT_OOBDATA) 1032 break; 1033 else 1034 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 1035 ("receive 3")); 1036 soclrstate(so, SS_RCVATMARK); 1037 len = (resid > INT_MAX) ? 
INT_MAX : resid; 1038 if (so->so_oobmark && len > so->so_oobmark - offset) 1039 len = so->so_oobmark - offset; 1040 if (len > m->m_len - moff) 1041 len = m->m_len - moff; 1042 1043 /* 1044 * Copy out to the UIO or pass the mbufs back to the SIO. 1045 * The SIO is dealt with when we eat the mbuf, but deal 1046 * with the resid here either way. 1047 */ 1048 if (uio) { 1049 uio->uio_resid = resid; 1050 error = uiomove(mtod(m, caddr_t) + moff, len, uio); 1051 resid = uio->uio_resid; 1052 if (error) 1053 goto release; 1054 } else { 1055 resid -= (size_t)len; 1056 } 1057 1058 /* 1059 * Eat the entire mbuf or just a piece of it 1060 */ 1061 if (len == m->m_len - moff) { 1062 if (m->m_flags & M_EOR) 1063 flags |= MSG_EOR; 1064 #ifdef SCTP 1065 if (m->m_flags & M_NOTIFICATION) 1066 flags |= MSG_NOTIFICATION; 1067 #endif /* SCTP */ 1068 if (flags & MSG_PEEK) { 1069 m = m->m_next; 1070 moff = 0; 1071 } else { 1072 if (sio) { 1073 n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL); 1074 sbappend(sio, m); 1075 m = n; 1076 } else { 1077 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 1078 } 1079 } 1080 } else { 1081 if (flags & MSG_PEEK) { 1082 moff += len; 1083 } else { 1084 if (sio) { 1085 n = m_copym(m, 0, len, MB_WAIT); 1086 if (n) 1087 sbappend(sio, n); 1088 } 1089 m->m_data += len; 1090 m->m_len -= len; 1091 so->so_rcv.ssb_cc -= len; 1092 } 1093 } 1094 if (so->so_oobmark) { 1095 if ((flags & MSG_PEEK) == 0) { 1096 so->so_oobmark -= len; 1097 if (so->so_oobmark == 0) { 1098 sosetstate(so, SS_RCVATMARK); 1099 break; 1100 } 1101 } else { 1102 offset += len; 1103 if (offset == so->so_oobmark) 1104 break; 1105 } 1106 } 1107 if (flags & MSG_EOR) 1108 break; 1109 /* 1110 * If the MSG_WAITALL flag is set (for non-atomic socket), 1111 * we must not quit until resid == 0 or an error 1112 * termination. If a signal/timeout occurs, return 1113 * with a short count but without error. 1114 * Keep signalsockbuf locked against other readers. 
1115 */ 1116 while ((flags & MSG_WAITALL) && m == NULL && 1117 resid > 0 && !sosendallatonce(so) && 1118 so->so_rcv.ssb_mb == NULL) { 1119 if (so->so_error || so->so_state & SS_CANTRCVMORE) 1120 break; 1121 /* 1122 * The window might have closed to zero, make 1123 * sure we send an ack now that we've drained 1124 * the buffer or we might end up blocking until 1125 * the idle takes over (5 seconds). 1126 */ 1127 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 1128 so_pru_rcvd(so, flags); 1129 error = ssb_wait(&so->so_rcv); 1130 if (error) { 1131 ssb_unlock(&so->so_rcv); 1132 error = 0; 1133 goto done; 1134 } 1135 m = so->so_rcv.ssb_mb; 1136 } 1137 } 1138 1139 /* 1140 * If an atomic read was requested but unread data still remains 1141 * in the record, set MSG_TRUNC. 1142 */ 1143 if (m && pr->pr_flags & PR_ATOMIC) 1144 flags |= MSG_TRUNC; 1145 1146 /* 1147 * Cleanup. If an atomic read was requested drop any unread data. 1148 */ 1149 if ((flags & MSG_PEEK) == 0) { 1150 if (m && (pr->pr_flags & PR_ATOMIC)) 1151 sbdroprecord(&so->so_rcv.sb); 1152 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) 1153 so_pru_rcvd(so, flags); 1154 } 1155 1156 if (orig_resid == resid && orig_resid && 1157 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 1158 ssb_unlock(&so->so_rcv); 1159 goto restart; 1160 } 1161 1162 if (flagsp) 1163 *flagsp |= flags; 1164 release: 1165 ssb_unlock(&so->so_rcv); 1166 done: 1167 lwkt_reltoken(&so->so_rcv.ssb_token); 1168 if (free_chain) 1169 m_freem(free_chain); 1170 return (error); 1171 } 1172 1173 int 1174 soshutdown(struct socket *so, int how) 1175 { 1176 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1177 return (EINVAL); 1178 1179 if (how != SHUT_WR) { 1180 ssb_lock(&so->so_rcv, M_WAITOK); /* frontend lock */ 1181 sorflush(so); 1182 ssb_unlock(&so->so_rcv); 1183 } 1184 if (how != SHUT_RD) 1185 return (so_pru_shutdown(so)); 1186 return (0); 1187 } 1188 1189 void 1190 sorflush(struct socket *so) 1191 { 1192 struct signalsockbuf 
*ssb = &so->so_rcv; 1193 struct protosw *pr = so->so_proto; 1194 struct signalsockbuf asb; 1195 1196 atomic_set_int(&ssb->ssb_flags, SSB_NOINTR); 1197 1198 lwkt_gettoken(&ssb->ssb_token); 1199 socantrcvmore(so); 1200 asb = *ssb; 1201 1202 /* 1203 * Can't just blow up the ssb structure here 1204 */ 1205 bzero(&ssb->sb, sizeof(ssb->sb)); 1206 ssb->ssb_timeo = 0; 1207 ssb->ssb_unused01 = 0; 1208 ssb->ssb_lowat = 0; 1209 ssb->ssb_hiwat = 0; 1210 ssb->ssb_mbmax = 0; 1211 atomic_clear_int(&ssb->ssb_flags, SSB_CLEAR_MASK); 1212 1213 lwkt_reltoken(&ssb->ssb_token); 1214 1215 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 1216 (*pr->pr_domain->dom_dispose)(asb.ssb_mb); 1217 ssb_release(&asb, so); 1218 } 1219 1220 #ifdef INET 1221 static int 1222 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) 1223 { 1224 struct accept_filter_arg *afap = NULL; 1225 struct accept_filter *afp; 1226 struct so_accf *af = so->so_accf; 1227 int error = 0; 1228 1229 /* do not set/remove accept filters on non listen sockets */ 1230 if ((so->so_options & SO_ACCEPTCONN) == 0) { 1231 error = EINVAL; 1232 goto out; 1233 } 1234 1235 /* removing the filter */ 1236 if (sopt == NULL) { 1237 if (af != NULL) { 1238 if (af->so_accept_filter != NULL && 1239 af->so_accept_filter->accf_destroy != NULL) { 1240 af->so_accept_filter->accf_destroy(so); 1241 } 1242 if (af->so_accept_filter_str != NULL) { 1243 FREE(af->so_accept_filter_str, M_ACCF); 1244 } 1245 FREE(af, M_ACCF); 1246 so->so_accf = NULL; 1247 } 1248 so->so_options &= ~SO_ACCEPTFILTER; 1249 return (0); 1250 } 1251 /* adding a filter */ 1252 /* must remove previous filter first */ 1253 if (af != NULL) { 1254 error = EINVAL; 1255 goto out; 1256 } 1257 /* don't put large objects on the kernel stack */ 1258 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); 1259 error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); 1260 afap->af_name[sizeof(afap->af_name)-1] = '\0'; 1261 
afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; 1262 if (error) 1263 goto out; 1264 afp = accept_filt_get(afap->af_name); 1265 if (afp == NULL) { 1266 error = ENOENT; 1267 goto out; 1268 } 1269 MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); 1270 if (afp->accf_create != NULL) { 1271 if (afap->af_name[0] != '\0') { 1272 int len = strlen(afap->af_name) + 1; 1273 1274 MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); 1275 strcpy(af->so_accept_filter_str, afap->af_name); 1276 } 1277 af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); 1278 if (af->so_accept_filter_arg == NULL) { 1279 FREE(af->so_accept_filter_str, M_ACCF); 1280 FREE(af, M_ACCF); 1281 so->so_accf = NULL; 1282 error = EINVAL; 1283 goto out; 1284 } 1285 } 1286 af->so_accept_filter = afp; 1287 so->so_accf = af; 1288 so->so_options |= SO_ACCEPTFILTER; 1289 out: 1290 if (afap != NULL) 1291 FREE(afap, M_TEMP); 1292 return (error); 1293 } 1294 #endif /* INET */ 1295 1296 /* 1297 * Perhaps this routine, and sooptcopyout(), below, ought to come in 1298 * an additional variant to handle the case where the option value needs 1299 * to be some kind of integer, but not a specific size. 1300 * In addition to their use here, these functions are also called by the 1301 * protocol-level pr_ctloutput() routines. 1302 */ 1303 int 1304 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 1305 { 1306 return soopt_to_kbuf(sopt, buf, len, minlen); 1307 } 1308 1309 int 1310 soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 1311 { 1312 size_t valsize; 1313 1314 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1315 KKASSERT(kva_p(buf)); 1316 1317 /* 1318 * If the user gives us more than we wanted, we ignore it, 1319 * but if we don't get the minimum length the caller 1320 * wants, we return EINVAL. On success, sopt->sopt_valsize 1321 * is set to however much we actually retrieved. 
1322 */ 1323 if ((valsize = sopt->sopt_valsize) < minlen) 1324 return EINVAL; 1325 if (valsize > len) 1326 sopt->sopt_valsize = valsize = len; 1327 1328 bcopy(sopt->sopt_val, buf, valsize); 1329 return 0; 1330 } 1331 1332 1333 int 1334 sosetopt(struct socket *so, struct sockopt *sopt) 1335 { 1336 int error, optval; 1337 struct linger l; 1338 struct timeval tv; 1339 u_long val; 1340 struct signalsockbuf *sotmp; 1341 1342 error = 0; 1343 sopt->sopt_dir = SOPT_SET; 1344 if (sopt->sopt_level != SOL_SOCKET) { 1345 if (so->so_proto && so->so_proto->pr_ctloutput) { 1346 return (so_pr_ctloutput(so, sopt)); 1347 } 1348 error = ENOPROTOOPT; 1349 } else { 1350 switch (sopt->sopt_name) { 1351 #ifdef INET 1352 case SO_ACCEPTFILTER: 1353 error = do_setopt_accept_filter(so, sopt); 1354 if (error) 1355 goto bad; 1356 break; 1357 #endif /* INET */ 1358 case SO_LINGER: 1359 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 1360 if (error) 1361 goto bad; 1362 1363 so->so_linger = l.l_linger; 1364 if (l.l_onoff) 1365 so->so_options |= SO_LINGER; 1366 else 1367 so->so_options &= ~SO_LINGER; 1368 break; 1369 1370 case SO_DEBUG: 1371 case SO_KEEPALIVE: 1372 case SO_DONTROUTE: 1373 case SO_USELOOPBACK: 1374 case SO_BROADCAST: 1375 case SO_REUSEADDR: 1376 case SO_REUSEPORT: 1377 case SO_OOBINLINE: 1378 case SO_TIMESTAMP: 1379 error = sooptcopyin(sopt, &optval, sizeof optval, 1380 sizeof optval); 1381 if (error) 1382 goto bad; 1383 if (optval) 1384 so->so_options |= sopt->sopt_name; 1385 else 1386 so->so_options &= ~sopt->sopt_name; 1387 break; 1388 1389 case SO_SNDBUF: 1390 case SO_RCVBUF: 1391 case SO_SNDLOWAT: 1392 case SO_RCVLOWAT: 1393 error = sooptcopyin(sopt, &optval, sizeof optval, 1394 sizeof optval); 1395 if (error) 1396 goto bad; 1397 1398 /* 1399 * Values < 1 make no sense for any of these 1400 * options, so disallow them. 
1401 */ 1402 if (optval < 1) { 1403 error = EINVAL; 1404 goto bad; 1405 } 1406 1407 switch (sopt->sopt_name) { 1408 case SO_SNDBUF: 1409 case SO_RCVBUF: 1410 if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ? 1411 &so->so_snd : &so->so_rcv, (u_long)optval, 1412 so, 1413 &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) { 1414 error = ENOBUFS; 1415 goto bad; 1416 } 1417 sotmp = (sopt->sopt_name == SO_SNDBUF) ? 1418 &so->so_snd : &so->so_rcv; 1419 atomic_clear_int(&sotmp->ssb_flags, 1420 SSB_AUTOSIZE); 1421 break; 1422 1423 /* 1424 * Make sure the low-water is never greater than 1425 * the high-water. 1426 */ 1427 case SO_SNDLOWAT: 1428 so->so_snd.ssb_lowat = 1429 (optval > so->so_snd.ssb_hiwat) ? 1430 so->so_snd.ssb_hiwat : optval; 1431 atomic_clear_int(&so->so_snd.ssb_flags, 1432 SSB_AUTOLOWAT); 1433 break; 1434 case SO_RCVLOWAT: 1435 so->so_rcv.ssb_lowat = 1436 (optval > so->so_rcv.ssb_hiwat) ? 1437 so->so_rcv.ssb_hiwat : optval; 1438 atomic_clear_int(&so->so_rcv.ssb_flags, 1439 SSB_AUTOLOWAT); 1440 break; 1441 } 1442 break; 1443 1444 case SO_SNDTIMEO: 1445 case SO_RCVTIMEO: 1446 error = sooptcopyin(sopt, &tv, sizeof tv, 1447 sizeof tv); 1448 if (error) 1449 goto bad; 1450 1451 /* assert(hz > 0); */ 1452 if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || 1453 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 1454 error = EDOM; 1455 goto bad; 1456 } 1457 /* assert(tick > 0); */ 1458 /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ 1459 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick; 1460 if (val > SHRT_MAX) { 1461 error = EDOM; 1462 goto bad; 1463 } 1464 if (val == 0 && tv.tv_usec != 0) 1465 val = 1; 1466 1467 switch (sopt->sopt_name) { 1468 case SO_SNDTIMEO: 1469 so->so_snd.ssb_timeo = val; 1470 break; 1471 case SO_RCVTIMEO: 1472 so->so_rcv.ssb_timeo = val; 1473 break; 1474 } 1475 break; 1476 default: 1477 error = ENOPROTOOPT; 1478 break; 1479 } 1480 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { 1481 (void) so_pr_ctloutput(so, sopt); 1482 } 1483 } 1484 bad: 1485 
return (error); 1486 } 1487 1488 /* Helper routine for getsockopt */ 1489 int 1490 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 1491 { 1492 soopt_from_kbuf(sopt, buf, len); 1493 return 0; 1494 } 1495 1496 void 1497 soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len) 1498 { 1499 size_t valsize; 1500 1501 if (len == 0) { 1502 sopt->sopt_valsize = 0; 1503 return; 1504 } 1505 1506 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1507 KKASSERT(kva_p(buf)); 1508 1509 /* 1510 * Documented get behavior is that we always return a value, 1511 * possibly truncated to fit in the user's buffer. 1512 * Traditional behavior is that we always tell the user 1513 * precisely how much we copied, rather than something useful 1514 * like the total amount we had available for her. 1515 * Note that this interface is not idempotent; the entire answer must 1516 * generated ahead of time. 1517 */ 1518 valsize = szmin(len, sopt->sopt_valsize); 1519 sopt->sopt_valsize = valsize; 1520 if (sopt->sopt_val != 0) { 1521 bcopy(buf, sopt->sopt_val, valsize); 1522 } 1523 } 1524 1525 int 1526 sogetopt(struct socket *so, struct sockopt *sopt) 1527 { 1528 int error, optval; 1529 struct linger l; 1530 struct timeval tv; 1531 #ifdef INET 1532 struct accept_filter_arg *afap; 1533 #endif 1534 1535 error = 0; 1536 sopt->sopt_dir = SOPT_GET; 1537 if (sopt->sopt_level != SOL_SOCKET) { 1538 if (so->so_proto && so->so_proto->pr_ctloutput) { 1539 return (so_pr_ctloutput(so, sopt)); 1540 } else 1541 return (ENOPROTOOPT); 1542 } else { 1543 switch (sopt->sopt_name) { 1544 #ifdef INET 1545 case SO_ACCEPTFILTER: 1546 if ((so->so_options & SO_ACCEPTCONN) == 0) 1547 return (EINVAL); 1548 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), 1549 M_TEMP, M_WAITOK | M_ZERO); 1550 if ((so->so_options & SO_ACCEPTFILTER) != 0) { 1551 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); 1552 if (so->so_accf->so_accept_filter_str != NULL) 1553 strcpy(afap->af_arg, 
so->so_accf->so_accept_filter_str); 1554 } 1555 error = sooptcopyout(sopt, afap, sizeof(*afap)); 1556 FREE(afap, M_TEMP); 1557 break; 1558 #endif /* INET */ 1559 1560 case SO_LINGER: 1561 l.l_onoff = so->so_options & SO_LINGER; 1562 l.l_linger = so->so_linger; 1563 error = sooptcopyout(sopt, &l, sizeof l); 1564 break; 1565 1566 case SO_USELOOPBACK: 1567 case SO_DONTROUTE: 1568 case SO_DEBUG: 1569 case SO_KEEPALIVE: 1570 case SO_REUSEADDR: 1571 case SO_REUSEPORT: 1572 case SO_BROADCAST: 1573 case SO_OOBINLINE: 1574 case SO_TIMESTAMP: 1575 optval = so->so_options & sopt->sopt_name; 1576 integer: 1577 error = sooptcopyout(sopt, &optval, sizeof optval); 1578 break; 1579 1580 case SO_TYPE: 1581 optval = so->so_type; 1582 goto integer; 1583 1584 case SO_ERROR: 1585 optval = so->so_error; 1586 so->so_error = 0; 1587 goto integer; 1588 1589 case SO_SNDBUF: 1590 optval = so->so_snd.ssb_hiwat; 1591 goto integer; 1592 1593 case SO_RCVBUF: 1594 optval = so->so_rcv.ssb_hiwat; 1595 goto integer; 1596 1597 case SO_SNDLOWAT: 1598 optval = so->so_snd.ssb_lowat; 1599 goto integer; 1600 1601 case SO_RCVLOWAT: 1602 optval = so->so_rcv.ssb_lowat; 1603 goto integer; 1604 1605 case SO_SNDTIMEO: 1606 case SO_RCVTIMEO: 1607 optval = (sopt->sopt_name == SO_SNDTIMEO ? 1608 so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo); 1609 1610 tv.tv_sec = optval / hz; 1611 tv.tv_usec = (optval % hz) * ustick; 1612 error = sooptcopyout(sopt, &tv, sizeof tv); 1613 break; 1614 1615 default: 1616 error = ENOPROTOOPT; 1617 break; 1618 } 1619 return (error); 1620 } 1621 } 1622 1623 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 1624 int 1625 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 1626 { 1627 struct mbuf *m, *m_prev; 1628 int sopt_size = sopt->sopt_valsize, msize; 1629 1630 m = m_getl(sopt_size, sopt->sopt_td ? 
MB_WAIT : MB_DONTWAIT, MT_DATA, 1631 0, &msize); 1632 if (m == NULL) 1633 return (ENOBUFS); 1634 m->m_len = min(msize, sopt_size); 1635 sopt_size -= m->m_len; 1636 *mp = m; 1637 m_prev = m; 1638 1639 while (sopt_size > 0) { 1640 m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, 1641 MT_DATA, 0, &msize); 1642 if (m == NULL) { 1643 m_freem(*mp); 1644 return (ENOBUFS); 1645 } 1646 m->m_len = min(msize, sopt_size); 1647 sopt_size -= m->m_len; 1648 m_prev->m_next = m; 1649 m_prev = m; 1650 } 1651 return (0); 1652 } 1653 1654 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ 1655 int 1656 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 1657 { 1658 soopt_to_mbuf(sopt, m); 1659 return 0; 1660 } 1661 1662 void 1663 soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m) 1664 { 1665 size_t valsize; 1666 void *val; 1667 1668 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1669 KKASSERT(kva_p(m)); 1670 if (sopt->sopt_val == NULL) 1671 return; 1672 val = sopt->sopt_val; 1673 valsize = sopt->sopt_valsize; 1674 while (m != NULL && valsize >= m->m_len) { 1675 bcopy(val, mtod(m, char *), m->m_len); 1676 valsize -= m->m_len; 1677 val = (caddr_t)val + m->m_len; 1678 m = m->m_next; 1679 } 1680 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 1681 panic("ip6_sooptmcopyin"); 1682 } 1683 1684 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. 
*/ 1685 int 1686 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 1687 { 1688 return soopt_from_mbuf(sopt, m); 1689 } 1690 1691 int 1692 soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m) 1693 { 1694 struct mbuf *m0 = m; 1695 size_t valsize = 0; 1696 size_t maxsize; 1697 void *val; 1698 1699 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1700 KKASSERT(kva_p(m)); 1701 if (sopt->sopt_val == NULL) 1702 return 0; 1703 val = sopt->sopt_val; 1704 maxsize = sopt->sopt_valsize; 1705 while (m != NULL && maxsize >= m->m_len) { 1706 bcopy(mtod(m, char *), val, m->m_len); 1707 maxsize -= m->m_len; 1708 val = (caddr_t)val + m->m_len; 1709 valsize += m->m_len; 1710 m = m->m_next; 1711 } 1712 if (m != NULL) { 1713 /* enough soopt buffer should be given from user-land */ 1714 m_freem(m0); 1715 return (EINVAL); 1716 } 1717 sopt->sopt_valsize = valsize; 1718 return 0; 1719 } 1720 1721 void 1722 sohasoutofband(struct socket *so) 1723 { 1724 if (so->so_sigio != NULL) 1725 pgsigio(so->so_sigio, SIGURG, 0); 1726 KNOTE(&so->so_rcv.ssb_kq.ki_note, NOTE_OOB); 1727 } 1728 1729 int 1730 sokqfilter(struct file *fp, struct knote *kn) 1731 { 1732 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1733 struct signalsockbuf *ssb; 1734 1735 switch (kn->kn_filter) { 1736 case EVFILT_READ: 1737 if (so->so_options & SO_ACCEPTCONN) 1738 kn->kn_fop = &solisten_filtops; 1739 else 1740 kn->kn_fop = &soread_filtops; 1741 ssb = &so->so_rcv; 1742 break; 1743 case EVFILT_WRITE: 1744 kn->kn_fop = &sowrite_filtops; 1745 ssb = &so->so_snd; 1746 break; 1747 case EVFILT_EXCEPT: 1748 kn->kn_fop = &soexcept_filtops; 1749 ssb = &so->so_rcv; 1750 break; 1751 default: 1752 return (EOPNOTSUPP); 1753 } 1754 1755 knote_insert(&ssb->ssb_kq.ki_note, kn); 1756 atomic_set_int(&ssb->ssb_flags, SSB_KNOTE); 1757 return (0); 1758 } 1759 1760 static void 1761 filt_sordetach(struct knote *kn) 1762 { 1763 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1764 1765 knote_remove(&so->so_rcv.ssb_kq.ki_note, kn); 
1766 if (SLIST_EMPTY(&so->so_rcv.ssb_kq.ki_note)) 1767 atomic_clear_int(&so->so_rcv.ssb_flags, SSB_KNOTE); 1768 } 1769 1770 /*ARGSUSED*/ 1771 static int 1772 filt_soread(struct knote *kn, long hint) 1773 { 1774 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1775 1776 if (kn->kn_sfflags & NOTE_OOB) { 1777 if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) { 1778 kn->kn_fflags |= NOTE_OOB; 1779 return (1); 1780 } 1781 return (0); 1782 } 1783 kn->kn_data = so->so_rcv.ssb_cc; 1784 1785 /* 1786 * Only set EOF if all data has been exhausted. 1787 */ 1788 if ((so->so_state & SS_CANTRCVMORE) && kn->kn_data == 0) { 1789 kn->kn_flags |= EV_EOF; 1790 kn->kn_fflags = so->so_error; 1791 return (1); 1792 } 1793 if (so->so_error) /* temporary udp error */ 1794 return (1); 1795 if (kn->kn_sfflags & NOTE_LOWAT) 1796 return (kn->kn_data >= kn->kn_sdata); 1797 return ((kn->kn_data >= so->so_rcv.ssb_lowat) || 1798 !TAILQ_EMPTY(&so->so_comp)); 1799 } 1800 1801 static void 1802 filt_sowdetach(struct knote *kn) 1803 { 1804 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1805 1806 knote_remove(&so->so_snd.ssb_kq.ki_note, kn); 1807 if (SLIST_EMPTY(&so->so_snd.ssb_kq.ki_note)) 1808 atomic_clear_int(&so->so_snd.ssb_flags, SSB_KNOTE); 1809 } 1810 1811 /*ARGSUSED*/ 1812 static int 1813 filt_sowrite(struct knote *kn, long hint) 1814 { 1815 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1816 1817 kn->kn_data = ssb_space(&so->so_snd); 1818 if (so->so_state & SS_CANTSENDMORE) { 1819 kn->kn_flags |= EV_EOF; 1820 kn->kn_fflags = so->so_error; 1821 return (1); 1822 } 1823 if (so->so_error) /* temporary udp error */ 1824 return (1); 1825 if (((so->so_state & SS_ISCONNECTED) == 0) && 1826 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1827 return (0); 1828 if (kn->kn_sfflags & NOTE_LOWAT) 1829 return (kn->kn_data >= kn->kn_sdata); 1830 return (kn->kn_data >= so->so_snd.ssb_lowat); 1831 } 1832 1833 /*ARGSUSED*/ 1834 static int 1835 filt_solisten(struct knote *kn, long hint) 
1836 { 1837 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1838 1839 kn->kn_data = so->so_qlen; 1840 return (! TAILQ_EMPTY(&so->so_comp)); 1841 } 1842