1 /* 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1988, 1990, 1993 36 * The Regents of the University of California. All rights reserved. 37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. All advertising materials mentioning features or use of this software 47 * must display the following acknowledgement: 48 * This product includes software developed by the University of 49 * California, Berkeley and its contributors. 50 * 4. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 67 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.24 2003/11/11 17:18:18 silby Exp $ 68 * $DragonFly: src/sys/kern/uipc_socket.c,v 1.55 2008/09/02 16:17:52 dillon Exp $ 69 */ 70 71 #include "opt_inet.h" 72 #include "opt_sctp.h" 73 74 #include <sys/param.h> 75 #include <sys/systm.h> 76 #include <sys/fcntl.h> 77 #include <sys/malloc.h> 78 #include <sys/mbuf.h> 79 #include <sys/domain.h> 80 #include <sys/file.h> /* for struct knote */ 81 #include <sys/kernel.h> 82 #include <sys/malloc.h> 83 #include <sys/event.h> 84 #include <sys/poll.h> 85 #include <sys/proc.h> 86 #include <sys/protosw.h> 87 #include <sys/socket.h> 88 #include <sys/socketvar.h> 89 #include <sys/socketops.h> 90 #include <sys/resourcevar.h> 91 #include <sys/signalvar.h> 92 #include <sys/sysctl.h> 93 #include <sys/uio.h> 94 #include <sys/jail.h> 95 #include <vm/vm_zone.h> 96 #include <vm/pmap.h> 97 98 #include <sys/thread2.h> 99 #include <sys/socketvar2.h> 100 101 #include <machine/limits.h> 102 103 #ifdef INET 104 static int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); 105 #endif /* INET */ 106 107 static void filt_sordetach(struct knote *kn); 108 static int filt_soread(struct knote *kn, long hint); 109 static void filt_sowdetach(struct knote *kn); 110 static int filt_sowrite(struct knote *kn, long hint); 111 static int filt_solisten(struct knote *kn, long hint); 112 113 static struct filterops solisten_filtops = 114 { 1, NULL, filt_sordetach, filt_solisten }; 115 static struct filterops soread_filtops = 116 { 1, NULL, filt_sordetach, filt_soread }; 117 static struct filterops sowrite_filtops = 118 { 1, NULL, filt_sowdetach, filt_sowrite }; 119 static struct filterops soexcept_filtops = 120 { 1, NULL, filt_sordetach, filt_soread }; 121 122 MALLOC_DEFINE(M_SOCKET, "socket", "socket struct"); 123 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 124 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 125 126 127 static int somaxconn = SOMAXCONN; 128 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, 129 &somaxconn, 0, "Maximum pending socket connection queue size"); 130 131 /* 132 * Socket operation routines. 133 * These routines are called by the routines in 134 * sys_socket.c or from a system process, and 135 * implement the semantics of socket operations by 136 * switching out to the protocol specific routines. 137 */ 138 139 /* 140 * Get a socket structure, and initialize it. 141 * Note that it would probably be better to allocate socket 142 * and PCB at the same time, but I'm not convinced that all 143 * the protocols can be easily modified to do this. 144 */ 145 struct socket * 146 soalloc(int waitok) 147 { 148 struct socket *so; 149 unsigned waitmask; 150 151 waitmask = waitok ? M_WAITOK : M_NOWAIT; 152 so = kmalloc(sizeof(struct socket), M_SOCKET, M_ZERO|waitmask); 153 if (so) { 154 /* XXX race condition for reentrant kernel */ 155 TAILQ_INIT(&so->so_aiojobq); 156 TAILQ_INIT(&so->so_rcv.ssb_sel.si_mlist); 157 TAILQ_INIT(&so->so_snd.ssb_sel.si_mlist); 158 } 159 return so; 160 } 161 162 int 163 socreate(int dom, struct socket **aso, int type, 164 int proto, struct thread *td) 165 { 166 struct proc *p = td->td_proc; 167 struct protosw *prp; 168 struct socket *so; 169 struct pru_attach_info ai; 170 int error; 171 172 if (proto) 173 prp = pffindproto(dom, proto, type); 174 else 175 prp = pffindtype(dom, type); 176 177 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) 178 return (EPROTONOSUPPORT); 179 180 if (p->p_ucred->cr_prison && jail_socket_unixiproute_only && 181 prp->pr_domain->dom_family != PF_LOCAL && 182 prp->pr_domain->dom_family != PF_INET && 183 prp->pr_domain->dom_family != PF_INET6 && 184 prp->pr_domain->dom_family != PF_ROUTE) { 185 return (EPROTONOSUPPORT); 186 } 187 188 if (prp->pr_type != type) 189 return (EPROTOTYPE); 190 so = soalloc(p != 0); 191 if (so == 0) 192 return (ENOBUFS); 193 194 /* 195 * Set a default port for protocol processing. No action will occur 196 * on the socket on this port until an inpcb is attached to it and 197 * is able to match incoming packets, or until the socket becomes 198 * available to userland. 199 */ 200 so->so_port = cpu0_soport(so, NULL, NULL); 201 202 TAILQ_INIT(&so->so_incomp); 203 TAILQ_INIT(&so->so_comp); 204 so->so_type = type; 205 so->so_cred = crhold(p->p_ucred); 206 so->so_proto = prp; 207 ai.sb_rlimit = &p->p_rlimit[RLIMIT_SBSIZE]; 208 ai.p_ucred = p->p_ucred; 209 ai.fd_rdir = p->p_fd->fd_rdir; 210 211 /* 212 * Auto-sizing of socket buffers is managed by the protocols and 213 * the appropriate flags must be set in the pru_attach function. 214 */ 215 error = so_pru_attach(so, proto, &ai); 216 if (error) { 217 so->so_state |= SS_NOFDREF; 218 sofree(so); 219 return (error); 220 } 221 222 *aso = so; 223 return (0); 224 } 225 226 int 227 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 228 { 229 int error; 230 231 crit_enter(); 232 error = so_pru_bind(so, nam, td); 233 crit_exit(); 234 return (error); 235 } 236 237 void 238 sodealloc(struct socket *so) 239 { 240 if (so->so_rcv.ssb_hiwat) 241 (void)chgsbsize(so->so_cred->cr_uidinfo, 242 &so->so_rcv.ssb_hiwat, 0, RLIM_INFINITY); 243 if (so->so_snd.ssb_hiwat) 244 (void)chgsbsize(so->so_cred->cr_uidinfo, 245 &so->so_snd.ssb_hiwat, 0, RLIM_INFINITY); 246 #ifdef INET 247 /* remove accept filter if present */ 248 if (so->so_accf != NULL) 249 do_setopt_accept_filter(so, NULL); 250 #endif /* INET */ 251 crfree(so->so_cred); 252 kfree(so, M_SOCKET); 253 } 254 255 int 256 solisten(struct socket *so, int backlog, struct thread *td) 257 { 258 int error; 259 #ifdef SCTP 260 short oldopt, oldqlimit; 261 #endif /* SCTP */ 262 263 crit_enter(); 264 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) { 265 crit_exit(); 266 return (EINVAL); 267 } 268 269 #ifdef SCTP 270 oldopt = so->so_options; 271 oldqlimit = so->so_qlimit; 272 #endif /* SCTP */ 273 274 if (TAILQ_EMPTY(&so->so_comp)) 275 so->so_options |= SO_ACCEPTCONN; 276 if (backlog < 0 || backlog > somaxconn) 277 backlog = somaxconn; 278 so->so_qlimit = backlog; 279 /* SCTP needs to look at tweak both the inbound backlog parameter AND 280 * the so_options (UDP model both connect's and gets inbound 281 * connections .. implicitly). 282 */ 283 error = so_pru_listen(so, td); 284 if (error) { 285 #ifdef SCTP 286 /* Restore the params */ 287 so->so_options = oldopt; 288 so->so_qlimit = oldqlimit; 289 #endif /* SCTP */ 290 crit_exit(); 291 return (error); 292 } 293 crit_exit(); 294 return (0); 295 } 296 297 /* 298 * Destroy a disconnected socket. This routine is a NOP if entities 299 * still have a reference on the socket: 300 * 301 * so_pcb - The protocol stack still has a reference 302 * SS_NOFDREF - There is no longer a file pointer reference 303 * SS_ABORTING - An abort netmsg is in-flight 304 */ 305 void 306 sofree(struct socket *so) 307 { 308 struct socket *head = so->so_head; 309 310 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) 311 return; 312 if (so->so_state & SS_ABORTING) 313 return; 314 if (head != NULL) { 315 if (so->so_state & SS_INCOMP) { 316 TAILQ_REMOVE(&head->so_incomp, so, so_list); 317 head->so_incqlen--; 318 } else if (so->so_state & SS_COMP) { 319 /* 320 * We must not decommission a socket that's 321 * on the accept(2) queue. If we do, then 322 * accept(2) may hang after select(2) indicated 323 * that the listening socket was ready. 324 */ 325 return; 326 } else { 327 panic("sofree: not queued"); 328 } 329 so->so_state &= ~SS_INCOMP; 330 so->so_head = NULL; 331 } 332 ssb_release(&so->so_snd, so); 333 sorflush(so); 334 sodealloc(so); 335 } 336 337 /* 338 * Close a socket on last file table reference removal. 339 * Initiate disconnect if connected. 340 * Free socket when disconnect complete. 341 */ 342 int 343 soclose(struct socket *so, int fflag) 344 { 345 int error = 0; 346 347 crit_enter(); 348 funsetown(so->so_sigio); 349 if (so->so_pcb == NULL) 350 goto discard; 351 if (so->so_state & SS_ISCONNECTED) { 352 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 353 error = sodisconnect(so); 354 if (error) 355 goto drop; 356 } 357 if (so->so_options & SO_LINGER) { 358 if ((so->so_state & SS_ISDISCONNECTING) && 359 (fflag & FNONBLOCK)) 360 goto drop; 361 while (so->so_state & SS_ISCONNECTED) { 362 error = tsleep((caddr_t)&so->so_timeo, 363 PCATCH, "soclos", so->so_linger * hz); 364 if (error) 365 break; 366 } 367 } 368 } 369 drop: 370 if (so->so_pcb) { 371 int error2; 372 373 error2 = so_pru_detach(so); 374 if (error == 0) 375 error = error2; 376 } 377 discard: 378 if (so->so_options & SO_ACCEPTCONN) { 379 struct socket *sp; 380 381 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 382 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 383 sp->so_state &= ~SS_INCOMP; 384 sp->so_head = NULL; 385 so->so_incqlen--; 386 soaborta(sp); 387 } 388 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 389 TAILQ_REMOVE(&so->so_comp, sp, so_list); 390 sp->so_state &= ~SS_COMP; 391 sp->so_head = NULL; 392 so->so_qlen--; 393 soaborta(sp); 394 } 395 } 396 if (so->so_state & SS_NOFDREF) 397 panic("soclose: NOFDREF"); 398 so->so_state |= SS_NOFDREF; 399 sofree(so); 400 crit_exit(); 401 return (error); 402 } 403 404 /* 405 * Abort and destroy a socket. Only one abort can be in progress 406 * at any given moment. 407 */ 408 void 409 soabort(struct socket *so) 410 { 411 if ((so->so_state & SS_ABORTING) == 0) { 412 so->so_state |= SS_ABORTING; 413 so_pru_abort(so); 414 } 415 } 416 417 void 418 soaborta(struct socket *so) 419 { 420 if ((so->so_state & SS_ABORTING) == 0) { 421 so->so_state |= SS_ABORTING; 422 so_pru_aborta(so); 423 } 424 } 425 426 void 427 soabort_oncpu(struct socket *so) 428 { 429 if ((so->so_state & SS_ABORTING) == 0) { 430 so->so_state |= SS_ABORTING; 431 so_pru_abort_oncpu(so); 432 } 433 } 434 435 int 436 soaccept(struct socket *so, struct sockaddr **nam) 437 { 438 int error; 439 440 crit_enter(); 441 if ((so->so_state & SS_NOFDREF) == 0) 442 panic("soaccept: !NOFDREF"); 443 so->so_state &= ~SS_NOFDREF; 444 error = so_pru_accept(so, nam); 445 crit_exit(); 446 return (error); 447 } 448 449 int 450 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 451 { 452 int error; 453 454 if (so->so_options & SO_ACCEPTCONN) 455 return (EOPNOTSUPP); 456 crit_enter(); 457 /* 458 * If protocol is connection-based, can only connect once. 459 * Otherwise, if connected, try to disconnect first. 460 * This allows user to disconnect by connecting to, e.g., 461 * a null address. 462 */ 463 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 464 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 465 (error = sodisconnect(so)))) { 466 error = EISCONN; 467 } else { 468 /* 469 * Prevent accumulated error from previous connection 470 * from biting us. 471 */ 472 so->so_error = 0; 473 error = so_pru_connect(so, nam, td); 474 } 475 crit_exit(); 476 return (error); 477 } 478 479 int 480 soconnect2(struct socket *so1, struct socket *so2) 481 { 482 int error; 483 484 crit_enter(); 485 error = so_pru_connect2(so1, so2); 486 crit_exit(); 487 return (error); 488 } 489 490 int 491 sodisconnect(struct socket *so) 492 { 493 int error; 494 495 crit_enter(); 496 if ((so->so_state & SS_ISCONNECTED) == 0) { 497 error = ENOTCONN; 498 goto bad; 499 } 500 if (so->so_state & SS_ISDISCONNECTING) { 501 error = EALREADY; 502 goto bad; 503 } 504 error = so_pru_disconnect(so); 505 bad: 506 crit_exit(); 507 return (error); 508 } 509 510 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 511 /* 512 * Send on a socket. 513 * If send must go all at once and message is larger than 514 * send buffering, then hard error. 515 * Lock against other senders. 516 * If must go all at once and not enough room now, then 517 * inform user that this would block and do nothing. 518 * Otherwise, if nonblocking, send as much as possible. 519 * The data to be sent is described by "uio" if nonzero, 520 * otherwise by the mbuf chain "top" (which must be null 521 * if uio is not). Data provided in mbuf chain must be small 522 * enough to send all at once. 523 * 524 * Returns nonzero on error, timeout or signal; callers 525 * must check for short counts if EINTR/ERESTART are returned. 526 * Data and control buffers are freed on return. 527 */ 528 int 529 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 530 struct mbuf *top, struct mbuf *control, int flags, 531 struct thread *td) 532 { 533 struct mbuf **mp; 534 struct mbuf *m; 535 size_t resid; 536 int space, len; 537 int clen = 0, error, dontroute, mlen; 538 int atomic = sosendallatonce(so) || top; 539 int pru_flags; 540 541 if (uio) 542 resid = uio->uio_resid; 543 else 544 resid = (size_t)top->m_pkthdr.len; 545 546 /* 547 * WARNING! resid is unsigned, space and len are signed. space 548 * can wind up negative if the sockbuf is overcommitted. 549 * 550 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 551 * type sockets since that's an error. 552 */ 553 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { 554 error = EINVAL; 555 goto out; 556 } 557 558 dontroute = 559 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 560 (so->so_proto->pr_flags & PR_ATOMIC); 561 if (td->td_lwp != NULL) 562 td->td_lwp->lwp_ru.ru_msgsnd++; 563 if (control) 564 clen = control->m_len; 565 #define gotoerr(errcode) { error = errcode; crit_exit(); goto release; } 566 567 restart: 568 error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags)); 569 if (error) 570 goto out; 571 572 do { 573 crit_enter(); 574 if (so->so_state & SS_CANTSENDMORE) 575 gotoerr(EPIPE); 576 if (so->so_error) { 577 error = so->so_error; 578 so->so_error = 0; 579 crit_exit(); 580 goto release; 581 } 582 if ((so->so_state & SS_ISCONNECTED) == 0) { 583 /* 584 * `sendto' and `sendmsg' is allowed on a connection- 585 * based socket if it supports implied connect. 586 * Return ENOTCONN if not connected and no address is 587 * supplied. 588 */ 589 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 590 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 591 if ((so->so_state & SS_ISCONFIRMING) == 0 && 592 !(resid == 0 && clen != 0)) 593 gotoerr(ENOTCONN); 594 } else if (addr == 0) 595 gotoerr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 596 ENOTCONN : EDESTADDRREQ); 597 } 598 if ((atomic && resid > so->so_snd.ssb_hiwat) || 599 clen > so->so_snd.ssb_hiwat) { 600 gotoerr(EMSGSIZE); 601 } 602 space = ssb_space(&so->so_snd); 603 if (flags & MSG_OOB) 604 space += 1024; 605 if ((space < 0 || (size_t)space < resid + clen) && uio && 606 (atomic || space < so->so_snd.ssb_lowat || space < clen)) { 607 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) 608 gotoerr(EWOULDBLOCK); 609 ssb_unlock(&so->so_snd); 610 error = ssb_wait(&so->so_snd); 611 crit_exit(); 612 if (error) 613 goto out; 614 goto restart; 615 } 616 crit_exit(); 617 mp = ⊤ 618 space -= clen; 619 do { 620 if (uio == NULL) { 621 /* 622 * Data is prepackaged in "top". 623 */ 624 resid = 0; 625 if (flags & MSG_EOR) 626 top->m_flags |= M_EOR; 627 } else do { 628 if (resid > INT_MAX) 629 resid = INT_MAX; 630 m = m_getl((int)resid, MB_WAIT, MT_DATA, 631 top == NULL ? M_PKTHDR : 0, &mlen); 632 if (top == NULL) { 633 m->m_pkthdr.len = 0; 634 m->m_pkthdr.rcvif = NULL; 635 } 636 len = imin((int)szmin(mlen, resid), space); 637 if (resid < MINCLSIZE) { 638 /* 639 * For datagram protocols, leave room 640 * for protocol headers in first mbuf. 641 */ 642 if (atomic && top == 0 && len < mlen) 643 MH_ALIGN(m, len); 644 } 645 space -= len; 646 error = uiomove(mtod(m, caddr_t), (size_t)len, uio); 647 resid = uio->uio_resid; 648 m->m_len = len; 649 *mp = m; 650 top->m_pkthdr.len += len; 651 if (error) 652 goto release; 653 mp = &m->m_next; 654 if (resid == 0) { 655 if (flags & MSG_EOR) 656 top->m_flags |= M_EOR; 657 break; 658 } 659 } while (space > 0 && atomic); 660 if (dontroute) 661 so->so_options |= SO_DONTROUTE; 662 if (flags & MSG_OOB) { 663 pru_flags = PRUS_OOB; 664 } else if ((flags & MSG_EOF) && 665 (so->so_proto->pr_flags & PR_IMPLOPCL) && 666 (resid == 0)) { 667 /* 668 * If the user set MSG_EOF, the protocol 669 * understands this flag and nothing left to 670 * send then use PRU_SEND_EOF instead of PRU_SEND. 671 */ 672 pru_flags = PRUS_EOF; 673 } else if (resid > 0 && space > 0) { 674 /* If there is more to send, set PRUS_MORETOCOME */ 675 pru_flags = PRUS_MORETOCOME; 676 } else { 677 pru_flags = 0; 678 } 679 crit_enter(); 680 /* 681 * XXX all the SS_CANTSENDMORE checks previously 682 * done could be out of date. We could have recieved 683 * a reset packet in an interrupt or maybe we slept 684 * while doing page faults in uiomove() etc. We could 685 * probably recheck again inside the splnet() protection 686 * here, but there are probably other places that this 687 * also happens. We must rethink this. 688 */ 689 error = so_pru_send(so, pru_flags, top, addr, control, td); 690 crit_exit(); 691 if (dontroute) 692 so->so_options &= ~SO_DONTROUTE; 693 clen = 0; 694 control = 0; 695 top = 0; 696 mp = ⊤ 697 if (error) 698 goto release; 699 } while (resid && space > 0); 700 } while (resid); 701 702 release: 703 ssb_unlock(&so->so_snd); 704 out: 705 if (top) 706 m_freem(top); 707 if (control) 708 m_freem(control); 709 return (error); 710 } 711 712 /* 713 * A specialization of sosend() for UDP based on protocol-specific knowledge: 714 * so->so_proto->pr_flags has the PR_ATOMIC field set. This means that 715 * sosendallatonce() returns true, 716 * the "atomic" variable is true, 717 * and sosendudp() blocks until space is available for the entire send. 718 * so->so_proto->pr_flags does not have the PR_CONNREQUIRED or 719 * PR_IMPLOPCL flags set. 720 * UDP has no out-of-band data. 721 * UDP has no control data. 722 * UDP does not support MSG_EOR. 723 */ 724 int 725 sosendudp(struct socket *so, struct sockaddr *addr, struct uio *uio, 726 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 727 { 728 boolean_t dontroute; /* temporary SO_DONTROUTE setting */ 729 size_t resid; 730 int error; 731 int space; 732 733 if (td->td_lwp != NULL) 734 td->td_lwp->lwp_ru.ru_msgsnd++; 735 if (control) 736 m_freem(control); 737 738 KASSERT((uio && !top) || (top && !uio), ("bad arguments to sosendudp")); 739 resid = uio ? uio->uio_resid : (size_t)top->m_pkthdr.len; 740 741 restart: 742 error = ssb_lock(&so->so_snd, SBLOCKWAIT(flags)); 743 if (error) 744 goto out; 745 746 crit_enter(); 747 if (so->so_state & SS_CANTSENDMORE) 748 gotoerr(EPIPE); 749 if (so->so_error) { 750 error = so->so_error; 751 so->so_error = 0; 752 crit_exit(); 753 goto release; 754 } 755 if (!(so->so_state & SS_ISCONNECTED) && addr == NULL) 756 gotoerr(EDESTADDRREQ); 757 if (resid > so->so_snd.ssb_hiwat) 758 gotoerr(EMSGSIZE); 759 space = ssb_space(&so->so_snd); 760 if (uio && (space < 0 || (size_t)space < resid)) { 761 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) 762 gotoerr(EWOULDBLOCK); 763 ssb_unlock(&so->so_snd); 764 error = ssb_wait(&so->so_snd); 765 crit_exit(); 766 if (error) 767 goto out; 768 goto restart; 769 } 770 crit_exit(); 771 772 if (uio) { 773 top = m_uiomove(uio); 774 if (top == NULL) 775 goto release; 776 } 777 778 dontroute = (flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE); 779 if (dontroute) 780 so->so_options |= SO_DONTROUTE; 781 782 error = so_pru_send(so, 0, top, addr, NULL, td); 783 top = NULL; /* sent or freed in lower layer */ 784 785 if (dontroute) 786 so->so_options &= ~SO_DONTROUTE; 787 788 release: 789 ssb_unlock(&so->so_snd); 790 out: 791 if (top) 792 m_freem(top); 793 return (error); 794 } 795 796 /* 797 * Implement receive operations on a socket. 798 * We depend on the way that records are added to the signalsockbuf 799 * by sbappend*. In particular, each record (mbufs linked through m_next) 800 * must begin with an address if the protocol so specifies, 801 * followed by an optional mbuf or mbufs containing ancillary data, 802 * and then zero or more mbufs of data. 803 * In order to avoid blocking network interrupts for the entire time here, 804 * we exit the critical section while doing the actual copy to user space. 805 * Although the signalsockbuf is locked, new data may still be appended, 806 * and thus we must maintain consistency of the signalsockbuf during that time. 807 * 808 * The caller may receive the data as a single mbuf chain by supplying 809 * an mbuf **mp0 for use in returning the chain. The uio is then used 810 * only for the count in uio_resid. 811 */ 812 int 813 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 814 struct sockbuf *sio, struct mbuf **controlp, int *flagsp) 815 { 816 struct mbuf *m, *n; 817 struct mbuf *free_chain = NULL; 818 int flags, len, error, offset; 819 struct protosw *pr = so->so_proto; 820 int moff, type = 0; 821 size_t resid, orig_resid; 822 823 if (uio) 824 resid = uio->uio_resid; 825 else 826 resid = (size_t)(sio->sb_climit - sio->sb_cc); 827 orig_resid = resid; 828 829 if (psa) 830 *psa = NULL; 831 if (controlp) 832 *controlp = NULL; 833 if (flagsp) 834 flags = *flagsp &~ MSG_EOR; 835 else 836 flags = 0; 837 if (flags & MSG_OOB) { 838 m = m_get(MB_WAIT, MT_DATA); 839 if (m == NULL) 840 return (ENOBUFS); 841 error = so_pru_rcvoob(so, m, flags & MSG_PEEK); 842 if (error) 843 goto bad; 844 if (sio) { 845 do { 846 sbappend(sio, m); 847 KKASSERT(resid >= (size_t)m->m_len); 848 resid -= (size_t)m->m_len; 849 } while (resid > 0 && m); 850 } else { 851 do { 852 uio->uio_resid = resid; 853 error = uiomove(mtod(m, caddr_t), 854 (int)szmin(resid, m->m_len), 855 uio); 856 resid = uio->uio_resid; 857 m = m_free(m); 858 } while (uio->uio_resid && error == 0 && m); 859 } 860 bad: 861 if (m) 862 m_freem(m); 863 return (error); 864 } 865 if ((so->so_state & SS_ISCONFIRMING) && resid) 866 so_pru_rcvd(so, 0); 867 868 restart: 869 crit_enter(); 870 error = ssb_lock(&so->so_rcv, SBLOCKWAIT(flags)); 871 if (error) 872 goto done; 873 874 m = so->so_rcv.ssb_mb; 875 /* 876 * If we have less data than requested, block awaiting more 877 * (subject to any timeout) if: 878 * 1. the current count is less than the low water mark, or 879 * 2. MSG_WAITALL is set, and it is possible to do the entire 880 * receive operation at once if we block (resid <= hiwat). 881 * 3. MSG_DONTWAIT is not set 882 * If MSG_WAITALL is set but resid is larger than the receive buffer, 883 * we have to do the receive in sections, and thus risk returning 884 * a short count if a timeout or signal occurs after we start. 885 */ 886 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 887 (size_t)so->so_rcv.ssb_cc < resid) && 888 (so->so_rcv.ssb_cc < so->so_rcv.ssb_lowat || 889 ((flags & MSG_WAITALL) && resid <= (size_t)so->so_rcv.ssb_hiwat)) && 890 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { 891 KASSERT(m != NULL || !so->so_rcv.ssb_cc, ("receive 1")); 892 if (so->so_error) { 893 if (m) 894 goto dontblock; 895 error = so->so_error; 896 if ((flags & MSG_PEEK) == 0) 897 so->so_error = 0; 898 goto release; 899 } 900 if (so->so_state & SS_CANTRCVMORE) { 901 if (m) 902 goto dontblock; 903 else 904 goto release; 905 } 906 for (; m; m = m->m_next) { 907 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 908 m = so->so_rcv.ssb_mb; 909 goto dontblock; 910 } 911 } 912 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 913 (pr->pr_flags & PR_CONNREQUIRED)) { 914 error = ENOTCONN; 915 goto release; 916 } 917 if (resid == 0) 918 goto release; 919 if (flags & (MSG_FNONBLOCKING|MSG_DONTWAIT)) { 920 error = EWOULDBLOCK; 921 goto release; 922 } 923 ssb_unlock(&so->so_rcv); 924 error = ssb_wait(&so->so_rcv); 925 if (error) 926 goto done; 927 crit_exit(); 928 goto restart; 929 } 930 dontblock: 931 if (uio && uio->uio_td && uio->uio_td->td_proc) 932 uio->uio_td->td_lwp->lwp_ru.ru_msgrcv++; 933 934 /* 935 * note: m should be == sb_mb here. Cache the next record while 936 * cleaning up. Note that calling m_free*() will break out critical 937 * section. 938 */ 939 KKASSERT(m == so->so_rcv.ssb_mb); 940 941 /* 942 * Skip any address mbufs prepending the record. 943 */ 944 if (pr->pr_flags & PR_ADDR) { 945 KASSERT(m->m_type == MT_SONAME, ("receive 1a")); 946 orig_resid = 0; 947 if (psa) 948 *psa = dup_sockaddr(mtod(m, struct sockaddr *)); 949 if (flags & MSG_PEEK) 950 m = m->m_next; 951 else 952 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 953 } 954 955 /* 956 * Skip any control mbufs prepending the record. 957 */ 958 #ifdef SCTP 959 if (pr->pr_flags & PR_ADDR_OPT) { 960 /* 961 * For SCTP we may be getting a 962 * whole message OR a partial delivery. 963 */ 964 if (m && m->m_type == MT_SONAME) { 965 orig_resid = 0; 966 if (psa) 967 *psa = dup_sockaddr(mtod(m, struct sockaddr *)); 968 if (flags & MSG_PEEK) 969 m = m->m_next; 970 else 971 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 972 } 973 } 974 #endif /* SCTP */ 975 while (m && m->m_type == MT_CONTROL && error == 0) { 976 if (flags & MSG_PEEK) { 977 if (controlp) 978 *controlp = m_copy(m, 0, m->m_len); 979 m = m->m_next; /* XXX race */ 980 } else { 981 if (controlp) { 982 n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL); 983 if (pr->pr_domain->dom_externalize && 984 mtod(m, struct cmsghdr *)->cmsg_type == 985 SCM_RIGHTS) 986 error = (*pr->pr_domain->dom_externalize)(m); 987 *controlp = m; 988 m = n; 989 } else { 990 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 991 } 992 } 993 if (controlp && *controlp) { 994 orig_resid = 0; 995 controlp = &(*controlp)->m_next; 996 } 997 } 998 999 /* 1000 * flag OOB data. 1001 */ 1002 if (m) { 1003 type = m->m_type; 1004 if (type == MT_OOBDATA) 1005 flags |= MSG_OOB; 1006 } 1007 1008 /* 1009 * Copy to the UIO or mbuf return chain (*mp). 1010 */ 1011 moff = 0; 1012 offset = 0; 1013 while (m && resid > 0 && error == 0) { 1014 if (m->m_type == MT_OOBDATA) { 1015 if (type != MT_OOBDATA) 1016 break; 1017 } else if (type == MT_OOBDATA) 1018 break; 1019 else 1020 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, 1021 ("receive 3")); 1022 so->so_state &= ~SS_RCVATMARK; 1023 len = (resid > INT_MAX) ? INT_MAX : resid; 1024 if (so->so_oobmark && len > so->so_oobmark - offset) 1025 len = so->so_oobmark - offset; 1026 if (len > m->m_len - moff) 1027 len = m->m_len - moff; 1028 1029 /* 1030 * Copy out to the UIO or pass the mbufs back to the SIO. 1031 * The SIO is dealt with when we eat the mbuf, but deal 1032 * with the resid here either way. 1033 */ 1034 if (uio) { 1035 crit_exit(); 1036 uio->uio_resid = resid; 1037 error = uiomove(mtod(m, caddr_t) + moff, len, uio); 1038 resid = uio->uio_resid; 1039 crit_enter(); 1040 if (error) 1041 goto release; 1042 } else { 1043 resid -= (size_t)len; 1044 } 1045 1046 /* 1047 * Eat the entire mbuf or just a piece of it 1048 */ 1049 if (len == m->m_len - moff) { 1050 if (m->m_flags & M_EOR) 1051 flags |= MSG_EOR; 1052 #ifdef SCTP 1053 if (m->m_flags & M_NOTIFICATION) 1054 flags |= MSG_NOTIFICATION; 1055 #endif /* SCTP */ 1056 if (flags & MSG_PEEK) { 1057 m = m->m_next; 1058 moff = 0; 1059 } else { 1060 if (sio) { 1061 n = sbunlinkmbuf(&so->so_rcv.sb, m, NULL); 1062 sbappend(sio, m); 1063 m = n; 1064 } else { 1065 m = sbunlinkmbuf(&so->so_rcv.sb, m, &free_chain); 1066 } 1067 } 1068 } else { 1069 if (flags & MSG_PEEK) { 1070 moff += len; 1071 } else { 1072 if (sio) { 1073 n = m_copym(m, 0, len, MB_WAIT); 1074 if (n) 1075 sbappend(sio, n); 1076 } 1077 m->m_data += len; 1078 m->m_len -= len; 1079 so->so_rcv.ssb_cc -= len; 1080 } 1081 } 1082 if (so->so_oobmark) { 1083 if ((flags & MSG_PEEK) == 0) { 1084 so->so_oobmark -= len; 1085 if (so->so_oobmark == 0) { 1086 so->so_state |= SS_RCVATMARK; 1087 break; 1088 } 1089 } else { 1090 offset += len; 1091 if (offset == so->so_oobmark) 1092 break; 1093 } 1094 } 1095 if (flags & MSG_EOR) 1096 break; 1097 /* 1098 * If the MSG_WAITALL flag is set (for non-atomic socket), 1099 * we must not quit until resid == 0 or an error 1100 * termination. If a signal/timeout occurs, return 1101 * with a short count but without error. 1102 * Keep signalsockbuf locked against other readers. 1103 */ 1104 while ((flags & MSG_WAITALL) && m == NULL && 1105 resid > 0 && !sosendallatonce(so) && 1106 so->so_rcv.ssb_mb == NULL) { 1107 if (so->so_error || so->so_state & SS_CANTRCVMORE) 1108 break; 1109 /* 1110 * The window might have closed to zero, make 1111 * sure we send an ack now that we've drained 1112 * the buffer or we might end up blocking until 1113 * the idle takes over (5 seconds). 1114 */ 1115 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 1116 so_pru_rcvd(so, flags); 1117 error = ssb_wait(&so->so_rcv); 1118 if (error) { 1119 ssb_unlock(&so->so_rcv); 1120 error = 0; 1121 goto done; 1122 } 1123 m = so->so_rcv.ssb_mb; 1124 } 1125 } 1126 1127 /* 1128 * If an atomic read was requested but unread data still remains 1129 * in the record, set MSG_TRUNC. 1130 */ 1131 if (m && pr->pr_flags & PR_ATOMIC) 1132 flags |= MSG_TRUNC; 1133 1134 /* 1135 * Cleanup. If an atomic read was requested drop any unread data. 1136 */ 1137 if ((flags & MSG_PEEK) == 0) { 1138 if (m && (pr->pr_flags & PR_ATOMIC)) 1139 sbdroprecord(&so->so_rcv.sb); 1140 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) 1141 so_pru_rcvd(so, flags); 1142 } 1143 1144 if (orig_resid == resid && orig_resid && 1145 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 1146 ssb_unlock(&so->so_rcv); 1147 crit_exit(); 1148 goto restart; 1149 } 1150 1151 if (flagsp) 1152 *flagsp |= flags; 1153 release: 1154 ssb_unlock(&so->so_rcv); 1155 done: 1156 crit_exit(); 1157 if (free_chain) 1158 m_freem(free_chain); 1159 return (error); 1160 } 1161 1162 int 1163 soshutdown(struct socket *so, int how) 1164 { 1165 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1166 return (EINVAL); 1167 1168 if (how != SHUT_WR) 1169 sorflush(so); 1170 if (how != SHUT_RD) 1171 return (so_pru_shutdown(so)); 1172 return (0); 1173 } 1174 1175 void 1176 sorflush(struct socket *so) 1177 { 1178 struct signalsockbuf *ssb = &so->so_rcv; 1179 struct protosw *pr = so->so_proto; 1180 struct signalsockbuf asb; 1181 1182 ssb->ssb_flags |= SSB_NOINTR; 1183 (void) ssb_lock(ssb, M_WAITOK); 1184 1185 crit_enter(); 1186 socantrcvmore(so); 1187 ssb_unlock(ssb); 1188 asb = *ssb; 1189 bzero((caddr_t)ssb, sizeof (*ssb)); 1190 if (asb.ssb_flags & SSB_KNOTE) { 1191 ssb->ssb_sel.si_note = asb.ssb_sel.si_note; 1192 ssb->ssb_flags = SSB_KNOTE; 1193 } 1194 crit_exit(); 1195 1196 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 1197 (*pr->pr_domain->dom_dispose)(asb.ssb_mb); 1198 ssb_release(&asb, so); 1199 } 1200 1201 #ifdef INET 1202 static int 1203 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) 1204 { 1205 struct accept_filter_arg *afap = NULL; 1206 struct accept_filter *afp; 1207 struct so_accf *af = so->so_accf; 1208 int error = 0; 1209 1210 /* do not set/remove accept filters on non listen sockets */ 1211 if ((so->so_options & SO_ACCEPTCONN) == 0) { 1212 error = EINVAL; 1213 goto out; 1214 } 1215 1216 /* removing the filter */ 1217 if (sopt == NULL) { 1218 if (af != NULL) { 1219 if (af->so_accept_filter != NULL && 1220 af->so_accept_filter->accf_destroy != NULL) { 1221 af->so_accept_filter->accf_destroy(so); 1222 } 1223 if (af->so_accept_filter_str != NULL) { 1224 FREE(af->so_accept_filter_str, M_ACCF); 1225 } 1226 FREE(af, M_ACCF); 1227 so->so_accf = NULL; 1228 } 1229 so->so_options &= ~SO_ACCEPTFILTER; 1230 return (0); 1231 } 1232 /* adding a filter */ 1233 /* must remove previous filter first */ 1234 if (af != NULL) { 1235 error = EINVAL; 1236 goto out; 1237 } 1238 /* don't put large objects on the kernel stack */ 1239 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK); 1240 error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); 1241 afap->af_name[sizeof(afap->af_name)-1] = '\0'; 1242 afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; 1243 if (error) 1244 goto out; 1245 afp = accept_filt_get(afap->af_name); 1246 if (afp == NULL) { 1247 error = ENOENT; 1248 goto out; 1249 } 1250 MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO); 1251 if (afp->accf_create != NULL) { 1252 if (afap->af_name[0] != '\0') { 1253 int len = strlen(afap->af_name) + 1; 1254 1255 MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK); 1256 strcpy(af->so_accept_filter_str, afap->af_name); 1257 } 1258 af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg); 1259 if (af->so_accept_filter_arg == NULL) { 1260 FREE(af->so_accept_filter_str, M_ACCF); 1261 FREE(af, M_ACCF); 1262 so->so_accf = NULL; 1263 error = EINVAL; 1264 goto out; 1265 } 1266 } 1267 af->so_accept_filter = afp; 1268 so->so_accf = af; 1269 so->so_options |= SO_ACCEPTFILTER; 1270 out: 1271 if (afap != NULL) 1272 FREE(afap, M_TEMP); 1273 return (error); 1274 } 1275 #endif /* INET */ 1276 1277 /* 1278 * Perhaps this routine, and sooptcopyout(), below, ought to come in 1279 * an additional variant to handle the case where the option value needs 1280 * to be some kind of integer, but not a specific size. 1281 * In addition to their use here, these functions are also called by the 1282 * protocol-level pr_ctloutput() routines. 1283 */ 1284 int 1285 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 1286 { 1287 return soopt_to_kbuf(sopt, buf, len, minlen); 1288 } 1289 1290 int 1291 soopt_to_kbuf(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 1292 { 1293 size_t valsize; 1294 1295 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1296 KKASSERT(kva_p(buf)); 1297 1298 /* 1299 * If the user gives us more than we wanted, we ignore it, 1300 * but if we don't get the minimum length the caller 1301 * wants, we return EINVAL. On success, sopt->sopt_valsize 1302 * is set to however much we actually retrieved. 1303 */ 1304 if ((valsize = sopt->sopt_valsize) < minlen) 1305 return EINVAL; 1306 if (valsize > len) 1307 sopt->sopt_valsize = valsize = len; 1308 1309 bcopy(sopt->sopt_val, buf, valsize); 1310 return 0; 1311 } 1312 1313 1314 int 1315 sosetopt(struct socket *so, struct sockopt *sopt) 1316 { 1317 int error, optval; 1318 struct linger l; 1319 struct timeval tv; 1320 u_long val; 1321 1322 error = 0; 1323 sopt->sopt_dir = SOPT_SET; 1324 if (sopt->sopt_level != SOL_SOCKET) { 1325 if (so->so_proto && so->so_proto->pr_ctloutput) { 1326 return (so_pru_ctloutput(so, sopt)); 1327 } 1328 error = ENOPROTOOPT; 1329 } else { 1330 switch (sopt->sopt_name) { 1331 #ifdef INET 1332 case SO_ACCEPTFILTER: 1333 error = do_setopt_accept_filter(so, sopt); 1334 if (error) 1335 goto bad; 1336 break; 1337 #endif /* INET */ 1338 case SO_LINGER: 1339 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 1340 if (error) 1341 goto bad; 1342 1343 so->so_linger = l.l_linger; 1344 if (l.l_onoff) 1345 so->so_options |= SO_LINGER; 1346 else 1347 so->so_options &= ~SO_LINGER; 1348 break; 1349 1350 case SO_DEBUG: 1351 case SO_KEEPALIVE: 1352 case SO_DONTROUTE: 1353 case SO_USELOOPBACK: 1354 case SO_BROADCAST: 1355 case SO_REUSEADDR: 1356 case SO_REUSEPORT: 1357 case SO_OOBINLINE: 1358 case SO_TIMESTAMP: 1359 error = sooptcopyin(sopt, &optval, sizeof optval, 1360 sizeof optval); 1361 if (error) 1362 goto bad; 1363 if (optval) 1364 so->so_options |= sopt->sopt_name; 1365 else 1366 so->so_options &= ~sopt->sopt_name; 1367 break; 1368 1369 case SO_SNDBUF: 1370 case SO_RCVBUF: 1371 case SO_SNDLOWAT: 1372 case SO_RCVLOWAT: 1373 error = sooptcopyin(sopt, &optval, sizeof optval, 1374 sizeof optval); 1375 if (error) 1376 goto bad; 1377 1378 /* 1379 * Values < 1 make no sense for any of these 1380 * options, so disallow them. 1381 */ 1382 if (optval < 1) { 1383 error = EINVAL; 1384 goto bad; 1385 } 1386 1387 switch (sopt->sopt_name) { 1388 case SO_SNDBUF: 1389 case SO_RCVBUF: 1390 if (ssb_reserve(sopt->sopt_name == SO_SNDBUF ? 1391 &so->so_snd : &so->so_rcv, (u_long)optval, 1392 so, 1393 &curproc->p_rlimit[RLIMIT_SBSIZE]) == 0) { 1394 error = ENOBUFS; 1395 goto bad; 1396 } 1397 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : 1398 &so->so_rcv)->ssb_flags &= ~SSB_AUTOSIZE; 1399 break; 1400 1401 /* 1402 * Make sure the low-water is never greater than 1403 * the high-water. 1404 */ 1405 case SO_SNDLOWAT: 1406 so->so_snd.ssb_lowat = 1407 (optval > so->so_snd.ssb_hiwat) ? 1408 so->so_snd.ssb_hiwat : optval; 1409 so->so_snd.ssb_flags &= ~SSB_AUTOLOWAT; 1410 break; 1411 case SO_RCVLOWAT: 1412 so->so_rcv.ssb_lowat = 1413 (optval > so->so_rcv.ssb_hiwat) ? 1414 so->so_rcv.ssb_hiwat : optval; 1415 so->so_rcv.ssb_flags &= ~SSB_AUTOLOWAT; 1416 break; 1417 } 1418 break; 1419 1420 case SO_SNDTIMEO: 1421 case SO_RCVTIMEO: 1422 error = sooptcopyin(sopt, &tv, sizeof tv, 1423 sizeof tv); 1424 if (error) 1425 goto bad; 1426 1427 /* assert(hz > 0); */ 1428 if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz || 1429 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 1430 error = EDOM; 1431 goto bad; 1432 } 1433 /* assert(tick > 0); */ 1434 /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */ 1435 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / ustick; 1436 if (val > SHRT_MAX) { 1437 error = EDOM; 1438 goto bad; 1439 } 1440 if (val == 0 && tv.tv_usec != 0) 1441 val = 1; 1442 1443 switch (sopt->sopt_name) { 1444 case SO_SNDTIMEO: 1445 so->so_snd.ssb_timeo = val; 1446 break; 1447 case SO_RCVTIMEO: 1448 so->so_rcv.ssb_timeo = val; 1449 break; 1450 } 1451 break; 1452 default: 1453 error = ENOPROTOOPT; 1454 break; 1455 } 1456 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { 1457 (void) so_pru_ctloutput(so, sopt); 1458 } 1459 } 1460 bad: 1461 return (error); 1462 } 1463 1464 /* Helper routine for getsockopt */ 1465 int 1466 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 1467 { 1468 soopt_from_kbuf(sopt, buf, len); 1469 return 0; 1470 } 1471 1472 void 1473 soopt_from_kbuf(struct sockopt *sopt, const void *buf, size_t len) 1474 { 1475 size_t valsize; 1476 1477 if (len == 0) { 1478 sopt->sopt_valsize = 0; 1479 return; 1480 } 1481 1482 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1483 KKASSERT(kva_p(buf)); 1484 1485 /* 1486 * Documented get behavior is that we always return a value, 1487 * possibly truncated to fit in the user's buffer. 1488 * Traditional behavior is that we always tell the user 1489 * precisely how much we copied, rather than something useful 1490 * like the total amount we had available for her. 1491 * Note that this interface is not idempotent; the entire answer must 1492 * generated ahead of time. 1493 */ 1494 valsize = szmin(len, sopt->sopt_valsize); 1495 sopt->sopt_valsize = valsize; 1496 if (sopt->sopt_val != 0) { 1497 bcopy(buf, sopt->sopt_val, valsize); 1498 } 1499 } 1500 1501 int 1502 sogetopt(struct socket *so, struct sockopt *sopt) 1503 { 1504 int error, optval; 1505 struct linger l; 1506 struct timeval tv; 1507 #ifdef INET 1508 struct accept_filter_arg *afap; 1509 #endif 1510 1511 error = 0; 1512 sopt->sopt_dir = SOPT_GET; 1513 if (sopt->sopt_level != SOL_SOCKET) { 1514 if (so->so_proto && so->so_proto->pr_ctloutput) { 1515 return (so_pru_ctloutput(so, sopt)); 1516 } else 1517 return (ENOPROTOOPT); 1518 } else { 1519 switch (sopt->sopt_name) { 1520 #ifdef INET 1521 case SO_ACCEPTFILTER: 1522 if ((so->so_options & SO_ACCEPTCONN) == 0) 1523 return (EINVAL); 1524 MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), 1525 M_TEMP, M_WAITOK | M_ZERO); 1526 if ((so->so_options & SO_ACCEPTFILTER) != 0) { 1527 strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); 1528 if (so->so_accf->so_accept_filter_str != NULL) 1529 strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); 1530 } 1531 error = sooptcopyout(sopt, afap, sizeof(*afap)); 1532 FREE(afap, M_TEMP); 1533 break; 1534 #endif /* INET */ 1535 1536 case SO_LINGER: 1537 l.l_onoff = so->so_options & SO_LINGER; 1538 l.l_linger = so->so_linger; 1539 error = sooptcopyout(sopt, &l, sizeof l); 1540 break; 1541 1542 case SO_USELOOPBACK: 1543 case SO_DONTROUTE: 1544 case SO_DEBUG: 1545 case SO_KEEPALIVE: 1546 case SO_REUSEADDR: 1547 case SO_REUSEPORT: 1548 case SO_BROADCAST: 1549 case SO_OOBINLINE: 1550 case SO_TIMESTAMP: 1551 optval = so->so_options & sopt->sopt_name; 1552 integer: 1553 error = sooptcopyout(sopt, &optval, sizeof optval); 1554 break; 1555 1556 case SO_TYPE: 1557 optval = so->so_type; 1558 goto integer; 1559 1560 case SO_ERROR: 1561 optval = so->so_error; 1562 so->so_error = 0; 1563 goto integer; 1564 1565 case SO_SNDBUF: 1566 optval = so->so_snd.ssb_hiwat; 1567 goto integer; 1568 1569 case SO_RCVBUF: 1570 optval = so->so_rcv.ssb_hiwat; 1571 goto integer; 1572 1573 case SO_SNDLOWAT: 1574 optval = so->so_snd.ssb_lowat; 1575 goto integer; 1576 1577 case SO_RCVLOWAT: 1578 optval = so->so_rcv.ssb_lowat; 1579 goto integer; 1580 1581 case SO_SNDTIMEO: 1582 case SO_RCVTIMEO: 1583 optval = (sopt->sopt_name == SO_SNDTIMEO ? 1584 so->so_snd.ssb_timeo : so->so_rcv.ssb_timeo); 1585 1586 tv.tv_sec = optval / hz; 1587 tv.tv_usec = (optval % hz) * ustick; 1588 error = sooptcopyout(sopt, &tv, sizeof tv); 1589 break; 1590 1591 default: 1592 error = ENOPROTOOPT; 1593 break; 1594 } 1595 return (error); 1596 } 1597 } 1598 1599 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 1600 int 1601 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 1602 { 1603 struct mbuf *m, *m_prev; 1604 int sopt_size = sopt->sopt_valsize, msize; 1605 1606 m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, MT_DATA, 1607 0, &msize); 1608 if (m == NULL) 1609 return (ENOBUFS); 1610 m->m_len = min(msize, sopt_size); 1611 sopt_size -= m->m_len; 1612 *mp = m; 1613 m_prev = m; 1614 1615 while (sopt_size > 0) { 1616 m = m_getl(sopt_size, sopt->sopt_td ? MB_WAIT : MB_DONTWAIT, 1617 MT_DATA, 0, &msize); 1618 if (m == NULL) { 1619 m_freem(*mp); 1620 return (ENOBUFS); 1621 } 1622 m->m_len = min(msize, sopt_size); 1623 sopt_size -= m->m_len; 1624 m_prev->m_next = m; 1625 m_prev = m; 1626 } 1627 return (0); 1628 } 1629 1630 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ 1631 int 1632 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 1633 { 1634 soopt_to_mbuf(sopt, m); 1635 return 0; 1636 } 1637 1638 void 1639 soopt_to_mbuf(struct sockopt *sopt, struct mbuf *m) 1640 { 1641 size_t valsize; 1642 void *val; 1643 1644 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1645 KKASSERT(kva_p(m)); 1646 if (sopt->sopt_val == NULL) 1647 return; 1648 val = sopt->sopt_val; 1649 valsize = sopt->sopt_valsize; 1650 while (m != NULL && valsize >= m->m_len) { 1651 bcopy(val, mtod(m, char *), m->m_len); 1652 valsize -= m->m_len; 1653 val = (caddr_t)val + m->m_len; 1654 m = m->m_next; 1655 } 1656 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 1657 panic("ip6_sooptmcopyin"); 1658 } 1659 1660 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ 1661 int 1662 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 1663 { 1664 return soopt_from_mbuf(sopt, m); 1665 } 1666 1667 int 1668 soopt_from_mbuf(struct sockopt *sopt, struct mbuf *m) 1669 { 1670 struct mbuf *m0 = m; 1671 size_t valsize = 0; 1672 size_t maxsize; 1673 void *val; 1674 1675 KKASSERT(!sopt->sopt_val || kva_p(sopt->sopt_val)); 1676 KKASSERT(kva_p(m)); 1677 if (sopt->sopt_val == NULL) 1678 return 0; 1679 val = sopt->sopt_val; 1680 maxsize = sopt->sopt_valsize; 1681 while (m != NULL && maxsize >= m->m_len) { 1682 bcopy(mtod(m, char *), val, m->m_len); 1683 maxsize -= m->m_len; 1684 val = (caddr_t)val + m->m_len; 1685 valsize += m->m_len; 1686 m = m->m_next; 1687 } 1688 if (m != NULL) { 1689 /* enough soopt buffer should be given from user-land */ 1690 m_freem(m0); 1691 return (EINVAL); 1692 } 1693 sopt->sopt_valsize = valsize; 1694 return 0; 1695 } 1696 1697 void 1698 sohasoutofband(struct socket *so) 1699 { 1700 if (so->so_sigio != NULL) 1701 pgsigio(so->so_sigio, SIGURG, 0); 1702 selwakeup(&so->so_rcv.ssb_sel); 1703 KNOTE(&so->so_rcv.ssb_sel.si_note, NOTE_OOB); 1704 } 1705 1706 int 1707 sopoll(struct socket *so, int events, struct ucred *cred, struct thread *td) 1708 { 1709 int revents = 0; 1710 1711 crit_enter(); 1712 1713 if (events & (POLLIN | POLLRDNORM)) 1714 if (soreadable(so)) 1715 revents |= events & (POLLIN | POLLRDNORM); 1716 1717 if (events & POLLINIGNEOF) 1718 if (so->so_rcv.ssb_cc >= so->so_rcv.ssb_lowat || 1719 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 1720 revents |= POLLINIGNEOF; 1721 1722 if (events & (POLLOUT | POLLWRNORM)) 1723 if (sowriteable(so)) 1724 revents |= events & (POLLOUT | POLLWRNORM); 1725 1726 if (events & (POLLPRI | POLLRDBAND)) 1727 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) 1728 revents |= events & (POLLPRI | POLLRDBAND); 1729 1730 if (revents == 0) { 1731 if (events & 1732 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 1733 POLLRDBAND)) { 1734 selrecord(td, &so->so_rcv.ssb_sel); 1735 so->so_rcv.ssb_flags |= SSB_SEL; 1736 } 1737 1738 if (events & (POLLOUT | POLLWRNORM)) { 1739 selrecord(td, &so->so_snd.ssb_sel); 1740 so->so_snd.ssb_flags |= SSB_SEL; 1741 } 1742 } 1743 1744 crit_exit(); 1745 return (revents); 1746 } 1747 1748 int 1749 sokqfilter(struct file *fp, struct knote *kn) 1750 { 1751 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1752 struct signalsockbuf *ssb; 1753 1754 switch (kn->kn_filter) { 1755 case EVFILT_READ: 1756 if (so->so_options & SO_ACCEPTCONN) 1757 kn->kn_fop = &solisten_filtops; 1758 else 1759 kn->kn_fop = &soread_filtops; 1760 ssb = &so->so_rcv; 1761 break; 1762 case EVFILT_WRITE: 1763 kn->kn_fop = &sowrite_filtops; 1764 ssb = &so->so_snd; 1765 break; 1766 case EVFILT_EXCEPT: 1767 kn->kn_fop = &soexcept_filtops; 1768 ssb = &so->so_rcv; 1769 break; 1770 default: 1771 return (1); 1772 } 1773 1774 crit_enter(); 1775 SLIST_INSERT_HEAD(&ssb->ssb_sel.si_note, kn, kn_selnext); 1776 ssb->ssb_flags |= SSB_KNOTE; 1777 crit_exit(); 1778 return (0); 1779 } 1780 1781 static void 1782 filt_sordetach(struct knote *kn) 1783 { 1784 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1785 1786 crit_enter(); 1787 SLIST_REMOVE(&so->so_rcv.ssb_sel.si_note, kn, knote, kn_selnext); 1788 if (SLIST_EMPTY(&so->so_rcv.ssb_sel.si_note)) 1789 so->so_rcv.ssb_flags &= ~SSB_KNOTE; 1790 crit_exit(); 1791 } 1792 1793 /*ARGSUSED*/ 1794 static int 1795 filt_soread(struct knote *kn, long hint) 1796 { 1797 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1798 1799 if (kn->kn_sfflags & NOTE_OOB) { 1800 if ((so->so_oobmark || (so->so_state & SS_RCVATMARK))) { 1801 kn->kn_fflags |= NOTE_OOB; 1802 return (1); 1803 } 1804 return (0); 1805 } 1806 kn->kn_data = so->so_rcv.ssb_cc; 1807 if (so->so_state & SS_CANTRCVMORE) { 1808 kn->kn_flags |= EV_EOF; 1809 kn->kn_fflags = so->so_error; 1810 return (1); 1811 } 1812 if (so->so_error) /* temporary udp error */ 1813 return (1); 1814 if (kn->kn_sfflags & NOTE_LOWAT) 1815 return (kn->kn_data >= kn->kn_sdata); 1816 return ((kn->kn_data >= so->so_rcv.ssb_lowat) || 1817 !TAILQ_EMPTY(&so->so_comp)); 1818 } 1819 1820 static void 1821 filt_sowdetach(struct knote *kn) 1822 { 1823 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1824 1825 crit_enter(); 1826 SLIST_REMOVE(&so->so_snd.ssb_sel.si_note, kn, knote, kn_selnext); 1827 if (SLIST_EMPTY(&so->so_snd.ssb_sel.si_note)) 1828 so->so_snd.ssb_flags &= ~SSB_KNOTE; 1829 crit_exit(); 1830 } 1831 1832 /*ARGSUSED*/ 1833 static int 1834 filt_sowrite(struct knote *kn, long hint) 1835 { 1836 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1837 1838 kn->kn_data = ssb_space(&so->so_snd); 1839 if (so->so_state & SS_CANTSENDMORE) { 1840 kn->kn_flags |= EV_EOF; 1841 kn->kn_fflags = so->so_error; 1842 return (1); 1843 } 1844 if (so->so_error) /* temporary udp error */ 1845 return (1); 1846 if (((so->so_state & SS_ISCONNECTED) == 0) && 1847 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1848 return (0); 1849 if (kn->kn_sfflags & NOTE_LOWAT) 1850 return (kn->kn_data >= kn->kn_sdata); 1851 return (kn->kn_data >= so->so_snd.ssb_lowat); 1852 } 1853 1854 /*ARGSUSED*/ 1855 static int 1856 filt_solisten(struct knote *kn, long hint) 1857 { 1858 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1859 1860 kn->kn_data = so->so_qlen; 1861 return (! TAILQ_EMPTY(&so->so_comp)); 1862 } 1863