1 /* $OpenBSD: uipc_socket.c,v 1.64 2006/06/10 17:05:17 beck Exp $ */ 2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 #include <sys/file.h> 39 #include <sys/malloc.h> 40 #include <sys/mbuf.h> 41 #include <sys/domain.h> 42 #include <sys/kernel.h> 43 #include <sys/event.h> 44 #include <sys/protosw.h> 45 #include <sys/socket.h> 46 #include <sys/socketvar.h> 47 #include <sys/signalvar.h> 48 #include <sys/resourcevar.h> 49 #include <sys/pool.h> 50 51 void filt_sordetach(struct knote *kn); 52 int filt_soread(struct knote *kn, long hint); 53 void filt_sowdetach(struct knote *kn); 54 int filt_sowrite(struct knote *kn, long hint); 55 int filt_solisten(struct knote *kn, long hint); 56 57 struct filterops solisten_filtops = 58 { 1, NULL, filt_sordetach, filt_solisten }; 59 struct filterops soread_filtops = 60 { 1, NULL, filt_sordetach, filt_soread }; 61 struct filterops sowrite_filtops = 62 { 1, NULL, filt_sowdetach, filt_sowrite }; 63 64 65 #ifndef SOMINCONN 66 #define SOMINCONN 80 67 #endif /* SOMINCONN */ 68 69 int somaxconn = SOMAXCONN; 70 int sominconn = SOMINCONN; 71 72 struct pool socket_pool; 73 74 void 75 soinit(void) 76 { 77 78 pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL); 79 } 80 81 /* 82 * Socket operation routines. 83 * These routines are called by the routines in 84 * sys_socket.c or from a system process, and 85 * implement the semantics of socket operations by 86 * switching out to the protocol specific routines. 87 */ 88 /*ARGSUSED*/ 89 int 90 socreate(int dom, struct socket **aso, int type, int proto) 91 { 92 struct proc *p = curproc; /* XXX */ 93 struct protosw *prp; 94 struct socket *so; 95 int error, s; 96 97 if (proto) 98 prp = pffindproto(dom, proto, type); 99 else 100 prp = pffindtype(dom, type); 101 if (prp == NULL || prp->pr_usrreq == 0) 102 return (EPROTONOSUPPORT); 103 if (prp->pr_type != type) 104 return (EPROTOTYPE); 105 s = splsoftnet(); 106 so = pool_get(&socket_pool, PR_WAITOK); 107 bzero(so, sizeof(*so)); 108 TAILQ_INIT(&so->so_q0); 109 TAILQ_INIT(&so->so_q); 110 so->so_type = type; 111 if (p->p_ucred->cr_uid == 0) 112 so->so_state = SS_PRIV; 113 so->so_ruid = p->p_cred->p_ruid; 114 so->so_euid = p->p_ucred->cr_uid; 115 so->so_rgid = p->p_cred->p_rgid; 116 so->so_egid = p->p_ucred->cr_gid; 117 so->so_cpid = p->p_pid; 118 so->so_proto = prp; 119 error = (*prp->pr_usrreq)(so, PRU_ATTACH, NULL, 120 (struct mbuf *)(long)proto, NULL); 121 if (error) { 122 so->so_state |= SS_NOFDREF; 123 sofree(so); 124 splx(s); 125 return (error); 126 } 127 #ifdef COMPAT_SUNOS 128 { 129 extern struct emul emul_sunos; 130 if (p->p_emul == &emul_sunos && type == SOCK_DGRAM) 131 so->so_options |= SO_BROADCAST; 132 } 133 #endif 134 splx(s); 135 *aso = so; 136 return (0); 137 } 138 139 int 140 sobind(struct socket *so, struct mbuf *nam) 141 { 142 int s = splsoftnet(); 143 int error; 144 145 error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL); 146 splx(s); 147 return (error); 148 } 149 150 int 151 solisten(struct socket *so, int backlog) 152 { 153 int s = splsoftnet(), error; 154 155 error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL); 156 if (error) { 157 splx(s); 158 return (error); 159 } 160 if (TAILQ_FIRST(&so->so_q) == NULL) 161 so->so_options |= SO_ACCEPTCONN; 162 if (backlog < 0 || backlog > somaxconn) 163 backlog = somaxconn; 164 if (backlog < sominconn) 165 backlog = sominconn; 166 so->so_qlimit = backlog; 167 splx(s); 168 return (0); 169 } 170 171 /* 172 * Must be called at splsoftnet() 173 */ 174 175 void 176 sofree(struct socket *so) 177 { 178 splassert(IPL_SOFTNET); 179 180 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) 181 return; 182 if (so->so_head) { 183 /* 184 * We must not decommission a socket that's on the accept(2) 185 * queue. If we do, then accept(2) may hang after select(2) 186 * indicated that the listening socket was ready. 187 */ 188 if (!soqremque(so, 0)) 189 return; 190 } 191 sbrelease(&so->so_snd); 192 sorflush(so); 193 pool_put(&socket_pool, so); 194 } 195 196 /* 197 * Close a socket on last file table reference removal. 198 * Initiate disconnect if connected. 199 * Free socket when disconnect complete. 200 */ 201 int 202 soclose(struct socket *so) 203 { 204 struct socket *so2; 205 int s = splsoftnet(); /* conservative */ 206 int error = 0; 207 208 if (so->so_options & SO_ACCEPTCONN) { 209 while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) { 210 (void) soqremque(so2, 0); 211 (void) soabort(so2); 212 } 213 while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) { 214 (void) soqremque(so2, 1); 215 (void) soabort(so2); 216 } 217 } 218 if (so->so_pcb == 0) 219 goto discard; 220 if (so->so_state & SS_ISCONNECTED) { 221 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 222 error = sodisconnect(so); 223 if (error) 224 goto drop; 225 } 226 if (so->so_options & SO_LINGER) { 227 if ((so->so_state & SS_ISDISCONNECTING) && 228 (so->so_state & SS_NBIO)) 229 goto drop; 230 while (so->so_state & SS_ISCONNECTED) { 231 error = tsleep(&so->so_timeo, 232 PSOCK | PCATCH, netcls, 233 so->so_linger * hz); 234 if (error) 235 break; 236 } 237 } 238 } 239 drop: 240 if (so->so_pcb) { 241 int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH, NULL, 242 NULL, NULL); 243 if (error == 0) 244 error = error2; 245 } 246 discard: 247 if (so->so_state & SS_NOFDREF) 248 panic("soclose: NOFDREF"); 249 so->so_state |= SS_NOFDREF; 250 sofree(so); 251 splx(s); 252 return (error); 253 } 254 255 /* 256 * Must be called at splsoftnet. 257 */ 258 int 259 soabort(struct socket *so) 260 { 261 splassert(IPL_SOFTNET); 262 263 return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL); 264 } 265 266 int 267 soaccept(struct socket *so, struct mbuf *nam) 268 { 269 int s = splsoftnet(); 270 int error = 0; 271 272 if ((so->so_state & SS_NOFDREF) == 0) 273 panic("soaccept: !NOFDREF"); 274 so->so_state &= ~SS_NOFDREF; 275 if ((so->so_state & SS_ISDISCONNECTED) == 0 || 276 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) 277 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL, 278 nam, NULL); 279 else 280 error = ECONNABORTED; 281 splx(s); 282 return (error); 283 } 284 285 int 286 soconnect(struct socket *so, struct mbuf *nam) 287 { 288 int s; 289 int error; 290 291 if (so->so_options & SO_ACCEPTCONN) 292 return (EOPNOTSUPP); 293 s = splsoftnet(); 294 /* 295 * If protocol is connection-based, can only connect once. 296 * Otherwise, if connected, try to disconnect first. 297 * This allows user to disconnect by connecting to, e.g., 298 * a null address. 299 */ 300 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 301 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 302 (error = sodisconnect(so)))) 303 error = EISCONN; 304 else 305 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, 306 NULL, nam, NULL); 307 splx(s); 308 return (error); 309 } 310 311 int 312 soconnect2(struct socket *so1, struct socket *so2) 313 { 314 int s = splsoftnet(); 315 int error; 316 317 error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL, 318 (struct mbuf *)so2, NULL); 319 splx(s); 320 return (error); 321 } 322 323 int 324 sodisconnect(struct socket *so) 325 { 326 int s = splsoftnet(); 327 int error; 328 329 if ((so->so_state & SS_ISCONNECTED) == 0) { 330 error = ENOTCONN; 331 goto bad; 332 } 333 if (so->so_state & SS_ISDISCONNECTING) { 334 error = EALREADY; 335 goto bad; 336 } 337 error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL, 338 NULL); 339 bad: 340 splx(s); 341 return (error); 342 } 343 344 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 345 /* 346 * Send on a socket. 347 * If send must go all at once and message is larger than 348 * send buffering, then hard error. 349 * Lock against other senders. 350 * If must go all at once and not enough room now, then 351 * inform user that this would block and do nothing. 352 * Otherwise, if nonblocking, send as much as possible. 353 * The data to be sent is described by "uio" if nonzero, 354 * otherwise by the mbuf chain "top" (which must be null 355 * if uio is not). Data provided in mbuf chain must be small 356 * enough to send all at once. 357 * 358 * Returns nonzero on error, timeout or signal; callers 359 * must check for short counts if EINTR/ERESTART are returned. 360 * Data and control buffers are freed on return. 361 */ 362 int 363 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, 364 struct mbuf *control, int flags) 365 { 366 struct mbuf **mp; 367 struct mbuf *m; 368 long space, len, mlen, clen = 0; 369 quad_t resid; 370 int error, s, dontroute; 371 int atomic = sosendallatonce(so) || top; 372 373 if (uio) 374 resid = uio->uio_resid; 375 else 376 resid = top->m_pkthdr.len; 377 /* 378 * In theory resid should be unsigned (since uio->uio_resid is). 379 * However, space must be signed, as it might be less than 0 380 * if we over-committed, and we must use a signed comparison 381 * of space and resid. On the other hand, a negative resid 382 * causes us to loop sending 0-length segments to the protocol. 383 * MSG_EOR on a SOCK_STREAM socket is also invalid. 384 */ 385 if (resid < 0 || 386 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 387 error = EINVAL; 388 goto out; 389 } 390 dontroute = 391 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 392 (so->so_proto->pr_flags & PR_ATOMIC); 393 if (uio && uio->uio_procp) 394 uio->uio_procp->p_stats->p_ru.ru_msgsnd++; 395 if (control) 396 clen = control->m_len; 397 #define snderr(errno) { error = errno; splx(s); goto release; } 398 399 restart: 400 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0) 401 goto out; 402 so->so_state |= SS_ISSENDING; 403 do { 404 s = splsoftnet(); 405 if (so->so_state & SS_CANTSENDMORE) 406 snderr(EPIPE); 407 if (so->so_error) { 408 error = so->so_error; 409 so->so_error = 0; 410 splx(s); 411 goto release; 412 } 413 if ((so->so_state & SS_ISCONNECTED) == 0) { 414 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 415 if ((so->so_state & SS_ISCONFIRMING) == 0 && 416 !(resid == 0 && clen != 0)) 417 snderr(ENOTCONN); 418 } else if (addr == 0) 419 snderr(EDESTADDRREQ); 420 } 421 space = sbspace(&so->so_snd); 422 if (flags & MSG_OOB) 423 space += 1024; 424 if ((atomic && resid > so->so_snd.sb_hiwat) || 425 clen > so->so_snd.sb_hiwat) 426 snderr(EMSGSIZE); 427 if (space < resid + clen && uio && 428 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 429 if (so->so_state & SS_NBIO) 430 snderr(EWOULDBLOCK); 431 sbunlock(&so->so_snd); 432 error = sbwait(&so->so_snd); 433 so->so_state &= ~SS_ISSENDING; 434 splx(s); 435 if (error) 436 goto out; 437 goto restart; 438 } 439 splx(s); 440 mp = ⊤ 441 space -= clen; 442 do { 443 if (uio == NULL) { 444 /* 445 * Data is prepackaged in "top". 446 */ 447 resid = 0; 448 if (flags & MSG_EOR) 449 top->m_flags |= M_EOR; 450 } else do { 451 if (top == 0) { 452 MGETHDR(m, M_WAIT, MT_DATA); 453 mlen = MHLEN; 454 m->m_pkthdr.len = 0; 455 m->m_pkthdr.rcvif = (struct ifnet *)0; 456 } else { 457 MGET(m, M_WAIT, MT_DATA); 458 mlen = MLEN; 459 } 460 if (resid >= MINCLSIZE && space >= MCLBYTES) { 461 MCLGET(m, M_WAIT); 462 if ((m->m_flags & M_EXT) == 0) 463 goto nopages; 464 mlen = MCLBYTES; 465 if (atomic && top == 0) { 466 len = lmin(MCLBYTES - max_hdr, resid); 467 m->m_data += max_hdr; 468 } else 469 len = lmin(MCLBYTES, resid); 470 space -= len; 471 } else { 472 nopages: 473 len = lmin(lmin(mlen, resid), space); 474 space -= len; 475 /* 476 * For datagram protocols, leave room 477 * for protocol headers in first mbuf. 478 */ 479 if (atomic && top == 0 && len < mlen) 480 MH_ALIGN(m, len); 481 } 482 error = uiomove(mtod(m, caddr_t), (int)len, 483 uio); 484 resid = uio->uio_resid; 485 m->m_len = len; 486 *mp = m; 487 top->m_pkthdr.len += len; 488 if (error) 489 goto release; 490 mp = &m->m_next; 491 if (resid <= 0) { 492 if (flags & MSG_EOR) 493 top->m_flags |= M_EOR; 494 break; 495 } 496 } while (space > 0 && atomic); 497 if (dontroute) 498 so->so_options |= SO_DONTROUTE; 499 s = splsoftnet(); /* XXX */ 500 if (resid <= 0) 501 so->so_state &= ~SS_ISSENDING; 502 error = (*so->so_proto->pr_usrreq)(so, 503 (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, 504 top, addr, control); 505 splx(s); 506 if (dontroute) 507 so->so_options &= ~SO_DONTROUTE; 508 clen = 0; 509 control = 0; 510 top = 0; 511 mp = ⊤ 512 if (error) 513 goto release; 514 } while (resid && space > 0); 515 } while (resid); 516 517 release: 518 so->so_state &= ~SS_ISSENDING; 519 sbunlock(&so->so_snd); 520 out: 521 if (top) 522 m_freem(top); 523 if (control) 524 m_freem(control); 525 return (error); 526 } 527 528 /* 529 * Implement receive operations on a socket. 530 * We depend on the way that records are added to the sockbuf 531 * by sbappend*. In particular, each record (mbufs linked through m_next) 532 * must begin with an address if the protocol so specifies, 533 * followed by an optional mbuf or mbufs containing ancillary data, 534 * and then zero or more mbufs of data. 535 * In order to avoid blocking network interrupts for the entire time here, 536 * we splx() while doing the actual copy to user space. 537 * Although the sockbuf is locked, new data may still be appended, 538 * and thus we must maintain consistency of the sockbuf during that time. 539 * 540 * The caller may receive the data as a single mbuf chain by supplying 541 * an mbuf **mp0 for use in returning the chain. The uio is then used 542 * only for the count in uio_resid. 543 */ 544 int 545 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, 546 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 547 { 548 struct mbuf *m, **mp; 549 int flags, len, error, s, offset; 550 struct protosw *pr = so->so_proto; 551 struct mbuf *nextrecord; 552 int moff, type = 0; 553 size_t orig_resid = uio->uio_resid; 554 int uio_error = 0; 555 int resid; 556 557 mp = mp0; 558 if (paddr) 559 *paddr = 0; 560 if (controlp) 561 *controlp = 0; 562 if (flagsp) 563 flags = *flagsp &~ MSG_EOR; 564 else 565 flags = 0; 566 if (so->so_state & SS_NBIO) 567 flags |= MSG_DONTWAIT; 568 if (flags & MSG_OOB) { 569 m = m_get(M_WAIT, MT_DATA); 570 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, 571 (struct mbuf *)(long)(flags & MSG_PEEK), NULL); 572 if (error) 573 goto bad; 574 do { 575 error = uiomove(mtod(m, caddr_t), 576 (int) min(uio->uio_resid, m->m_len), uio); 577 m = m_free(m); 578 } while (uio->uio_resid && error == 0 && m); 579 bad: 580 if (m) 581 m_freem(m); 582 return (error); 583 } 584 if (mp) 585 *mp = NULL; 586 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) 587 (*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL); 588 589 restart: 590 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) 591 return (error); 592 s = splsoftnet(); 593 594 m = so->so_rcv.sb_mb; 595 /* 596 * If we have less data than requested, block awaiting more 597 * (subject to any timeout) if: 598 * 1. the current count is less than the low water mark, 599 * 2. MSG_WAITALL is set, and it is possible to do the entire 600 * receive operation at once if we block (resid <= hiwat), or 601 * 3. MSG_DONTWAIT is not set. 602 * If MSG_WAITALL is set but resid is larger than the receive buffer, 603 * we have to do the receive in sections, and thus risk returning 604 * a short count if a timeout or signal occurs after we start. 605 */ 606 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 607 so->so_rcv.sb_cc < uio->uio_resid) && 608 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 609 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 610 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 611 #ifdef DIAGNOSTIC 612 if (m == NULL && so->so_rcv.sb_cc) 613 panic("receive 1"); 614 #endif 615 if (so->so_error) { 616 if (m) 617 goto dontblock; 618 error = so->so_error; 619 if ((flags & MSG_PEEK) == 0) 620 so->so_error = 0; 621 goto release; 622 } 623 if (so->so_state & SS_CANTRCVMORE) { 624 if (m) 625 goto dontblock; 626 else 627 goto release; 628 } 629 for (; m; m = m->m_next) 630 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 631 m = so->so_rcv.sb_mb; 632 goto dontblock; 633 } 634 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 635 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 636 error = ENOTCONN; 637 goto release; 638 } 639 if (uio->uio_resid == 0 && controlp == NULL) 640 goto release; 641 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { 642 error = EWOULDBLOCK; 643 goto release; 644 } 645 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1"); 646 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1"); 647 sbunlock(&so->so_rcv); 648 error = sbwait(&so->so_rcv); 649 splx(s); 650 if (error) 651 return (error); 652 goto restart; 653 } 654 dontblock: 655 /* 656 * On entry here, m points to the first record of the socket buffer. 657 * While we process the initial mbufs containing address and control 658 * info, we save a copy of m->m_nextpkt into nextrecord. 659 */ 660 if (uio->uio_procp) 661 uio->uio_procp->p_stats->p_ru.ru_msgrcv++; 662 KASSERT(m == so->so_rcv.sb_mb); 663 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); 664 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); 665 nextrecord = m->m_nextpkt; 666 if (pr->pr_flags & PR_ADDR) { 667 #ifdef DIAGNOSTIC 668 if (m->m_type != MT_SONAME) 669 panic("receive 1a"); 670 #endif 671 orig_resid = 0; 672 if (flags & MSG_PEEK) { 673 if (paddr) 674 *paddr = m_copy(m, 0, m->m_len); 675 m = m->m_next; 676 } else { 677 sbfree(&so->so_rcv, m); 678 if (paddr) { 679 *paddr = m; 680 so->so_rcv.sb_mb = m->m_next; 681 m->m_next = 0; 682 m = so->so_rcv.sb_mb; 683 } else { 684 MFREE(m, so->so_rcv.sb_mb); 685 m = so->so_rcv.sb_mb; 686 } 687 } 688 } 689 while (m && m->m_type == MT_CONTROL && error == 0) { 690 if (flags & MSG_PEEK) { 691 if (controlp) 692 *controlp = m_copy(m, 0, m->m_len); 693 m = m->m_next; 694 } else { 695 sbfree(&so->so_rcv, m); 696 if (controlp) { 697 if (pr->pr_domain->dom_externalize && 698 mtod(m, struct cmsghdr *)->cmsg_type == 699 SCM_RIGHTS) 700 error = (*pr->pr_domain->dom_externalize)(m); 701 *controlp = m; 702 so->so_rcv.sb_mb = m->m_next; 703 m->m_next = 0; 704 m = so->so_rcv.sb_mb; 705 } else { 706 /* 707 * Dispose of any SCM_RIGHTS message that went 708 * through the read path rather than recv. 709 */ 710 if (pr->pr_domain->dom_dispose && 711 mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS) 712 pr->pr_domain->dom_dispose(m); 713 MFREE(m, so->so_rcv.sb_mb); 714 m = so->so_rcv.sb_mb; 715 } 716 } 717 if (controlp) { 718 orig_resid = 0; 719 controlp = &(*controlp)->m_next; 720 } 721 } 722 723 /* 724 * If m is non-NULL, we have some data to read. From now on, 725 * make sure to keep sb_lastrecord consistent when working on 726 * the last packet on the chain (nextrecord == NULL) and we 727 * change m->m_nextpkt. 728 */ 729 if (m) { 730 if ((flags & MSG_PEEK) == 0) { 731 m->m_nextpkt = nextrecord; 732 /* 733 * If nextrecord == NULL (this is a single chain), 734 * then sb_lastrecord may not be valid here if m 735 * was changed earlier. 736 */ 737 if (nextrecord == NULL) { 738 KASSERT(so->so_rcv.sb_mb == m); 739 so->so_rcv.sb_lastrecord = m; 740 } 741 } 742 type = m->m_type; 743 if (type == MT_OOBDATA) 744 flags |= MSG_OOB; 745 if (m->m_flags & M_BCAST) 746 flags |= MSG_BCAST; 747 if (m->m_flags & M_MCAST) 748 flags |= MSG_MCAST; 749 } else { 750 if ((flags & MSG_PEEK) == 0) { 751 KASSERT(so->so_rcv.sb_mb == m); 752 so->so_rcv.sb_mb = nextrecord; 753 SB_EMPTY_FIXUP(&so->so_rcv); 754 } 755 } 756 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2"); 757 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2"); 758 759 moff = 0; 760 offset = 0; 761 while (m && uio->uio_resid > 0 && error == 0) { 762 if (m->m_type == MT_OOBDATA) { 763 if (type != MT_OOBDATA) 764 break; 765 } else if (type == MT_OOBDATA) 766 break; 767 #ifdef DIAGNOSTIC 768 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) 769 panic("receive 3"); 770 #endif 771 so->so_state &= ~SS_RCVATMARK; 772 len = uio->uio_resid; 773 if (so->so_oobmark && len > so->so_oobmark - offset) 774 len = so->so_oobmark - offset; 775 if (len > m->m_len - moff) 776 len = m->m_len - moff; 777 /* 778 * If mp is set, just pass back the mbufs. 779 * Otherwise copy them out via the uio, then free. 780 * Sockbuf must be consistent here (points to current mbuf, 781 * it points to next record) when we drop priority; 782 * we must note any additions to the sockbuf when we 783 * block interrupts again. 784 */ 785 if (mp == NULL && uio_error == 0) { 786 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); 787 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); 788 resid = uio->uio_resid; 789 splx(s); 790 uio_error = 791 uiomove(mtod(m, caddr_t) + moff, (int)len, 792 uio); 793 s = splsoftnet(); 794 if (uio_error) 795 uio->uio_resid = resid - len; 796 } else 797 uio->uio_resid -= len; 798 if (len == m->m_len - moff) { 799 if (m->m_flags & M_EOR) 800 flags |= MSG_EOR; 801 if (flags & MSG_PEEK) { 802 m = m->m_next; 803 moff = 0; 804 } else { 805 nextrecord = m->m_nextpkt; 806 sbfree(&so->so_rcv, m); 807 if (mp) { 808 *mp = m; 809 mp = &m->m_next; 810 so->so_rcv.sb_mb = m = m->m_next; 811 *mp = NULL; 812 } else { 813 MFREE(m, so->so_rcv.sb_mb); 814 m = so->so_rcv.sb_mb; 815 } 816 /* 817 * If m != NULL, we also know that 818 * so->so_rcv.sb_mb != NULL. 819 */ 820 KASSERT(so->so_rcv.sb_mb == m); 821 if (m) { 822 m->m_nextpkt = nextrecord; 823 if (nextrecord == NULL) 824 so->so_rcv.sb_lastrecord = m; 825 } else { 826 so->so_rcv.sb_mb = nextrecord; 827 SB_EMPTY_FIXUP(&so->so_rcv); 828 } 829 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3"); 830 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3"); 831 } 832 } else { 833 if (flags & MSG_PEEK) 834 moff += len; 835 else { 836 if (mp) 837 *mp = m_copym(m, 0, len, M_WAIT); 838 m->m_data += len; 839 m->m_len -= len; 840 so->so_rcv.sb_cc -= len; 841 } 842 } 843 if (so->so_oobmark) { 844 if ((flags & MSG_PEEK) == 0) { 845 so->so_oobmark -= len; 846 if (so->so_oobmark == 0) { 847 so->so_state |= SS_RCVATMARK; 848 break; 849 } 850 } else { 851 offset += len; 852 if (offset == so->so_oobmark) 853 break; 854 } 855 } 856 if (flags & MSG_EOR) 857 break; 858 /* 859 * If the MSG_WAITALL flag is set (for non-atomic socket), 860 * we must not quit until "uio->uio_resid == 0" or an error 861 * termination. If a signal/timeout occurs, return 862 * with a short count but without error. 863 * Keep sockbuf locked against other readers. 864 */ 865 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 866 !sosendallatonce(so) && !nextrecord) { 867 if (so->so_error || so->so_state & SS_CANTRCVMORE) 868 break; 869 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2"); 870 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2"); 871 error = sbwait(&so->so_rcv); 872 if (error) { 873 sbunlock(&so->so_rcv); 874 splx(s); 875 return (0); 876 } 877 if ((m = so->so_rcv.sb_mb) != NULL) 878 nextrecord = m->m_nextpkt; 879 } 880 } 881 882 if (m && pr->pr_flags & PR_ATOMIC) { 883 flags |= MSG_TRUNC; 884 if ((flags & MSG_PEEK) == 0) 885 (void) sbdroprecord(&so->so_rcv); 886 } 887 if ((flags & MSG_PEEK) == 0) { 888 if (m == NULL) { 889 /* 890 * First part is an inline SB_EMPTY_FIXUP(). Second 891 * part makes sure sb_lastrecord is up-to-date if 892 * there is still data in the socket buffer. 893 */ 894 so->so_rcv.sb_mb = nextrecord; 895 if (so->so_rcv.sb_mb == NULL) { 896 so->so_rcv.sb_mbtail = NULL; 897 so->so_rcv.sb_lastrecord = NULL; 898 } else if (nextrecord->m_nextpkt == NULL) 899 so->so_rcv.sb_lastrecord = nextrecord; 900 } 901 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); 902 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); 903 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) 904 (*pr->pr_usrreq)(so, PRU_RCVD, NULL, 905 (struct mbuf *)(long)flags, NULL); 906 } 907 if (orig_resid == uio->uio_resid && orig_resid && 908 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { 909 sbunlock(&so->so_rcv); 910 splx(s); 911 goto restart; 912 } 913 914 if (uio_error) 915 error = uio_error; 916 917 if (flagsp) 918 *flagsp |= flags; 919 release: 920 sbunlock(&so->so_rcv); 921 splx(s); 922 return (error); 923 } 924 925 int 926 soshutdown(struct socket *so, int how) 927 { 928 struct protosw *pr = so->so_proto; 929 930 switch (how) { 931 case SHUT_RD: 932 case SHUT_RDWR: 933 sorflush(so); 934 if (how == SHUT_RD) 935 return (0); 936 /* FALLTHROUGH */ 937 case SHUT_WR: 938 return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL); 939 default: 940 return (EINVAL); 941 } 942 } 943 944 void 945 sorflush(struct socket *so) 946 { 947 struct sockbuf *sb = &so->so_rcv; 948 struct protosw *pr = so->so_proto; 949 int s; 950 struct sockbuf asb; 951 952 sb->sb_flags |= SB_NOINTR; 953 (void) sblock(sb, M_WAITOK); 954 s = splnet(); 955 socantrcvmore(so); 956 sbunlock(sb); 957 asb = *sb; 958 bzero(sb, sizeof (*sb)); 959 /* XXX - the bzero stumps all over so_rcv */ 960 if (asb.sb_flags & SB_KNOTE) { 961 sb->sb_sel.si_note = asb.sb_sel.si_note; 962 sb->sb_flags = SB_KNOTE; 963 } 964 splx(s); 965 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) 966 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 967 sbrelease(&asb); 968 } 969 970 int 971 sosetopt(struct socket *so, int level, int optname, struct mbuf *m0) 972 { 973 int error = 0; 974 struct mbuf *m = m0; 975 976 if (level != SOL_SOCKET) { 977 if (so->so_proto && so->so_proto->pr_ctloutput) 978 return ((*so->so_proto->pr_ctloutput) 979 (PRCO_SETOPT, so, level, optname, &m0)); 980 error = ENOPROTOOPT; 981 } else { 982 switch (optname) { 983 984 case SO_LINGER: 985 if (m == NULL || m->m_len != sizeof (struct linger) || 986 mtod(m, struct linger *)->l_linger < 0 || 987 mtod(m, struct linger *)->l_linger > SHRT_MAX) { 988 error = EINVAL; 989 goto bad; 990 } 991 so->so_linger = mtod(m, struct linger *)->l_linger; 992 /* fall thru... */ 993 994 case SO_DEBUG: 995 case SO_KEEPALIVE: 996 case SO_DONTROUTE: 997 case SO_USELOOPBACK: 998 case SO_BROADCAST: 999 case SO_REUSEADDR: 1000 case SO_REUSEPORT: 1001 case SO_OOBINLINE: 1002 case SO_JUMBO: 1003 if (m == NULL || m->m_len < sizeof (int)) { 1004 error = EINVAL; 1005 goto bad; 1006 } 1007 if (*mtod(m, int *)) 1008 so->so_options |= optname; 1009 else 1010 so->so_options &= ~optname; 1011 break; 1012 1013 case SO_SNDBUF: 1014 case SO_RCVBUF: 1015 case SO_SNDLOWAT: 1016 case SO_RCVLOWAT: 1017 { 1018 u_long cnt; 1019 1020 if (m == NULL || m->m_len < sizeof (int)) { 1021 error = EINVAL; 1022 goto bad; 1023 } 1024 cnt = *mtod(m, int *); 1025 if ((long)cnt <= 0) 1026 cnt = 1; 1027 switch (optname) { 1028 1029 case SO_SNDBUF: 1030 if (sbcheckreserve(cnt, so->so_snd.sb_hiwat) || 1031 sbreserve(&so->so_snd, cnt) == 0) { 1032 error = ENOBUFS; 1033 goto bad; 1034 } 1035 break; 1036 1037 case SO_RCVBUF: 1038 if (sbcheckreserve(cnt, so->so_rcv.sb_hiwat) || 1039 sbreserve(&so->so_rcv, cnt) == 0) { 1040 error = ENOBUFS; 1041 goto bad; 1042 } 1043 break; 1044 1045 case SO_SNDLOWAT: 1046 so->so_snd.sb_lowat = (cnt > so->so_snd.sb_hiwat) ? 1047 so->so_snd.sb_hiwat : cnt; 1048 break; 1049 case SO_RCVLOWAT: 1050 so->so_rcv.sb_lowat = (cnt > so->so_rcv.sb_hiwat) ? 1051 so->so_rcv.sb_hiwat : cnt; 1052 break; 1053 } 1054 break; 1055 } 1056 1057 case SO_SNDTIMEO: 1058 case SO_RCVTIMEO: 1059 { 1060 struct timeval *tv; 1061 short val; 1062 1063 if (m == NULL || m->m_len < sizeof (*tv)) { 1064 error = EINVAL; 1065 goto bad; 1066 } 1067 tv = mtod(m, struct timeval *); 1068 if (tv->tv_sec > (SHRT_MAX - tv->tv_usec / tick) / hz) { 1069 error = EDOM; 1070 goto bad; 1071 } 1072 val = tv->tv_sec * hz + tv->tv_usec / tick; 1073 if (val == 0 && tv->tv_usec != 0) 1074 val = 1; 1075 1076 switch (optname) { 1077 1078 case SO_SNDTIMEO: 1079 so->so_snd.sb_timeo = val; 1080 break; 1081 case SO_RCVTIMEO: 1082 so->so_rcv.sb_timeo = val; 1083 break; 1084 } 1085 break; 1086 } 1087 1088 default: 1089 error = ENOPROTOOPT; 1090 break; 1091 } 1092 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { 1093 (void) ((*so->so_proto->pr_ctloutput) 1094 (PRCO_SETOPT, so, level, optname, &m0)); 1095 m = NULL; /* freed by protocol */ 1096 } 1097 } 1098 bad: 1099 if (m) 1100 (void) m_free(m); 1101 return (error); 1102 } 1103 1104 int 1105 sogetopt(struct socket *so, int level, int optname, struct mbuf **mp) 1106 { 1107 struct mbuf *m; 1108 1109 if (level != SOL_SOCKET) { 1110 if (so->so_proto && so->so_proto->pr_ctloutput) { 1111 return ((*so->so_proto->pr_ctloutput) 1112 (PRCO_GETOPT, so, level, optname, mp)); 1113 } else 1114 return (ENOPROTOOPT); 1115 } else { 1116 m = m_get(M_WAIT, MT_SOOPTS); 1117 m->m_len = sizeof (int); 1118 1119 switch (optname) { 1120 1121 case SO_LINGER: 1122 m->m_len = sizeof (struct linger); 1123 mtod(m, struct linger *)->l_onoff = 1124 so->so_options & SO_LINGER; 1125 mtod(m, struct linger *)->l_linger = so->so_linger; 1126 break; 1127 1128 case SO_USELOOPBACK: 1129 case SO_DONTROUTE: 1130 case SO_DEBUG: 1131 case SO_KEEPALIVE: 1132 case SO_REUSEADDR: 1133 case SO_REUSEPORT: 1134 case SO_BROADCAST: 1135 case SO_OOBINLINE: 1136 case SO_JUMBO: 1137 *mtod(m, int *) = so->so_options & optname; 1138 break; 1139 1140 case SO_TYPE: 1141 *mtod(m, int *) = so->so_type; 1142 break; 1143 1144 case SO_ERROR: 1145 *mtod(m, int *) = so->so_error; 1146 so->so_error = 0; 1147 break; 1148 1149 case SO_SNDBUF: 1150 *mtod(m, int *) = so->so_snd.sb_hiwat; 1151 break; 1152 1153 case SO_RCVBUF: 1154 *mtod(m, int *) = so->so_rcv.sb_hiwat; 1155 break; 1156 1157 case SO_SNDLOWAT: 1158 *mtod(m, int *) = so->so_snd.sb_lowat; 1159 break; 1160 1161 case SO_RCVLOWAT: 1162 *mtod(m, int *) = so->so_rcv.sb_lowat; 1163 break; 1164 1165 case SO_SNDTIMEO: 1166 case SO_RCVTIMEO: 1167 { 1168 int val = (optname == SO_SNDTIMEO ? 1169 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 1170 1171 m->m_len = sizeof(struct timeval); 1172 mtod(m, struct timeval *)->tv_sec = val / hz; 1173 mtod(m, struct timeval *)->tv_usec = 1174 (val % hz) * tick; 1175 break; 1176 } 1177 1178 default: 1179 (void)m_free(m); 1180 return (ENOPROTOOPT); 1181 } 1182 *mp = m; 1183 return (0); 1184 } 1185 } 1186 1187 void 1188 sohasoutofband(struct socket *so) 1189 { 1190 csignal(so->so_pgid, SIGURG, so->so_siguid, so->so_sigeuid); 1191 selwakeup(&so->so_rcv.sb_sel); 1192 } 1193 1194 int 1195 soo_kqfilter(struct file *fp, struct knote *kn) 1196 { 1197 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1198 struct sockbuf *sb; 1199 int s; 1200 1201 switch (kn->kn_filter) { 1202 case EVFILT_READ: 1203 if (so->so_options & SO_ACCEPTCONN) 1204 kn->kn_fop = &solisten_filtops; 1205 else 1206 kn->kn_fop = &soread_filtops; 1207 sb = &so->so_rcv; 1208 break; 1209 case EVFILT_WRITE: 1210 kn->kn_fop = &sowrite_filtops; 1211 sb = &so->so_snd; 1212 break; 1213 default: 1214 return (1); 1215 } 1216 1217 s = splnet(); 1218 SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext); 1219 sb->sb_flags |= SB_KNOTE; 1220 splx(s); 1221 return (0); 1222 } 1223 1224 void 1225 filt_sordetach(struct knote *kn) 1226 { 1227 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1228 int s = splnet(); 1229 1230 SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); 1231 if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) 1232 so->so_rcv.sb_flags &= ~SB_KNOTE; 1233 splx(s); 1234 } 1235 1236 /*ARGSUSED*/ 1237 int 1238 filt_soread(struct knote *kn, long hint) 1239 { 1240 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1241 1242 kn->kn_data = so->so_rcv.sb_cc; 1243 if (so->so_state & SS_CANTRCVMORE) { 1244 kn->kn_flags |= EV_EOF; 1245 kn->kn_fflags = so->so_error; 1246 return (1); 1247 } 1248 if (so->so_error) /* temporary udp error */ 1249 return (1); 1250 if (kn->kn_sfflags & NOTE_LOWAT) 1251 return (kn->kn_data >= kn->kn_sdata); 1252 return (kn->kn_data >= so->so_rcv.sb_lowat); 1253 } 1254 1255 void 1256 filt_sowdetach(struct knote *kn) 1257 { 1258 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1259 int s = splnet(); 1260 1261 SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); 1262 if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) 1263 so->so_snd.sb_flags &= ~SB_KNOTE; 1264 splx(s); 1265 } 1266 1267 /*ARGSUSED*/ 1268 int 1269 filt_sowrite(struct knote *kn, long hint) 1270 { 1271 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1272 1273 kn->kn_data = sbspace(&so->so_snd); 1274 if (so->so_state & SS_CANTSENDMORE) { 1275 kn->kn_flags |= EV_EOF; 1276 kn->kn_fflags = so->so_error; 1277 return (1); 1278 } 1279 if (so->so_error) /* temporary udp error */ 1280 return (1); 1281 if (((so->so_state & SS_ISCONNECTED) == 0) && 1282 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1283 return (0); 1284 if (kn->kn_sfflags & NOTE_LOWAT) 1285 return (kn->kn_data >= kn->kn_sdata); 1286 return (kn->kn_data >= so->so_snd.sb_lowat); 1287 } 1288 1289 /*ARGSUSED*/ 1290 int 1291 filt_solisten(struct knote *kn, long hint) 1292 { 1293 struct socket *so = (struct socket *)kn->kn_fp->f_data; 1294 1295 kn->kn_data = so->so_qlen; 1296 return (so->so_qlen != 0); 1297 } 1298