/*	$OpenBSD: tcp_usrreq.c,v 1.134 2016/07/20 19:57:53 bluhm Exp $	*/
/*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL NRL OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;

int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;

struct	inpcbtable tcbtable;

int tcp_ident(void *, size_t *, void *, size_t, int);

/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 */
/*ARGSUSED*/
int
tcp_usrreq(so, req, m, nam, control, p)
	struct socket *so;
	int req;
	struct mbuf *m, *nam, *control;
	struct proc *p;
{
	struct sockaddr_in *sin;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int s;
	int error = 0;
	short ostate;

	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}
	if (control && control->m_len) {
		m_freem(control);
		m_freem(m);
		return (EINVAL);
	}

	s = splsoftnet();
	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
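	 *
	 * A rough sketch of that chain as the pcb macros used throughout
	 * this file resolve it (illustrative paraphrase, see in_pcb.h and
	 * tcp_var.h for the real definitions):
	 *
	 *	inp = sotoinpcb(so);	roughly (struct inpcb *)so->so_pcb
	 *	tp  = intotcpcb(inp);	roughly (struct tcpcb *)inp->inp_ppcb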
	 */
	if (inp == NULL && req != PRU_ATTACH) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		splx(s);
		/*
		 * The following corrects an mbuf leak under rare
		 * circumstances
		 */
		if (req == PRU_SEND || req == PRU_SENDOOB)
			m_freem(m);
		return (error);
	}
	if (inp) {
		tp = intotcpcb(inp);
		/* tp might get 0 when using socket splicing */
		if (tp == NULL) {
			splx(s);
			return (0);
		}
#ifdef KPROF
		tcp_acounts[tp->t_state][req]++;
#endif
		ostate = tp->t_state;
	} else
		ostate = 0;
	switch (req) {

	/*
	 * TCP attaches to socket via PRU_ATTACH, reserving space,
	 * and an internet control block.
	 */
	case PRU_ATTACH:
		if (inp) {
			error = EISCONN;
			break;
		}
		error = tcp_attach(so);
		if (error)
			break;
		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
			so->so_linger = TCP_LINGERTIME;
		tp = sototcpcb(so);
		break;

	/*
	 * PRU_DETACH detaches the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	case PRU_DETACH:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
		sin = mtod(nam, struct sockaddr_in *);

#ifdef INET6
		if (sin->sin_family == AF_INET6) {
			struct in6_addr *in6_addr = &mtod(nam,
			    struct sockaddr_in6 *)->sin6_addr;

			if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) ||
			    IN6_IS_ADDR_MULTICAST(in6_addr) ||
			    IN6_IS_ADDR_V4MAPPED(in6_addr)) {
				error = EINVAL;
				break;
			}

			error = in6_pcbconnect(inp, nam);
		} else if (sin->sin_family == AF_INET)
#endif /* INET6 */
		{
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}

			error = in_pcbconnect(inp, nam);
		}

		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/*
		 * Compute window scaling to request.
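		 *
		 * Rough sketch of what tcp_rscale() does (a paraphrase,
		 * not the exact macro): pick the smallest request_r_scale,
		 * bounded by TCP_MAX_WINSHIFT, such that
		 *	TCP_MAXWIN << request_r_scale >= sb_max
		 * so the full socket buffer limit can later be advertised
		 * through the 16-bit window field.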
		 */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat.tcps_connattempt++;
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
#if defined(TCP_SACK)
		tp->snd_last = tp->snd_una;
#endif
#if defined(TCP_SACK) && defined(TCP_FACK)
		tp->snd_fack = tp->snd_una;
		tp->retran_data = 0;
		tp->snd_awnd = 0;
#endif
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket.  We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(&so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		splx(s);
		return (0);

	case PRU_RCVOOB:
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		if (sbspace(&so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
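		 *
		 * For illustration (an added example, not original text):
		 * with snd_una = 100 and 10 bytes queued in so_snd, the
		 * assignment below yields snd_up = 110, one past the last
		 * urgent octet.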
		 */
		sbappendstream(&so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
	splx(s);
	return (error);
}

int
tcp_ctloutput(op, so, level, optname, mp)
	int op;
	struct socket *so;
	int level, optname;
	struct mbuf **mp;
{
	int error = 0, s;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int i;

	s = splsoftnet();
	inp = sotoinpcb(so);
	if (inp == NULL) {
		splx(s);
		if (op == PRCO_SETOPT)
			(void) m_free(*mp);
		return (ECONNRESET);
	}
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, mp);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, mp);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		splx(s);
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		m = *mp;
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

#ifdef TCP_SACK
		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
#ifdef TCP_SACK
				tp->sack_enable = 0;
#endif /* TCP_SACK */
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (m)
			(void) m_free(m);
		break;

	case PRCO_GETOPT:
		*mp = m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
#ifdef TCP_SACK
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	splx(s);
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state if to accept connections.
 */
int
tcp_attach(so)
	struct socket *so;
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	return (0);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_disconnect(tp)
	struct tcpcb *tp;
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(&so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
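 *
 * As a quick reference, the switch below applies these transitions:
 *	CLOSED, LISTEN, SYN_SENT	-> CLOSED (tcp_close)
 *	SYN_RECEIVED, ESTABLISHED	-> FIN_WAIT_1
 *	CLOSE_WAIT			-> LAST_ACK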
 */
struct tcpcb *
tcp_usrclosed(tp)
	struct tcpcb *tp;
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0, s;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0)
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0)
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	s = splsoftnet();
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		splx(s);
		return (error);
	}

	if (inp == NULL) {
		++tcpstat.tcps_pcbhashmiss;
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, 0, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
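		/* no outbound-connected PCB matched; report owner unknown */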
		tir.ruid = -1;
		tir.euid = -1;
	}
	splx(s);

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
#ifdef TCP_SACK
	case TCPCTL_SACK:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_sack));
#endif
	case TCPCTL_SLOWHZ:
		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));

	case TCPCTL_BADDYNAMIC:
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp)));

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp)));

	case TCPCTL_IDENT:
		return (tcp_ident(oldp, oldlenp, newp, newlen, 0));

	case TCPCTL_DROP:
		return (tcp_ident(oldp, oldlenp, newp, newlen, 1));

	case TCPCTL_ALWAYS_KEEPALIVE:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_always_keepalive));

#ifdef TCP_ECN
	case TCPCTL_ECN:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_ecn));
#endif
	case TCPCTL_REASS_LIMIT:
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (error)
			return (error);
		if (nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (error)
				return (error);
			tcp_reass_limit = nval;
		}
		return (0);
#ifdef TCP_SACK
	case TCPCTL_SACKHOLE_LIMIT:
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (error)
			return (error);
		if (nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (error)
				return (error);
			tcp_sackhole_limit = nval;
		}
		return (0);
#endif

	case TCPCTL_STATS:
		if (newp != NULL)
			return (EPERM);
		{
			struct syn_cache_set *set;
			int i;

			set = &tcp_syn_cache[tcp_syn_cache_active];
			tcpstat.tcps_sc_hash_size = set->scs_size;
			tcpstat.tcps_sc_entry_count = set->scs_count;
			tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
			tcpstat.tcps_sc_bucket_maxlen = 0;
			for (i = 0; i < set->scs_size; i++) {
				if (tcpstat.tcps_sc_bucket_maxlen <
				    set->scs_buckethead[i].sch_length)
					tcpstat.tcps_sc_bucket_maxlen =
					    set->scs_buckethead[i].sch_length;
			}
			tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
			tcpstat.tcps_sc_uses_left = set->scs_use;
		}
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    &tcpstat, sizeof(tcpstat)));

	case TCPCTL_SYN_USE_LIMIT:
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit);
		if (error)
			return (error);
		if (newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
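			 * Clamping both cache sets here makes a lowered
			 * limit take effect immediately rather than only
			 * after the next reseed.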
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		return (0);

	case TCPCTL_SYN_HASH_SIZE:
		nval = tcp_syn_hash_size;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (error)
			return (error);
		if (nval != tcp_syn_hash_size) {
			if (nval < 1 || nval > 100000)
				return (EINVAL);
			/*
			 * If global hash size has been changed, switch sets as
			 * soon as possible.  Then the actual hash array will
			 * be reallocated.
			 */
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
		}
		return (0);

	default:
		if (name[0] < TCPCTL_MAXID)
			return (sysctl_int_arr(tcpctl_vars, name, namelen,
			    oldp, oldlenp, newp, newlen));
		return (ENOPROTOOPT);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window: if the
 * receiver stops acking data, the window will shrink and therefore the
 * buffer size will shrink as well.
 * In a low-memory situation, try to shrink the buffer to the initial size,
 * disabling send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(&so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(&so->so_snd, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If a large part of the recv buffer was
 * transferred during that time, increase the buffer by a constant.
 * In a low-memory situation, try to shrink the buffer to the initial size.
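 *
 * For illustration, with the defaults at the top of this file
 * (tcp_recvspace and tcp_autorcvbuf_inc both 16 KB): if more than 7/8
 * of the current buffer was received within the last measured RTT,
 * sb_hiwat grows by another 16 KB, bounded by sb_max.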
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(&so->so_rcv, nmax);
}