1 /* $OpenBSD: tcp_usrreq.c,v 1.170 2018/11/04 19:36:25 bluhm Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcp_debug.h> 98 99 #ifdef INET6 100 #include <netinet6/in6_var.h> 101 #endif 102 103 #ifndef TCP_SENDSPACE 104 #define TCP_SENDSPACE 1024*16 105 #endif 106 u_int tcp_sendspace = TCP_SENDSPACE; 107 #ifndef TCP_RECVSPACE 108 #define TCP_RECVSPACE 1024*16 109 #endif 110 u_int tcp_recvspace = TCP_RECVSPACE; 111 u_int tcp_autorcvbuf_inc = 16 * 1024; 112 113 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 114 115 struct inpcbtable tcbtable; 116 117 int tcp_ident(void *, size_t *, void *, size_t, int); 118 119 /* 120 * Process a TCP user request for TCP tb. If this is a send request 121 * then m is the mbuf chain of send data. If this is a timer expiration 122 * (called from the software clock routine), then timertype tells which timer. 
 */
/*ARGSUSED*/
int
tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp = NULL;
	int error = 0;
	short ostate;

	/*
	 * PRU_CONTROL is an ioctl on the socket, not a TCP request;
	 * hand it straight to the per-family in*_control() handler.
	 * The m/nam/control arguments are overloaded as cmd/data/ifp here.
	 */
	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}

	soassertlocked(so);

	/* TCP does not accept ancillary data on send; reject and free. */
	if (control && control->m_len) {
		m_freem(control);
		m_freem(m);
		return (EINVAL);
	}

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		/*
		 * The following corrects an mbuf leak under rare
		 * circumstances
		 */
		if (req == PRU_SEND || req == PRU_SENDOOB)
			m_freem(m);
		return (error);
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		return (0);
	/* Remember pre-request state so tcp_trace() can log the transition. */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	switch (req) {

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6) {
			struct sockaddr_in6 *sin6;

			if ((error = in6_nam2sin6(nam, &sin6)))
				break;
			/* Refuse unspecified and multicast destinations. */
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
				error = EINVAL;
				break;
			}
			error = in6_pcbconnect(inp, nam);
		} else
#endif /* INET6 */
		{
			struct sockaddr_in *sin;

			if ((error = in_nam2sin(nam, &sin)))
				break;
			/* Refuse wildcard, broadcast and multicast peers. */
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}
			error = in_pcbconnect(inp, nam);
		}
		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			/* Undo the connect above before failing. */
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/* Compute window scaling to request. */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat_inc(tcps_connattempt);
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
		tp->snd_last = tp->snd_una;
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket. We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(so, &so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	/*
	 * fstat(2) support: report the send buffer high-water mark as the
	 * block size.  Returns directly, bypassing the SO_DEBUG trace below.
	 */
	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		return (0);

	case PRU_RCVOOB:
		/* No OOB data available, or it was folded inline/consumed. */
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		/* Return the single byte of urgent data held in t_iobc. */
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		/* For PRU_RCVOOB the recv flags arrive encoded in `nam'. */
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		/* Allow up to 512 bytes past the normal buffer limit. */
		if (sbspace(so, &so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream(so, &so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		/* Force a segment out even if the window is closed. */
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, req, 0);
	return (error);
}

/*
 * Handle [gs]etsockopt for level IPPROTO_TCP (TCP_NODELAY, TCP_NOPUSH,
 * TCP_MAXSEG, TCP_SACK_ENABLE, TCP_MD5SIG).  Options for any other level
 * are passed down to ip_ctloutput()/ip6_ctloutput().
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				/* Clearing NOPUSH may release queued data. */
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* The MSS may only be lowered, never raised. */
			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* SACK cannot be toggled once established. */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* SACK and MD5 signatures are mutually exclusive. */
			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				/* Enabling signatures disables SACK. */
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	/* Reserve default buffer space unless already suitably reserved. */
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	/* SO_LINGER with zero time gets the default linger interval. */
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}

/*
 * Detach TCP from the socket, initiating a graceful disconnect of any
 * established connection (mirrors the inp/tp lookup done in tcp_usrreq()).
 */
int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp = NULL;
	int error = 0;
	short ostate;

	soassertlocked(so);

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		return (error);
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		return (0);
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_disconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (error);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_disconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	/* May return NULL if the tcpcb was freed above. */
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/*
	 * TCPCTL_DROP (dodrop) is write-only; TCPCTL_IDENT is read-only.
	 * Either way the 4-tuple request is copied into `tir'.
	 */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	/* Exact 4-tuple lookup in the connection hash. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		/* Never drop a listening socket, only real connections. */
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		return (error);
	}

	/* For ident, fall back to a wildcard listen-socket lookup. */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	/* Only report uids for outbound connections; -1 means unknown. */
	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}

/*
 * Fill a struct tcpstat from the per-CPU counters plus the active
 * syn-cache set and return it read-only for net.inet.tcp.stats.
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

	/* Copy the next counter slot; relies on declaration order. */
#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);

#undef ASSIGN

	/* Syn-cache gauges are sampled live, overriding the counters. */
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_SACK:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_sack);
		NET_UNLOCK();
		return (error);

	case TCPCTL_SLOWHZ:
		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_ALWAYS_KEEPALIVE:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_always_keepalive);
		NET_UNLOCK();
		return (error);

#ifdef TCP_ECN
	case TCPCTL_ECN:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_ecn);
		NET_UNLOCK();
		return (error);
#endif
	case TCPCTL_REASS_LIMIT:
		/* Apply the new pool hard limit before committing it. */
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_syn_hash_size) {
			if (nval < 1 || nval > 100000) {
				error = EINVAL;
			} else {
				/*
				 * If global hash size has been changed,
				 * switch sets as soon as possible.  Then
				 * the actual hash array will be reallocated.
				 */
				if (tcp_syn_cache[0].scs_size != nval)
					tcp_syn_cache[0].scs_use = 0;
				if (tcp_syn_cache[1].scs_size != nval)
					tcp_syn_cache[1].scs_use = 0;
				tcp_syn_hash_size = nval;
			}
		}
		NET_UNLOCK();
		return (error);

	default:
		/* Remaining simple integers come from the tcpctl_vars table. */
		if (name[0] < TCPCTL_MAXID) {
			NET_LOCK();
			error = sysctl_int_arr(tcpctl_vars, name, namelen,
			    oldp, oldlenp, newp, newlen);
			NET_UNLOCK();
			return (error);
		}
		return (ENOPROTOOPT);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window; if the
 * receiver stops acking data the window will shrink and therefore
 * the buffer size will shrink as well.
 * In low memory situation try to shrink the buffer to the initial size
 * disabling the send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		/* keep in sync with sbreserve() calculation */
		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(so, &so->so_snd, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If more than a big part of the recv buffer was
 * transferred during that time we increase the buffer by a constant.
 * In low memory situation try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	/* NOTE(review): the floor below reads so_snd.sb_lowat, not
	 * so_rcv.sb_lowat, in this receive-path function -- verify against
	 * upstream whether this is intentional. */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(so, &so->so_rcv, nmax);
}