1 /* $OpenBSD: tcp_usrreq.c,v 1.144 2017/02/09 15:19:32 jca Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcpip.h> 98 #include <netinet/tcp_debug.h> 99 100 #ifdef INET6 101 #include <netinet6/in6_var.h> 102 #endif 103 104 #ifndef TCP_SENDSPACE 105 #define TCP_SENDSPACE 1024*16 106 #endif 107 u_int tcp_sendspace = TCP_SENDSPACE; 108 #ifndef TCP_RECVSPACE 109 #define TCP_RECVSPACE 1024*16 110 #endif 111 u_int tcp_recvspace = TCP_RECVSPACE; 112 u_int tcp_autorcvbuf_inc = 16 * 1024; 113 114 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 115 116 struct inpcbtable tcbtable; 117 118 int tcp_ident(void *, size_t *, void *, size_t, int); 119 120 /* 121 * Process a TCP user request for TCP tb. If this is a send request 122 * then m is the mbuf chain of send data. If this is a timer expiration 123 * (called from the software clock routine), then timertype tells which timer. 
124 */ 125 /*ARGSUSED*/ 126 int 127 tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 128 struct mbuf *control, struct proc *p) 129 { 130 struct sockaddr_in *sin; 131 struct inpcb *inp; 132 struct tcpcb *tp = NULL; 133 int error = 0; 134 short ostate; 135 136 NET_ASSERT_LOCKED(); 137 138 if (req == PRU_CONTROL) { 139 #ifdef INET6 140 if (sotopf(so) == PF_INET6) 141 return in6_control(so, (u_long)m, (caddr_t)nam, 142 (struct ifnet *)control); 143 else 144 #endif /* INET6 */ 145 return (in_control(so, (u_long)m, (caddr_t)nam, 146 (struct ifnet *)control)); 147 } 148 if (control && control->m_len) { 149 m_freem(control); 150 m_freem(m); 151 return (EINVAL); 152 } 153 154 inp = sotoinpcb(so); 155 /* 156 * When a TCP is attached to a socket, then there will be 157 * a (struct inpcb) pointed at by the socket, and this 158 * structure will point at a subsidiary (struct tcpcb). 159 */ 160 if (inp == NULL && req != PRU_ATTACH) { 161 error = so->so_error; 162 if (error == 0) 163 error = EINVAL; 164 /* 165 * The following corrects an mbuf leak under rare 166 * circumstances 167 */ 168 if (req == PRU_SEND || req == PRU_SENDOOB) 169 m_freem(m); 170 return (error); 171 } 172 if (inp) { 173 tp = intotcpcb(inp); 174 /* tp might get 0 when using socket splicing */ 175 if (tp == NULL) { 176 return (0); 177 } 178 #ifdef KPROF 179 tcp_acounts[tp->t_state][req]++; 180 #endif 181 ostate = tp->t_state; 182 } else 183 ostate = 0; 184 switch (req) { 185 186 /* 187 * TCP attaches to socket via PRU_ATTACH, reserving space, 188 * and an internet control block. 189 */ 190 case PRU_ATTACH: 191 if (inp) { 192 error = EISCONN; 193 break; 194 } 195 error = tcp_attach(so); 196 if (error) 197 break; 198 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 199 so->so_linger = TCP_LINGERTIME; 200 tp = sototcpcb(so); 201 break; 202 203 /* 204 * PRU_DETACH detaches the TCP protocol from the socket. 
205 * If the protocol state is non-embryonic, then can't 206 * do this directly: have to initiate a PRU_DISCONNECT, 207 * which may finish later; embryonic TCB's can just 208 * be discarded here. 209 */ 210 case PRU_DETACH: 211 tp = tcp_disconnect(tp); 212 break; 213 214 /* 215 * Give the socket an address. 216 */ 217 case PRU_BIND: 218 error = in_pcbbind(inp, nam, p); 219 break; 220 221 /* 222 * Prepare to accept connections. 223 */ 224 case PRU_LISTEN: 225 if (inp->inp_lport == 0) 226 error = in_pcbbind(inp, NULL, p); 227 /* If the in_pcbbind() above is called, the tp->pf 228 should still be whatever it was before. */ 229 if (error == 0) 230 tp->t_state = TCPS_LISTEN; 231 break; 232 233 /* 234 * Initiate connection to peer. 235 * Create a template for use in transmissions on this connection. 236 * Enter SYN_SENT state, and mark socket as connecting. 237 * Start keep-alive timer, and seed output sequence space. 238 * Send initial segment on connection. 239 */ 240 case PRU_CONNECT: 241 sin = mtod(nam, struct sockaddr_in *); 242 243 #ifdef INET6 244 if (sin->sin_family == AF_INET6) { 245 struct in6_addr *in6_addr = &mtod(nam, 246 struct sockaddr_in6 *)->sin6_addr; 247 248 if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) || 249 IN6_IS_ADDR_MULTICAST(in6_addr) || 250 IN6_IS_ADDR_V4MAPPED(in6_addr)) { 251 error = EINVAL; 252 break; 253 } 254 255 error = in6_pcbconnect(inp, nam); 256 } else if (sin->sin_family == AF_INET) 257 #endif /* INET6 */ 258 { 259 if ((sin->sin_addr.s_addr == INADDR_ANY) || 260 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 261 IN_MULTICAST(sin->sin_addr.s_addr) || 262 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 263 error = EINVAL; 264 break; 265 } 266 267 error = in_pcbconnect(inp, nam); 268 } 269 270 if (error) 271 break; 272 273 tp->t_template = tcp_template(tp); 274 if (tp->t_template == 0) { 275 in_pcbdisconnect(inp); 276 error = ENOBUFS; 277 break; 278 } 279 280 so->so_state |= SS_CONNECTOUT; 281 282 /* Compute window scaling to request. 
*/ 283 tcp_rscale(tp, sb_max); 284 285 soisconnecting(so); 286 tcpstat_inc(tcps_connattempt); 287 tp->t_state = TCPS_SYN_SENT; 288 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 289 tcp_set_iss_tsm(tp); 290 tcp_sendseqinit(tp); 291 #if defined(TCP_SACK) 292 tp->snd_last = tp->snd_una; 293 #endif 294 #if defined(TCP_SACK) && defined(TCP_FACK) 295 tp->snd_fack = tp->snd_una; 296 tp->retran_data = 0; 297 tp->snd_awnd = 0; 298 #endif 299 error = tcp_output(tp); 300 break; 301 302 /* 303 * Create a TCP connection between two sockets. 304 */ 305 case PRU_CONNECT2: 306 error = EOPNOTSUPP; 307 break; 308 309 /* 310 * Initiate disconnect from peer. 311 * If connection never passed embryonic stage, just drop; 312 * else if don't need to let data drain, then can just drop anyways, 313 * else have to begin TCP shutdown process: mark socket disconnecting, 314 * drain unread data, state switch to reflect user close, and 315 * send segment (e.g. FIN) to peer. Socket will be really disconnected 316 * when peer sends FIN and acks ours. 317 * 318 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 319 */ 320 case PRU_DISCONNECT: 321 tp = tcp_disconnect(tp); 322 break; 323 324 /* 325 * Accept a connection. Essentially all the work is 326 * done at higher levels; just return the address 327 * of the peer, storing through addr. 328 */ 329 case PRU_ACCEPT: 330 #ifdef INET6 331 if (inp->inp_flags & INP_IPV6) 332 in6_setpeeraddr(inp, nam); 333 else 334 #endif 335 in_setpeeraddr(inp, nam); 336 break; 337 338 /* 339 * Mark the connection as being incapable of further output. 340 */ 341 case PRU_SHUTDOWN: 342 if (so->so_state & SS_CANTSENDMORE) 343 break; 344 socantsendmore(so); 345 tp = tcp_usrclosed(tp); 346 if (tp) 347 error = tcp_output(tp); 348 break; 349 350 /* 351 * After a receive, possibly send window update to peer. 352 */ 353 case PRU_RCVD: 354 /* 355 * soreceive() calls this function when a user receives 356 * ancillary data on a listening socket. 
We don't call 357 * tcp_output in such a case, since there is no header 358 * template for a listening socket and hence the kernel 359 * will panic. 360 */ 361 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 362 (void) tcp_output(tp); 363 break; 364 365 /* 366 * Do a send by putting data in output queue and updating urgent 367 * marker if URG set. Possibly send more data. 368 */ 369 case PRU_SEND: 370 sbappendstream(&so->so_snd, m); 371 error = tcp_output(tp); 372 break; 373 374 /* 375 * Abort the TCP. 376 */ 377 case PRU_ABORT: 378 tp = tcp_drop(tp, ECONNABORTED); 379 break; 380 381 case PRU_SENSE: 382 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 383 return (0); 384 385 case PRU_RCVOOB: 386 if ((so->so_oobmark == 0 && 387 (so->so_state & SS_RCVATMARK) == 0) || 388 so->so_options & SO_OOBINLINE || 389 tp->t_oobflags & TCPOOB_HADDATA) { 390 error = EINVAL; 391 break; 392 } 393 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 394 error = EWOULDBLOCK; 395 break; 396 } 397 m->m_len = 1; 398 *mtod(m, caddr_t) = tp->t_iobc; 399 if (((long)nam & MSG_PEEK) == 0) 400 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 401 break; 402 403 case PRU_SENDOOB: 404 if (sbspace(&so->so_snd) < -512) { 405 m_freem(m); 406 error = ENOBUFS; 407 break; 408 } 409 /* 410 * According to RFC961 (Assigned Protocols), 411 * the urgent pointer points to the last octet 412 * of urgent data. We continue, however, 413 * to consider it to indicate the first octet 414 * of data past the urgent section. 415 * Otherwise, snd_up should be one lower. 
416 */ 417 sbappendstream(&so->so_snd, m); 418 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 419 tp->t_force = 1; 420 error = tcp_output(tp); 421 tp->t_force = 0; 422 break; 423 424 case PRU_SOCKADDR: 425 #ifdef INET6 426 if (inp->inp_flags & INP_IPV6) 427 in6_setsockaddr(inp, nam); 428 else 429 #endif 430 in_setsockaddr(inp, nam); 431 break; 432 433 case PRU_PEERADDR: 434 #ifdef INET6 435 if (inp->inp_flags & INP_IPV6) 436 in6_setpeeraddr(inp, nam); 437 else 438 #endif 439 in_setpeeraddr(inp, nam); 440 break; 441 442 default: 443 panic("tcp_usrreq"); 444 } 445 if (tp && (so->so_options & SO_DEBUG)) 446 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0); 447 return (error); 448 } 449 450 int 451 tcp_ctloutput(int op, struct socket *so, int level, int optname, 452 struct mbuf *m) 453 { 454 int error = 0; 455 struct inpcb *inp; 456 struct tcpcb *tp; 457 int i; 458 459 inp = sotoinpcb(so); 460 if (inp == NULL) { 461 if (op == PRCO_SETOPT) 462 (void) m_free(m); 463 return (ECONNRESET); 464 } 465 if (level != IPPROTO_TCP) { 466 switch (so->so_proto->pr_domain->dom_family) { 467 #ifdef INET6 468 case PF_INET6: 469 error = ip6_ctloutput(op, so, level, optname, m); 470 break; 471 #endif /* INET6 */ 472 case PF_INET: 473 error = ip_ctloutput(op, so, level, optname, m); 474 break; 475 default: 476 error = EAFNOSUPPORT; /*?*/ 477 break; 478 } 479 return (error); 480 } 481 tp = intotcpcb(inp); 482 483 switch (op) { 484 485 case PRCO_SETOPT: 486 switch (optname) { 487 488 case TCP_NODELAY: 489 if (m == NULL || m->m_len < sizeof (int)) 490 error = EINVAL; 491 else if (*mtod(m, int *)) 492 tp->t_flags |= TF_NODELAY; 493 else 494 tp->t_flags &= ~TF_NODELAY; 495 break; 496 497 case TCP_NOPUSH: 498 if (m == NULL || m->m_len < sizeof (int)) 499 error = EINVAL; 500 else if (*mtod(m, int *)) 501 tp->t_flags |= TF_NOPUSH; 502 else if (tp->t_flags & TF_NOPUSH) { 503 tp->t_flags &= ~TF_NOPUSH; 504 if (TCPS_HAVEESTABLISHED(tp->t_state)) 505 error = tcp_output(tp); 506 } 507 break; 508 509 
case TCP_MAXSEG: 510 if (m == NULL || m->m_len < sizeof (int)) { 511 error = EINVAL; 512 break; 513 } 514 515 i = *mtod(m, int *); 516 if (i > 0 && i <= tp->t_maxseg) 517 tp->t_maxseg = i; 518 else 519 error = EINVAL; 520 break; 521 522 #ifdef TCP_SACK 523 case TCP_SACK_ENABLE: 524 if (m == NULL || m->m_len < sizeof (int)) { 525 error = EINVAL; 526 break; 527 } 528 529 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 530 error = EPERM; 531 break; 532 } 533 534 if (tp->t_flags & TF_SIGNATURE) { 535 error = EPERM; 536 break; 537 } 538 539 if (*mtod(m, int *)) 540 tp->sack_enable = 1; 541 else 542 tp->sack_enable = 0; 543 break; 544 #endif 545 #ifdef TCP_SIGNATURE 546 case TCP_MD5SIG: 547 if (m == NULL || m->m_len < sizeof (int)) { 548 error = EINVAL; 549 break; 550 } 551 552 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 553 error = EPERM; 554 break; 555 } 556 557 if (*mtod(m, int *)) { 558 tp->t_flags |= TF_SIGNATURE; 559 #ifdef TCP_SACK 560 tp->sack_enable = 0; 561 #endif /* TCP_SACK */ 562 } else 563 tp->t_flags &= ~TF_SIGNATURE; 564 break; 565 #endif /* TCP_SIGNATURE */ 566 default: 567 error = ENOPROTOOPT; 568 break; 569 } 570 m_free(m); 571 break; 572 573 case PRCO_GETOPT: 574 m->m_len = sizeof(int); 575 576 switch (optname) { 577 case TCP_NODELAY: 578 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 579 break; 580 case TCP_NOPUSH: 581 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 582 break; 583 case TCP_MAXSEG: 584 *mtod(m, int *) = tp->t_maxseg; 585 break; 586 #ifdef TCP_SACK 587 case TCP_SACK_ENABLE: 588 *mtod(m, int *) = tp->sack_enable; 589 break; 590 #endif 591 #ifdef TCP_SIGNATURE 592 case TCP_MD5SIG: 593 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 594 break; 595 #endif 596 default: 597 error = ENOPROTOOPT; 598 break; 599 } 600 break; 601 } 602 return (error); 603 } 604 605 /* 606 * Attach TCP protocol to socket, allocating 607 * internet protocol control block, tcp control block, 608 * bufer space, and entering LISTEN state if to accept connections. 
609 */ 610 int 611 tcp_attach(struct socket *so) 612 { 613 struct tcpcb *tp; 614 struct inpcb *inp; 615 int error; 616 617 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 618 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 619 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 620 error = soreserve(so, tcp_sendspace, tcp_recvspace); 621 if (error) 622 return (error); 623 } 624 625 error = in_pcballoc(so, &tcbtable); 626 if (error) 627 return (error); 628 inp = sotoinpcb(so); 629 tp = tcp_newtcpcb(inp); 630 if (tp == NULL) { 631 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 632 633 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 634 in_pcbdetach(inp); 635 so->so_state |= nofd; 636 return (ENOBUFS); 637 } 638 tp->t_state = TCPS_CLOSED; 639 #ifdef INET6 640 /* we disallow IPv4 mapped address completely. */ 641 if (inp->inp_flags & INP_IPV6) 642 tp->pf = PF_INET6; 643 else 644 tp->pf = PF_INET; 645 #else 646 tp->pf = PF_INET; 647 #endif 648 return (0); 649 } 650 651 /* 652 * Initiate (or continue) disconnect. 653 * If embryonic state, just send reset (once). 654 * If in ``let data drain'' option and linger null, just drop. 655 * Otherwise (hard), mark socket disconnecting and drop 656 * current input data; switch states based on user close, and 657 * send segment to peer (with FIN). 658 */ 659 struct tcpcb * 660 tcp_disconnect(struct tcpcb *tp) 661 { 662 struct socket *so = tp->t_inpcb->inp_socket; 663 664 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 665 tp = tcp_close(tp); 666 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 667 tp = tcp_drop(tp, 0); 668 else { 669 soisdisconnecting(so); 670 sbflush(&so->so_rcv); 671 tp = tcp_usrclosed(tp); 672 if (tp) 673 (void) tcp_output(tp); 674 } 675 return (tp); 676 } 677 678 /* 679 * User issued close, and wish to trail through shutdown states: 680 * if never received SYN, just forget it. 
If got a SYN from peer, 681 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 682 * If already got a FIN from peer, then almost done; go to LAST_ACK 683 * state. In all other cases, have already sent FIN to peer (e.g. 684 * after PRU_SHUTDOWN), and just have to play tedious game waiting 685 * for peer to send FIN or not respond to keep-alives, etc. 686 * We can let the user exit from the close as soon as the FIN is acked. 687 */ 688 struct tcpcb * 689 tcp_usrclosed(struct tcpcb *tp) 690 { 691 692 switch (tp->t_state) { 693 694 case TCPS_CLOSED: 695 case TCPS_LISTEN: 696 case TCPS_SYN_SENT: 697 tp->t_state = TCPS_CLOSED; 698 tp = tcp_close(tp); 699 break; 700 701 case TCPS_SYN_RECEIVED: 702 case TCPS_ESTABLISHED: 703 tp->t_state = TCPS_FIN_WAIT_1; 704 break; 705 706 case TCPS_CLOSE_WAIT: 707 tp->t_state = TCPS_LAST_ACK; 708 break; 709 } 710 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 711 soisdisconnected(tp->t_inpcb->inp_socket); 712 /* 713 * If we are in FIN_WAIT_2, we arrived here because the 714 * application did a shutdown of the send side. Like the 715 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 716 * a full close, we start a timer to make sure sockets are 717 * not left in FIN_WAIT_2 forever. 718 */ 719 if (tp->t_state == TCPS_FIN_WAIT_2) 720 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 721 } 722 return (tp); 723 } 724 725 /* 726 * Look up a socket for ident or tcpdrop, ... 
727 */ 728 int 729 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 730 { 731 int error = 0; 732 struct tcp_ident_mapping tir; 733 struct inpcb *inp; 734 struct tcpcb *tp = NULL; 735 struct sockaddr_in *fin, *lin; 736 #ifdef INET6 737 struct sockaddr_in6 *fin6, *lin6; 738 struct in6_addr f6, l6; 739 #endif 740 741 splsoftassert(IPL_SOFTNET); 742 743 if (dodrop) { 744 if (oldp != NULL || *oldlenp != 0) 745 return (EINVAL); 746 if (newp == NULL) 747 return (EPERM); 748 if (newlen < sizeof(tir)) 749 return (ENOMEM); 750 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 751 return (error); 752 } else { 753 if (oldp == NULL) 754 return (EINVAL); 755 if (*oldlenp < sizeof(tir)) 756 return (ENOMEM); 757 if (newp != NULL || newlen != 0) 758 return (EINVAL); 759 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 760 return (error); 761 } 762 switch (tir.faddr.ss_family) { 763 #ifdef INET6 764 case AF_INET6: 765 fin6 = (struct sockaddr_in6 *)&tir.faddr; 766 error = in6_embedscope(&f6, fin6, NULL); 767 if (error) 768 return EINVAL; /*?*/ 769 lin6 = (struct sockaddr_in6 *)&tir.laddr; 770 error = in6_embedscope(&l6, lin6, NULL); 771 if (error) 772 return EINVAL; /*?*/ 773 break; 774 #endif 775 case AF_INET: 776 fin = (struct sockaddr_in *)&tir.faddr; 777 lin = (struct sockaddr_in *)&tir.laddr; 778 break; 779 default: 780 return (EINVAL); 781 } 782 783 switch (tir.faddr.ss_family) { 784 #ifdef INET6 785 case AF_INET6: 786 inp = in6_pcbhashlookup(&tcbtable, &f6, 787 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 788 break; 789 #endif 790 case AF_INET: 791 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 792 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 793 break; 794 default: 795 unhandled_af(tir.faddr.ss_family); 796 } 797 798 if (dodrop) { 799 if (inp && (tp = intotcpcb(inp)) && 800 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 801 tp = tcp_drop(tp, ECONNABORTED); 802 else 803 error = ESRCH; 804 return (error); 805 
} 806 807 if (inp == NULL) { 808 tcpstat_inc(tcps_pcbhashmiss); 809 switch (tir.faddr.ss_family) { 810 #ifdef INET6 811 case AF_INET6: 812 inp = in6_pcblookup_listen(&tcbtable, 813 &l6, lin6->sin6_port, 0, NULL, tir.rdomain); 814 break; 815 #endif 816 case AF_INET: 817 inp = in_pcblookup_listen(&tcbtable, 818 lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain); 819 break; 820 } 821 } 822 823 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 824 tir.ruid = inp->inp_socket->so_ruid; 825 tir.euid = inp->inp_socket->so_euid; 826 } else { 827 tir.ruid = -1; 828 tir.euid = -1; 829 } 830 831 *oldlenp = sizeof (tir); 832 error = copyout((void *)&tir, oldp, sizeof (tir)); 833 return (error); 834 } 835 836 int 837 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 838 { 839 struct tcpstat tcpstat; 840 struct counters_ref cr; 841 uint64_t *counters; 842 struct syn_cache_set *set; 843 int i = 0; 844 845 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 846 847 counters = counters_enter(&cr, tcpcounters); 848 ASSIGN(tcps_connattempt); 849 ASSIGN(tcps_accepts); 850 ASSIGN(tcps_connects); 851 ASSIGN(tcps_drops); 852 ASSIGN(tcps_conndrops); 853 ASSIGN(tcps_closed); 854 ASSIGN(tcps_segstimed); 855 ASSIGN(tcps_rttupdated); 856 ASSIGN(tcps_delack); 857 ASSIGN(tcps_timeoutdrop); 858 ASSIGN(tcps_rexmttimeo); 859 ASSIGN(tcps_persisttimeo); 860 ASSIGN(tcps_persistdrop); 861 ASSIGN(tcps_keeptimeo); 862 ASSIGN(tcps_keepprobe); 863 ASSIGN(tcps_keepdrops); 864 ASSIGN(tcps_sndtotal); 865 ASSIGN(tcps_sndpack); 866 ASSIGN(tcps_sndbyte); 867 ASSIGN(tcps_sndrexmitpack); 868 ASSIGN(tcps_sndrexmitbyte); 869 ASSIGN(tcps_sndrexmitfast); 870 ASSIGN(tcps_sndacks); 871 ASSIGN(tcps_sndprobe); 872 ASSIGN(tcps_sndurg); 873 ASSIGN(tcps_sndwinup); 874 ASSIGN(tcps_sndctrl); 875 ASSIGN(tcps_rcvtotal); 876 ASSIGN(tcps_rcvpack); 877 ASSIGN(tcps_rcvbyte); 878 ASSIGN(tcps_rcvbadsum); 879 ASSIGN(tcps_rcvbadoff); 880 ASSIGN(tcps_rcvmemdrop); 881 ASSIGN(tcps_rcvnosec); 882 
ASSIGN(tcps_rcvshort); 883 ASSIGN(tcps_rcvduppack); 884 ASSIGN(tcps_rcvdupbyte); 885 ASSIGN(tcps_rcvpartduppack); 886 ASSIGN(tcps_rcvpartdupbyte); 887 ASSIGN(tcps_rcvoopack); 888 ASSIGN(tcps_rcvoobyte); 889 ASSIGN(tcps_rcvpackafterwin); 890 ASSIGN(tcps_rcvbyteafterwin); 891 ASSIGN(tcps_rcvafterclose); 892 ASSIGN(tcps_rcvwinprobe); 893 ASSIGN(tcps_rcvdupack); 894 ASSIGN(tcps_rcvacktoomuch); 895 ASSIGN(tcps_rcvacktooold); 896 ASSIGN(tcps_rcvackpack); 897 ASSIGN(tcps_rcvackbyte); 898 ASSIGN(tcps_rcvwinupd); 899 ASSIGN(tcps_pawsdrop); 900 ASSIGN(tcps_predack); 901 ASSIGN(tcps_preddat); 902 ASSIGN(tcps_pcbhashmiss); 903 ASSIGN(tcps_noport); 904 ASSIGN(tcps_badsyn); 905 ASSIGN(tcps_dropsyn); 906 ASSIGN(tcps_rcvbadsig); 907 ASSIGN(tcps_rcvgoodsig); 908 ASSIGN(tcps_inswcsum); 909 ASSIGN(tcps_outswcsum); 910 ASSIGN(tcps_ecn_accepts); 911 ASSIGN(tcps_ecn_rcvece); 912 ASSIGN(tcps_ecn_rcvcwr); 913 ASSIGN(tcps_ecn_rcvce); 914 ASSIGN(tcps_ecn_sndect); 915 ASSIGN(tcps_ecn_sndece); 916 ASSIGN(tcps_ecn_sndcwr); 917 ASSIGN(tcps_cwr_ecn); 918 ASSIGN(tcps_cwr_frecovery); 919 ASSIGN(tcps_cwr_timeout); 920 ASSIGN(tcps_sc_added); 921 ASSIGN(tcps_sc_completed); 922 ASSIGN(tcps_sc_timed_out); 923 ASSIGN(tcps_sc_overflowed); 924 ASSIGN(tcps_sc_reset); 925 ASSIGN(tcps_sc_unreach); 926 ASSIGN(tcps_sc_bucketoverflow); 927 ASSIGN(tcps_sc_aborted); 928 ASSIGN(tcps_sc_dupesyn); 929 ASSIGN(tcps_sc_dropped); 930 ASSIGN(tcps_sc_collisions); 931 ASSIGN(tcps_sc_retransmitted); 932 ASSIGN(tcps_sc_seedrandom); 933 ASSIGN(tcps_sc_hash_size); 934 ASSIGN(tcps_sc_entry_count); 935 ASSIGN(tcps_sc_entry_limit); 936 ASSIGN(tcps_sc_bucket_maxlen); 937 ASSIGN(tcps_sc_bucket_limit); 938 ASSIGN(tcps_sc_uses_left); 939 ASSIGN(tcps_conndrained); 940 ASSIGN(tcps_sack_recovery_episode); 941 ASSIGN(tcps_sack_rexmits); 942 ASSIGN(tcps_sack_rexmit_bytes); 943 ASSIGN(tcps_sack_rcv_opts); 944 ASSIGN(tcps_sack_snd_opts); 945 counters_leave(&cr, tcpcounters); 946 947 #undef ASSIGN 948 949 set = 
&tcp_syn_cache[tcp_syn_cache_active]; 950 tcpstat.tcps_sc_hash_size = set->scs_size; 951 tcpstat.tcps_sc_entry_count = set->scs_count; 952 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 953 tcpstat.tcps_sc_bucket_maxlen = 0; 954 for (i = 0; i < set->scs_size; i++) { 955 if (tcpstat.tcps_sc_bucket_maxlen < 956 set->scs_buckethead[i].sch_length) 957 tcpstat.tcps_sc_bucket_maxlen = 958 set->scs_buckethead[i].sch_length; 959 } 960 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 961 tcpstat.tcps_sc_uses_left = set->scs_use; 962 963 return (sysctl_rdstruct(oldp, oldlenp, newp, 964 &tcpstat, sizeof(tcpstat))); 965 } 966 967 /* 968 * Sysctl for tcp variables. 969 */ 970 int 971 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 972 size_t newlen) 973 { 974 int error, nval; 975 976 NET_ASSERT_LOCKED(); 977 978 /* All sysctl names at this level are terminal. */ 979 if (namelen != 1) 980 return (ENOTDIR); 981 982 switch (name[0]) { 983 #ifdef TCP_SACK 984 case TCPCTL_SACK: 985 return (sysctl_int(oldp, oldlenp, newp, newlen, 986 &tcp_do_sack)); 987 #endif 988 case TCPCTL_SLOWHZ: 989 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 990 991 case TCPCTL_BADDYNAMIC: 992 return (sysctl_struct(oldp, oldlenp, newp, newlen, 993 baddynamicports.tcp, sizeof(baddynamicports.tcp))); 994 995 case TCPCTL_ROOTONLY: 996 if (newp && securelevel > 0) 997 return (EPERM); 998 return (sysctl_struct(oldp, oldlenp, newp, newlen, 999 rootonlyports.tcp, sizeof(rootonlyports.tcp))); 1000 1001 case TCPCTL_IDENT: 1002 return (tcp_ident(oldp, oldlenp, newp, newlen, 0)); 1003 1004 case TCPCTL_DROP: 1005 return (tcp_ident(oldp, oldlenp, newp, newlen, 1)); 1006 1007 case TCPCTL_ALWAYS_KEEPALIVE: 1008 return (sysctl_int(oldp, oldlenp, newp, newlen, 1009 &tcp_always_keepalive)); 1010 1011 #ifdef TCP_ECN 1012 case TCPCTL_ECN: 1013 return (sysctl_int(oldp, oldlenp, newp, newlen, 1014 &tcp_do_ecn)); 1015 #endif 1016 case TCPCTL_REASS_LIMIT: 1017 nval = tcp_reass_limit; 
1018 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1019 if (error) 1020 return (error); 1021 if (nval != tcp_reass_limit) { 1022 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1023 if (error) 1024 return (error); 1025 tcp_reass_limit = nval; 1026 } 1027 return (0); 1028 #ifdef TCP_SACK 1029 case TCPCTL_SACKHOLE_LIMIT: 1030 nval = tcp_sackhole_limit; 1031 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1032 if (error) 1033 return (error); 1034 if (nval != tcp_sackhole_limit) { 1035 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1036 if (error) 1037 return (error); 1038 tcp_sackhole_limit = nval; 1039 } 1040 return (0); 1041 #endif 1042 1043 case TCPCTL_STATS: 1044 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1045 1046 case TCPCTL_SYN_USE_LIMIT: 1047 error = sysctl_int(oldp, oldlenp, newp, newlen, 1048 &tcp_syn_use_limit); 1049 if (error) 1050 return (error); 1051 if (newp != NULL) { 1052 /* 1053 * Global tcp_syn_use_limit is used when reseeding a 1054 * new cache. Also update the value in active cache. 1055 */ 1056 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1057 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1058 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1059 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1060 } 1061 return (0); 1062 1063 case TCPCTL_SYN_HASH_SIZE: 1064 nval = tcp_syn_hash_size; 1065 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1066 if (error) 1067 return (error); 1068 if (nval != tcp_syn_hash_size) { 1069 if (nval < 1 || nval > 100000) 1070 return (EINVAL); 1071 /* 1072 * If global hash size has been changed, switch sets as 1073 * soon as possible. Then the actual hash array will 1074 * be reallocated. 
1075 */ 1076 if (tcp_syn_cache[0].scs_size != nval) 1077 tcp_syn_cache[0].scs_use = 0; 1078 if (tcp_syn_cache[1].scs_size != nval) 1079 tcp_syn_cache[1].scs_use = 0; 1080 tcp_syn_hash_size = nval; 1081 } 1082 return (0); 1083 1084 default: 1085 if (name[0] < TCPCTL_MAXID) 1086 return (sysctl_int_arr(tcpctl_vars, name, namelen, 1087 oldp, oldlenp, newp, newlen)); 1088 return (ENOPROTOOPT); 1089 } 1090 /* NOTREACHED */ 1091 } 1092 1093 /* 1094 * Scale the send buffer so that inflight data is not accounted against 1095 * the limit. The buffer will scale with the congestion window, if the 1096 * the receiver stops acking data the window will shrink and therefor 1097 * the buffer size will shrink as well. 1098 * In low memory situation try to shrink the buffer to the initial size 1099 * disabling the send buffer scaling as long as the situation persists. 1100 */ 1101 void 1102 tcp_update_sndspace(struct tcpcb *tp) 1103 { 1104 struct socket *so = tp->t_inpcb->inp_socket; 1105 u_long nmax = so->so_snd.sb_hiwat; 1106 1107 if (sbchecklowmem()) { 1108 /* low on memory try to get rid of some */ 1109 if (tcp_sendspace < nmax) 1110 nmax = tcp_sendspace; 1111 } else if (so->so_snd.sb_wat != tcp_sendspace) 1112 /* user requested buffer size, auto-scaling disabled */ 1113 nmax = so->so_snd.sb_wat; 1114 else 1115 /* automatic buffer scaling */ 1116 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1117 tp->snd_una); 1118 1119 /* a writable socket must be preserved because of poll(2) semantics */ 1120 if (sbspace(&so->so_snd) >= so->so_snd.sb_lowat) { 1121 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1122 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1123 if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1124 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2; 1125 } 1126 1127 /* round to MSS boundary */ 1128 nmax = roundup(nmax, tp->t_maxseg); 1129 1130 if (nmax != so->so_snd.sb_hiwat) 1131 sbreserve(&so->so_snd, nmax); 1132 } 1133 1134 /* 1135 * Scale the 
recv buffer by looking at how much data was transferred in
 * one approximated RTT. If more than a big part of the recv buffer was
 * transferred during that time we increase the buffer by a constant.
 * In low memory situation try to shrink the buffer to the initial size.
 *
 * The new size is applied with sbreserve() only when it differs from
 * the current high-water mark.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		/* grow once more than 7/8 of the buffer was received */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/*
	 * a readable socket must be preserved because of poll(2) semantics
	 *
	 * The floor is the receive buffer's own low-water mark; the
	 * send buffer's mark has no bearing on readability.
	 */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_rcv.sb_lowat)
		nmax = so->so_rcv.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(&so->so_rcv, nmax);
}