1 /* $OpenBSD: tcp_usrreq.c,v 1.167 2018/02/05 14:53:26 bluhm Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcp_debug.h> 98 99 #ifdef INET6 100 #include <netinet6/in6_var.h> 101 #endif 102 103 #ifndef TCP_SENDSPACE 104 #define TCP_SENDSPACE 1024*16 105 #endif 106 u_int tcp_sendspace = TCP_SENDSPACE; 107 #ifndef TCP_RECVSPACE 108 #define TCP_RECVSPACE 1024*16 109 #endif 110 u_int tcp_recvspace = TCP_RECVSPACE; 111 u_int tcp_autorcvbuf_inc = 16 * 1024; 112 113 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 114 115 struct inpcbtable tcbtable; 116 117 int tcp_ident(void *, size_t *, void *, size_t, int); 118 119 /* 120 * Process a TCP user request for TCP tb. If this is a send request 121 * then m is the mbuf chain of send data. If this is a timer expiration 122 * (called from the software clock routine), then timertype tells which timer. 123 */ 124 /*ARGSUSED*/ 125 int 126 tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 127 struct mbuf *control, struct proc *p) 128 { 129 struct inpcb *inp; 130 struct tcpcb *tp = NULL; 131 int error = 0; 132 short ostate; 133 134 soassertlocked(so); 135 136 if (req == PRU_CONTROL) { 137 #ifdef INET6 138 if (sotopf(so) == PF_INET6) 139 return in6_control(so, (u_long)m, (caddr_t)nam, 140 (struct ifnet *)control); 141 else 142 #endif /* INET6 */ 143 return (in_control(so, (u_long)m, (caddr_t)nam, 144 (struct ifnet *)control)); 145 } 146 if (control && control->m_len) { 147 m_freem(control); 148 m_freem(m); 149 return (EINVAL); 150 } 151 152 inp = sotoinpcb(so); 153 /* 154 * When a TCP is attached to a socket, then there will be 155 * a (struct inpcb) pointed at by the socket, and this 156 * structure will point at a subsidiary (struct tcpcb). 157 */ 158 if (inp == NULL) { 159 error = so->so_error; 160 if (error == 0) 161 error = EINVAL; 162 /* 163 * The following corrects an mbuf leak under rare 164 * circumstances 165 */ 166 if (req == PRU_SEND || req == PRU_SENDOOB) 167 m_freem(m); 168 return (error); 169 } 170 tp = intotcpcb(inp); 171 /* tp might get 0 when using socket splicing */ 172 if (tp == NULL) 173 return (0); 174 ostate = tp->t_state; 175 176 switch (req) { 177 178 /* 179 * Give the socket an address. 180 */ 181 case PRU_BIND: 182 error = in_pcbbind(inp, nam, p); 183 break; 184 185 /* 186 * Prepare to accept connections. 187 */ 188 case PRU_LISTEN: 189 if (inp->inp_lport == 0) 190 error = in_pcbbind(inp, NULL, p); 191 /* If the in_pcbbind() above is called, the tp->pf 192 should still be whatever it was before. */ 193 if (error == 0) 194 tp->t_state = TCPS_LISTEN; 195 break; 196 197 /* 198 * Initiate connection to peer. 199 * Create a template for use in transmissions on this connection. 200 * Enter SYN_SENT state, and mark socket as connecting. 201 * Start keep-alive timer, and seed output sequence space. 202 * Send initial segment on connection. 203 */ 204 case PRU_CONNECT: 205 #ifdef INET6 206 if (inp->inp_flags & INP_IPV6) { 207 struct sockaddr_in6 *sin6; 208 209 if ((error = in6_nam2sin6(nam, &sin6))) 210 break; 211 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 212 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 213 error = EINVAL; 214 break; 215 } 216 error = in6_pcbconnect(inp, nam); 217 } else 218 #endif /* INET6 */ 219 { 220 struct sockaddr_in *sin; 221 222 if ((error = in_nam2sin(nam, &sin))) 223 break; 224 if ((sin->sin_addr.s_addr == INADDR_ANY) || 225 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 226 IN_MULTICAST(sin->sin_addr.s_addr) || 227 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 228 error = EINVAL; 229 break; 230 } 231 error = in_pcbconnect(inp, nam); 232 } 233 if (error) 234 break; 235 236 tp->t_template = tcp_template(tp); 237 if (tp->t_template == 0) { 238 in_pcbdisconnect(inp); 239 error = ENOBUFS; 240 break; 241 } 242 243 so->so_state |= SS_CONNECTOUT; 244 245 /* Compute window scaling to request. */ 246 tcp_rscale(tp, sb_max); 247 248 soisconnecting(so); 249 tcpstat_inc(tcps_connattempt); 250 tp->t_state = TCPS_SYN_SENT; 251 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 252 tcp_set_iss_tsm(tp); 253 tcp_sendseqinit(tp); 254 tp->snd_last = tp->snd_una; 255 error = tcp_output(tp); 256 break; 257 258 /* 259 * Create a TCP connection between two sockets. 260 */ 261 case PRU_CONNECT2: 262 error = EOPNOTSUPP; 263 break; 264 265 /* 266 * Initiate disconnect from peer. 267 * If connection never passed embryonic stage, just drop; 268 * else if don't need to let data drain, then can just drop anyways, 269 * else have to begin TCP shutdown process: mark socket disconnecting, 270 * drain unread data, state switch to reflect user close, and 271 * send segment (e.g. FIN) to peer. Socket will be really disconnected 272 * when peer sends FIN and acks ours. 273 * 274 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 275 */ 276 case PRU_DISCONNECT: 277 tp = tcp_disconnect(tp); 278 break; 279 280 /* 281 * Accept a connection. Essentially all the work is 282 * done at higher levels; just return the address 283 * of the peer, storing through addr. 284 */ 285 case PRU_ACCEPT: 286 #ifdef INET6 287 if (inp->inp_flags & INP_IPV6) 288 in6_setpeeraddr(inp, nam); 289 else 290 #endif 291 in_setpeeraddr(inp, nam); 292 break; 293 294 /* 295 * Mark the connection as being incapable of further output. 296 */ 297 case PRU_SHUTDOWN: 298 if (so->so_state & SS_CANTSENDMORE) 299 break; 300 socantsendmore(so); 301 tp = tcp_usrclosed(tp); 302 if (tp) 303 error = tcp_output(tp); 304 break; 305 306 /* 307 * After a receive, possibly send window update to peer. 308 */ 309 case PRU_RCVD: 310 /* 311 * soreceive() calls this function when a user receives 312 * ancillary data on a listening socket. We don't call 313 * tcp_output in such a case, since there is no header 314 * template for a listening socket and hence the kernel 315 * will panic. 316 */ 317 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 318 (void) tcp_output(tp); 319 break; 320 321 /* 322 * Do a send by putting data in output queue and updating urgent 323 * marker if URG set. Possibly send more data. 324 */ 325 case PRU_SEND: 326 sbappendstream(so, &so->so_snd, m); 327 error = tcp_output(tp); 328 break; 329 330 /* 331 * Abort the TCP. 332 */ 333 case PRU_ABORT: 334 tp = tcp_drop(tp, ECONNABORTED); 335 break; 336 337 case PRU_SENSE: 338 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 339 return (0); 340 341 case PRU_RCVOOB: 342 if ((so->so_oobmark == 0 && 343 (so->so_state & SS_RCVATMARK) == 0) || 344 so->so_options & SO_OOBINLINE || 345 tp->t_oobflags & TCPOOB_HADDATA) { 346 error = EINVAL; 347 break; 348 } 349 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 350 error = EWOULDBLOCK; 351 break; 352 } 353 m->m_len = 1; 354 *mtod(m, caddr_t) = tp->t_iobc; 355 if (((long)nam & MSG_PEEK) == 0) 356 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 357 break; 358 359 case PRU_SENDOOB: 360 if (sbspace(so, &so->so_snd) < -512) { 361 m_freem(m); 362 error = ENOBUFS; 363 break; 364 } 365 /* 366 * According to RFC961 (Assigned Protocols), 367 * the urgent pointer points to the last octet 368 * of urgent data. We continue, however, 369 * to consider it to indicate the first octet 370 * of data past the urgent section. 371 * Otherwise, snd_up should be one lower. 372 */ 373 sbappendstream(so, &so->so_snd, m); 374 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 375 tp->t_force = 1; 376 error = tcp_output(tp); 377 tp->t_force = 0; 378 break; 379 380 case PRU_SOCKADDR: 381 #ifdef INET6 382 if (inp->inp_flags & INP_IPV6) 383 in6_setsockaddr(inp, nam); 384 else 385 #endif 386 in_setsockaddr(inp, nam); 387 break; 388 389 case PRU_PEERADDR: 390 #ifdef INET6 391 if (inp->inp_flags & INP_IPV6) 392 in6_setpeeraddr(inp, nam); 393 else 394 #endif 395 in_setpeeraddr(inp, nam); 396 break; 397 398 default: 399 panic("tcp_usrreq"); 400 } 401 if (tp && (so->so_options & SO_DEBUG)) 402 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0); 403 return (error); 404 } 405 406 int 407 tcp_ctloutput(int op, struct socket *so, int level, int optname, 408 struct mbuf *m) 409 { 410 int error = 0; 411 struct inpcb *inp; 412 struct tcpcb *tp; 413 int i; 414 415 inp = sotoinpcb(so); 416 if (inp == NULL) 417 return (ECONNRESET); 418 if (level != IPPROTO_TCP) { 419 switch (so->so_proto->pr_domain->dom_family) { 420 #ifdef INET6 421 case PF_INET6: 422 error = ip6_ctloutput(op, so, level, optname, m); 423 break; 424 #endif /* INET6 */ 425 case PF_INET: 426 error = ip_ctloutput(op, so, level, optname, m); 427 break; 428 default: 429 error = EAFNOSUPPORT; /*?*/ 430 break; 431 } 432 return (error); 433 } 434 tp = intotcpcb(inp); 435 436 switch (op) { 437 438 case PRCO_SETOPT: 439 switch (optname) { 440 441 case TCP_NODELAY: 442 if (m == NULL || m->m_len < sizeof (int)) 443 error = EINVAL; 444 else if (*mtod(m, int *)) 445 tp->t_flags |= TF_NODELAY; 446 else 447 tp->t_flags &= ~TF_NODELAY; 448 break; 449 450 case TCP_NOPUSH: 451 if (m == NULL || m->m_len < sizeof (int)) 452 error = EINVAL; 453 else if (*mtod(m, int *)) 454 tp->t_flags |= TF_NOPUSH; 455 else if (tp->t_flags & TF_NOPUSH) { 456 tp->t_flags &= ~TF_NOPUSH; 457 if (TCPS_HAVEESTABLISHED(tp->t_state)) 458 error = tcp_output(tp); 459 } 460 break; 461 462 case TCP_MAXSEG: 463 if (m == NULL || m->m_len < sizeof (int)) { 464 error = EINVAL; 465 break; 466 } 467 468 i = *mtod(m, int *); 469 if (i > 0 && i <= tp->t_maxseg) 470 tp->t_maxseg = i; 471 else 472 error = EINVAL; 473 break; 474 475 case TCP_SACK_ENABLE: 476 if (m == NULL || m->m_len < sizeof (int)) { 477 error = EINVAL; 478 break; 479 } 480 481 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 482 error = EPERM; 483 break; 484 } 485 486 if (tp->t_flags & TF_SIGNATURE) { 487 error = EPERM; 488 break; 489 } 490 491 if (*mtod(m, int *)) 492 tp->sack_enable = 1; 493 else 494 tp->sack_enable = 0; 495 break; 496 #ifdef TCP_SIGNATURE 497 case TCP_MD5SIG: 498 if (m == NULL || m->m_len < sizeof (int)) { 499 error = EINVAL; 500 break; 501 } 502 503 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 504 error = EPERM; 505 break; 506 } 507 508 if (*mtod(m, int *)) { 509 tp->t_flags |= TF_SIGNATURE; 510 tp->sack_enable = 0; 511 } else 512 tp->t_flags &= ~TF_SIGNATURE; 513 break; 514 #endif /* TCP_SIGNATURE */ 515 default: 516 error = ENOPROTOOPT; 517 break; 518 } 519 break; 520 521 case PRCO_GETOPT: 522 m->m_len = sizeof(int); 523 524 switch (optname) { 525 case TCP_NODELAY: 526 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 527 break; 528 case TCP_NOPUSH: 529 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 530 break; 531 case TCP_MAXSEG: 532 *mtod(m, int *) = tp->t_maxseg; 533 break; 534 case TCP_SACK_ENABLE: 535 *mtod(m, int *) = tp->sack_enable; 536 break; 537 #ifdef TCP_SIGNATURE 538 case TCP_MD5SIG: 539 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 540 break; 541 #endif 542 default: 543 error = ENOPROTOOPT; 544 break; 545 } 546 break; 547 } 548 return (error); 549 } 550 551 /* 552 * Attach TCP protocol to socket, allocating 553 * internet protocol control block, tcp control block, 554 * buffer space, and entering LISTEN state to accept connections. 555 */ 556 int 557 tcp_attach(struct socket *so, int proto) 558 { 559 struct tcpcb *tp; 560 struct inpcb *inp; 561 int error; 562 563 if (so->so_pcb) 564 return EISCONN; 565 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 566 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 567 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 568 error = soreserve(so, tcp_sendspace, tcp_recvspace); 569 if (error) 570 return (error); 571 } 572 573 NET_ASSERT_LOCKED(); 574 error = in_pcballoc(so, &tcbtable); 575 if (error) 576 return (error); 577 inp = sotoinpcb(so); 578 tp = tcp_newtcpcb(inp); 579 if (tp == NULL) { 580 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */ 581 582 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 583 in_pcbdetach(inp); 584 so->so_state |= nofd; 585 return (ENOBUFS); 586 } 587 tp->t_state = TCPS_CLOSED; 588 #ifdef INET6 589 /* we disallow IPv4 mapped address completely. */ 590 if (inp->inp_flags & INP_IPV6) 591 tp->pf = PF_INET6; 592 else 593 tp->pf = PF_INET; 594 #else 595 tp->pf = PF_INET; 596 #endif 597 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 598 so->so_linger = TCP_LINGERTIME; 599 600 if (so->so_options & SO_DEBUG) 601 tcp_trace(TA_USER, TCPS_CLOSED, tp, (caddr_t)0, PRU_ATTACH, 0); 602 return (0); 603 } 604 605 int 606 tcp_detach(struct socket *so) 607 { 608 struct inpcb *inp; 609 struct tcpcb *tp = NULL; 610 int error = 0; 611 short ostate; 612 613 soassertlocked(so); 614 615 inp = sotoinpcb(so); 616 /* 617 * When a TCP is attached to a socket, then there will be 618 * a (struct inpcb) pointed at by the socket, and this 619 * structure will point at a subsidiary (struct tcpcb). 620 */ 621 if (inp == NULL) { 622 error = so->so_error; 623 if (error == 0) 624 error = EINVAL; 625 return (error); 626 } 627 tp = intotcpcb(inp); 628 /* tp might get 0 when using socket splicing */ 629 if (tp == NULL) 630 return (0); 631 ostate = tp->t_state; 632 633 /* 634 * Detach the TCP protocol from the socket. 635 * If the protocol state is non-embryonic, then can't 636 * do this directly: have to initiate a PRU_DISCONNECT, 637 * which may finish later; embryonic TCB's can just 638 * be discarded here. 639 */ 640 tp = tcp_disconnect(tp); 641 642 if (tp && (so->so_options & SO_DEBUG)) 643 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, PRU_DETACH, 0); 644 return (error); 645 } 646 647 /* 648 * Initiate (or continue) disconnect. 649 * If embryonic state, just send reset (once). 650 * If in ``let data drain'' option and linger null, just drop. 651 * Otherwise (hard), mark socket disconnecting and drop 652 * current input data; switch states based on user close, and 653 * send segment to peer (with FIN). 654 */ 655 struct tcpcb * 656 tcp_disconnect(struct tcpcb *tp) 657 { 658 struct socket *so = tp->t_inpcb->inp_socket; 659 660 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 661 tp = tcp_close(tp); 662 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 663 tp = tcp_drop(tp, 0); 664 else { 665 soisdisconnecting(so); 666 sbflush(so, &so->so_rcv); 667 tp = tcp_usrclosed(tp); 668 if (tp) 669 (void) tcp_output(tp); 670 } 671 return (tp); 672 } 673 674 /* 675 * User issued close, and wish to trail through shutdown states: 676 * if never received SYN, just forget it. If got a SYN from peer, 677 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 678 * If already got a FIN from peer, then almost done; go to LAST_ACK 679 * state. In all other cases, have already sent FIN to peer (e.g. 680 * after PRU_SHUTDOWN), and just have to play tedious game waiting 681 * for peer to send FIN or not respond to keep-alives, etc. 682 * We can let the user exit from the close as soon as the FIN is acked. 683 */ 684 struct tcpcb * 685 tcp_usrclosed(struct tcpcb *tp) 686 { 687 688 switch (tp->t_state) { 689 690 case TCPS_CLOSED: 691 case TCPS_LISTEN: 692 case TCPS_SYN_SENT: 693 tp->t_state = TCPS_CLOSED; 694 tp = tcp_close(tp); 695 break; 696 697 case TCPS_SYN_RECEIVED: 698 case TCPS_ESTABLISHED: 699 tp->t_state = TCPS_FIN_WAIT_1; 700 break; 701 702 case TCPS_CLOSE_WAIT: 703 tp->t_state = TCPS_LAST_ACK; 704 break; 705 } 706 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 707 soisdisconnected(tp->t_inpcb->inp_socket); 708 /* 709 * If we are in FIN_WAIT_2, we arrived here because the 710 * application did a shutdown of the send side. Like the 711 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 712 * a full close, we start a timer to make sure sockets are 713 * not left in FIN_WAIT_2 forever. 714 */ 715 if (tp->t_state == TCPS_FIN_WAIT_2) 716 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 717 } 718 return (tp); 719 } 720 721 /* 722 * Look up a socket for ident or tcpdrop, ... 723 */ 724 int 725 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 726 { 727 int error = 0; 728 struct tcp_ident_mapping tir; 729 struct inpcb *inp; 730 struct tcpcb *tp = NULL; 731 struct sockaddr_in *fin, *lin; 732 #ifdef INET6 733 struct sockaddr_in6 *fin6, *lin6; 734 struct in6_addr f6, l6; 735 #endif 736 737 NET_ASSERT_LOCKED(); 738 739 if (dodrop) { 740 if (oldp != NULL || *oldlenp != 0) 741 return (EINVAL); 742 if (newp == NULL) 743 return (EPERM); 744 if (newlen < sizeof(tir)) 745 return (ENOMEM); 746 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 747 return (error); 748 } else { 749 if (oldp == NULL) 750 return (EINVAL); 751 if (*oldlenp < sizeof(tir)) 752 return (ENOMEM); 753 if (newp != NULL || newlen != 0) 754 return (EINVAL); 755 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 756 return (error); 757 } 758 switch (tir.faddr.ss_family) { 759 #ifdef INET6 760 case AF_INET6: 761 fin6 = (struct sockaddr_in6 *)&tir.faddr; 762 error = in6_embedscope(&f6, fin6, NULL); 763 if (error) 764 return EINVAL; /*?*/ 765 lin6 = (struct sockaddr_in6 *)&tir.laddr; 766 error = in6_embedscope(&l6, lin6, NULL); 767 if (error) 768 return EINVAL; /*?*/ 769 break; 770 #endif 771 case AF_INET: 772 fin = (struct sockaddr_in *)&tir.faddr; 773 lin = (struct sockaddr_in *)&tir.laddr; 774 break; 775 default: 776 return (EINVAL); 777 } 778 779 switch (tir.faddr.ss_family) { 780 #ifdef INET6 781 case AF_INET6: 782 inp = in6_pcbhashlookup(&tcbtable, &f6, 783 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 784 break; 785 #endif 786 case AF_INET: 787 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 788 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 789 break; 790 default: 791 unhandled_af(tir.faddr.ss_family); 792 } 793 794 if (dodrop) { 795 if (inp && (tp = intotcpcb(inp)) && 796 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 797 tp = tcp_drop(tp, ECONNABORTED); 798 else 799 error = ESRCH; 800 return (error); 801 } 802 803 if (inp == NULL) { 804 tcpstat_inc(tcps_pcbhashmiss); 805 switch (tir.faddr.ss_family) { 806 #ifdef INET6 807 case AF_INET6: 808 inp = in6_pcblookup_listen(&tcbtable, 809 &l6, lin6->sin6_port, NULL, tir.rdomain); 810 break; 811 #endif 812 case AF_INET: 813 inp = in_pcblookup_listen(&tcbtable, 814 lin->sin_addr, lin->sin_port, NULL, tir.rdomain); 815 break; 816 } 817 } 818 819 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 820 tir.ruid = inp->inp_socket->so_ruid; 821 tir.euid = inp->inp_socket->so_euid; 822 } else { 823 tir.ruid = -1; 824 tir.euid = -1; 825 } 826 827 *oldlenp = sizeof (tir); 828 error = copyout((void *)&tir, oldp, sizeof (tir)); 829 return (error); 830 } 831 832 int 833 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 834 { 835 uint64_t counters[tcps_ncounters]; 836 struct tcpstat tcpstat; 837 struct syn_cache_set *set; 838 int i = 0; 839 840 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 841 842 memset(&tcpstat, 0, sizeof tcpstat); 843 counters_read(tcpcounters, counters, nitems(counters)); 844 ASSIGN(tcps_connattempt); 845 ASSIGN(tcps_accepts); 846 ASSIGN(tcps_connects); 847 ASSIGN(tcps_drops); 848 ASSIGN(tcps_conndrops); 849 ASSIGN(tcps_closed); 850 ASSIGN(tcps_segstimed); 851 ASSIGN(tcps_rttupdated); 852 ASSIGN(tcps_delack); 853 ASSIGN(tcps_timeoutdrop); 854 ASSIGN(tcps_rexmttimeo); 855 ASSIGN(tcps_persisttimeo); 856 ASSIGN(tcps_persistdrop); 857 ASSIGN(tcps_keeptimeo); 858 ASSIGN(tcps_keepprobe); 859 ASSIGN(tcps_keepdrops); 860 ASSIGN(tcps_sndtotal); 861 ASSIGN(tcps_sndpack); 862 ASSIGN(tcps_sndbyte); 863 ASSIGN(tcps_sndrexmitpack); 864 ASSIGN(tcps_sndrexmitbyte); 865 ASSIGN(tcps_sndrexmitfast); 866 ASSIGN(tcps_sndacks); 867 ASSIGN(tcps_sndprobe); 868 ASSIGN(tcps_sndurg); 869 ASSIGN(tcps_sndwinup); 870 ASSIGN(tcps_sndctrl); 871 ASSIGN(tcps_rcvtotal); 872 ASSIGN(tcps_rcvpack); 873 ASSIGN(tcps_rcvbyte); 874 ASSIGN(tcps_rcvbadsum); 875 ASSIGN(tcps_rcvbadoff); 876 ASSIGN(tcps_rcvmemdrop); 877 ASSIGN(tcps_rcvnosec); 878 ASSIGN(tcps_rcvshort); 879 ASSIGN(tcps_rcvduppack); 880 ASSIGN(tcps_rcvdupbyte); 881 ASSIGN(tcps_rcvpartduppack); 882 ASSIGN(tcps_rcvpartdupbyte); 883 ASSIGN(tcps_rcvoopack); 884 ASSIGN(tcps_rcvoobyte); 885 ASSIGN(tcps_rcvpackafterwin); 886 ASSIGN(tcps_rcvbyteafterwin); 887 ASSIGN(tcps_rcvafterclose); 888 ASSIGN(tcps_rcvwinprobe); 889 ASSIGN(tcps_rcvdupack); 890 ASSIGN(tcps_rcvacktoomuch); 891 ASSIGN(tcps_rcvacktooold); 892 ASSIGN(tcps_rcvackpack); 893 ASSIGN(tcps_rcvackbyte); 894 ASSIGN(tcps_rcvwinupd); 895 ASSIGN(tcps_pawsdrop); 896 ASSIGN(tcps_predack); 897 ASSIGN(tcps_preddat); 898 ASSIGN(tcps_pcbhashmiss); 899 ASSIGN(tcps_noport); 900 ASSIGN(tcps_badsyn); 901 ASSIGN(tcps_dropsyn); 902 ASSIGN(tcps_rcvbadsig); 903 ASSIGN(tcps_rcvgoodsig); 904 ASSIGN(tcps_inswcsum); 905 ASSIGN(tcps_outswcsum); 906 ASSIGN(tcps_ecn_accepts); 907 ASSIGN(tcps_ecn_rcvece); 908 ASSIGN(tcps_ecn_rcvcwr); 909 ASSIGN(tcps_ecn_rcvce); 910 ASSIGN(tcps_ecn_sndect); 911 ASSIGN(tcps_ecn_sndece); 912 ASSIGN(tcps_ecn_sndcwr); 913 ASSIGN(tcps_cwr_ecn); 914 ASSIGN(tcps_cwr_frecovery); 915 ASSIGN(tcps_cwr_timeout); 916 ASSIGN(tcps_sc_added); 917 ASSIGN(tcps_sc_completed); 918 ASSIGN(tcps_sc_timed_out); 919 ASSIGN(tcps_sc_overflowed); 920 ASSIGN(tcps_sc_reset); 921 ASSIGN(tcps_sc_unreach); 922 ASSIGN(tcps_sc_bucketoverflow); 923 ASSIGN(tcps_sc_aborted); 924 ASSIGN(tcps_sc_dupesyn); 925 ASSIGN(tcps_sc_dropped); 926 ASSIGN(tcps_sc_collisions); 927 ASSIGN(tcps_sc_retransmitted); 928 ASSIGN(tcps_sc_seedrandom); 929 ASSIGN(tcps_sc_hash_size); 930 ASSIGN(tcps_sc_entry_count); 931 ASSIGN(tcps_sc_entry_limit); 932 ASSIGN(tcps_sc_bucket_maxlen); 933 ASSIGN(tcps_sc_bucket_limit); 934 ASSIGN(tcps_sc_uses_left); 935 ASSIGN(tcps_conndrained); 936 ASSIGN(tcps_sack_recovery_episode); 937 ASSIGN(tcps_sack_rexmits); 938 ASSIGN(tcps_sack_rexmit_bytes); 939 ASSIGN(tcps_sack_rcv_opts); 940 ASSIGN(tcps_sack_snd_opts); 941 942 #undef ASSIGN 943 944 set = &tcp_syn_cache[tcp_syn_cache_active]; 945 tcpstat.tcps_sc_hash_size = set->scs_size; 946 tcpstat.tcps_sc_entry_count = set->scs_count; 947 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 948 tcpstat.tcps_sc_bucket_maxlen = 0; 949 for (i = 0; i < set->scs_size; i++) { 950 if (tcpstat.tcps_sc_bucket_maxlen < 951 set->scs_buckethead[i].sch_length) 952 tcpstat.tcps_sc_bucket_maxlen = 953 set->scs_buckethead[i].sch_length; 954 } 955 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 956 tcpstat.tcps_sc_uses_left = set->scs_use; 957 958 return (sysctl_rdstruct(oldp, oldlenp, newp, 959 &tcpstat, sizeof(tcpstat))); 960 } 961 962 /* 963 * Sysctl for tcp variables. 964 */ 965 int 966 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 967 size_t newlen) 968 { 969 int error, nval; 970 971 /* All sysctl names at this level are terminal. */ 972 if (namelen != 1) 973 return (ENOTDIR); 974 975 switch (name[0]) { 976 case TCPCTL_SACK: 977 NET_LOCK(); 978 error = sysctl_int(oldp, oldlenp, newp, newlen, 979 &tcp_do_sack); 980 NET_UNLOCK(); 981 return (error); 982 983 case TCPCTL_SLOWHZ: 984 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 985 986 case TCPCTL_BADDYNAMIC: 987 NET_LOCK(); 988 error = sysctl_struct(oldp, oldlenp, newp, newlen, 989 baddynamicports.tcp, sizeof(baddynamicports.tcp)); 990 NET_UNLOCK(); 991 return (error); 992 993 case TCPCTL_ROOTONLY: 994 if (newp && securelevel > 0) 995 return (EPERM); 996 NET_LOCK(); 997 error = sysctl_struct(oldp, oldlenp, newp, newlen, 998 rootonlyports.tcp, sizeof(rootonlyports.tcp)); 999 NET_UNLOCK(); 1000 return (error); 1001 1002 case TCPCTL_IDENT: 1003 NET_LOCK(); 1004 error = tcp_ident(oldp, oldlenp, newp, newlen, 0); 1005 NET_UNLOCK(); 1006 return (error); 1007 1008 case TCPCTL_DROP: 1009 NET_LOCK(); 1010 error = tcp_ident(oldp, oldlenp, newp, newlen, 1); 1011 NET_UNLOCK(); 1012 return (error); 1013 1014 case TCPCTL_ALWAYS_KEEPALIVE: 1015 NET_LOCK(); 1016 error = sysctl_int(oldp, oldlenp, newp, newlen, 1017 &tcp_always_keepalive); 1018 NET_UNLOCK(); 1019 return (error); 1020 1021 #ifdef TCP_ECN 1022 case TCPCTL_ECN: 1023 NET_LOCK(); 1024 error = sysctl_int(oldp, oldlenp, newp, newlen, 1025 &tcp_do_ecn); 1026 NET_UNLOCK(); 1027 return (error); 1028 #endif 1029 case TCPCTL_REASS_LIMIT: 1030 NET_LOCK(); 1031 nval = tcp_reass_limit; 1032 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1033 if (!error && nval != tcp_reass_limit) { 1034 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1035 if (!error) 1036 tcp_reass_limit = nval; 1037 } 1038 NET_UNLOCK(); 1039 return (error); 1040 1041 case TCPCTL_SACKHOLE_LIMIT: 1042 NET_LOCK(); 1043 nval = tcp_sackhole_limit; 1044 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1045 if (!error && nval != tcp_sackhole_limit) { 1046 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1047 if (!error) 1048 tcp_sackhole_limit = nval; 1049 } 1050 NET_UNLOCK(); 1051 return (error); 1052 1053 case TCPCTL_STATS: 1054 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1055 1056 case TCPCTL_SYN_USE_LIMIT: 1057 NET_LOCK(); 1058 error = sysctl_int(oldp, oldlenp, newp, newlen, 1059 &tcp_syn_use_limit); 1060 if (!error && newp != NULL) { 1061 /* 1062 * Global tcp_syn_use_limit is used when reseeding a 1063 * new cache. Also update the value in active cache. 1064 */ 1065 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1066 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1067 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1068 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1069 } 1070 NET_UNLOCK(); 1071 return (error); 1072 1073 case TCPCTL_SYN_HASH_SIZE: 1074 NET_LOCK(); 1075 nval = tcp_syn_hash_size; 1076 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1077 if (!error && nval != tcp_syn_hash_size) { 1078 if (nval < 1 || nval > 100000) { 1079 error = EINVAL; 1080 } else { 1081 /* 1082 * If global hash size has been changed, 1083 * switch sets as soon as possible. Then 1084 * the actual hash array will be reallocated. 1085 */ 1086 if (tcp_syn_cache[0].scs_size != nval) 1087 tcp_syn_cache[0].scs_use = 0; 1088 if (tcp_syn_cache[1].scs_size != nval) 1089 tcp_syn_cache[1].scs_use = 0; 1090 tcp_syn_hash_size = nval; 1091 } 1092 } 1093 NET_UNLOCK(); 1094 return (error); 1095 1096 default: 1097 if (name[0] < TCPCTL_MAXID) { 1098 NET_LOCK(); 1099 error = sysctl_int_arr(tcpctl_vars, name, namelen, 1100 oldp, oldlenp, newp, newlen); 1101 NET_UNLOCK(); 1102 return (error); 1103 } 1104 return (ENOPROTOOPT); 1105 } 1106 /* NOTREACHED */ 1107 } 1108 1109 /* 1110 * Scale the send buffer so that inflight data is not accounted against 1111 * the limit. The buffer will scale with the congestion window, if the 1112 * the receiver stops acking data the window will shrink and therefor 1113 * the buffer size will shrink as well. 1114 * In low memory situation try to shrink the buffer to the initial size 1115 * disabling the send buffer scaling as long as the situation persists. 1116 */ 1117 void 1118 tcp_update_sndspace(struct tcpcb *tp) 1119 { 1120 struct socket *so = tp->t_inpcb->inp_socket; 1121 u_long nmax = so->so_snd.sb_hiwat; 1122 1123 if (sbchecklowmem()) { 1124 /* low on memory try to get rid of some */ 1125 if (tcp_sendspace < nmax) 1126 nmax = tcp_sendspace; 1127 } else if (so->so_snd.sb_wat != tcp_sendspace) 1128 /* user requested buffer size, auto-scaling disabled */ 1129 nmax = so->so_snd.sb_wat; 1130 else 1131 /* automatic buffer scaling */ 1132 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1133 tp->snd_una); 1134 1135 /* a writable socket must be preserved because of poll(2) semantics */ 1136 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1137 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1138 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1139 if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1140 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2; 1141 } 1142 1143 /* round to MSS boundary */ 1144 nmax = roundup(nmax, tp->t_maxseg); 1145 1146 if (nmax != so->so_snd.sb_hiwat) 1147 sbreserve(so, &so->so_snd, nmax); 1148 } 1149 1150 /* 1151 * Scale the recv buffer by looking at how much data was transferred in 1152 * on approximated RTT. If more than a big part of the recv buffer was 1153 * transferred during that time we increase the buffer by a constant. 1154 * In low memory situation try to shrink the buffer to the initial size. 1155 */ 1156 void 1157 tcp_update_rcvspace(struct tcpcb *tp) 1158 { 1159 struct socket *so = tp->t_inpcb->inp_socket; 1160 u_long nmax = so->so_rcv.sb_hiwat; 1161 1162 if (sbchecklowmem()) { 1163 /* low on memory try to get rid of some */ 1164 if (tcp_recvspace < nmax) 1165 nmax = tcp_recvspace; 1166 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1167 /* user requested buffer size, auto-scaling disabled */ 1168 nmax = so->so_rcv.sb_wat; 1169 else { 1170 /* automatic buffer scaling */ 1171 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1172 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1173 tcp_autorcvbuf_inc); 1174 } 1175 1176 /* a readable socket must be preserved because of poll(2) semantics */ 1177 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1178 nmax < so->so_snd.sb_lowat) 1179 nmax = so->so_snd.sb_lowat; 1180 1181 if (nmax == so->so_rcv.sb_hiwat) 1182 return; 1183 1184 /* round to MSS boundary */ 1185 nmax = roundup(nmax, tp->t_maxseg); 1186 sbreserve(so, &so->so_rcv, nmax); 1187 } 1188