1 /* $OpenBSD: tcp_usrreq.c,v 1.168 2018/04/24 15:40:55 pirofti Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

/* Default socket buffer reservations; overridable at build time. */
#ifndef	TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef	TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
/* Step by which tcp_update_rcvspace() grows the receive buffer. */
u_int	tcp_autorcvbuf_inc = 16 * 1024;

/* Table of plain integer sysctl variables, indexed by TCPCTL_* name. */
int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;

/* Global table of all TCP internet PCBs. */
struct	inpcbtable tcbtable;

int tcp_ident(void *, size_t *, void *, size_t, int);

/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 */
/*ARGSUSED*/
int
tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int error = 0;
	short ostate;

	/* ioctl(2) path; dispatched before the socket-lock assertion */
	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}

	soassertlocked(so);

	/* TCP has no use for ancillary data on any request; reject it */
	if (control && control->m_len) {
		m_freem(control);
		m_freem(m);
		return (EINVAL);
	}

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		/*
		 * The following corrects an mbuf leak under rare
		 * circumstances
		 */
		if (req == PRU_SEND || req == PRU_SENDOOB)
			m_freem(m);
		return (error);
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		return (0);
	/* remember the state on entry for the SO_DEBUG trace below */
	ostate = tp->t_state;

	switch (req) {

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6) {
			struct sockaddr_in6 *sin6;

			if ((error = in6_nam2sin6(nam, &sin6)))
				break;
			/* refuse unspecified and multicast destinations */
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
				error = EINVAL;
				break;
			}
			error = in6_pcbconnect(inp, nam);
		} else
#endif /* INET6 */
		{
			struct sockaddr_in *sin;

			if ((error = in_nam2sin(nam, &sin)))
				break;
			/* refuse wildcard, broadcast and multicast peers */
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}
			error = in_pcbconnect(inp, nam);
		}
		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			/* undo the connect; no header template available */
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/* Compute window scaling to request.  */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat_inc(tcps_connattempt);
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
		tp->snd_last = tp->snd_una;
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket.  We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(so, &so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	/* fstat(2): report the send buffer size as the block size */
	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		return (0);

	case PRU_RCVOOB:
		/* no OOB byte pending, or it was folded inline/consumed */
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		/* historic overload: nam carries the MSG_* flags here */
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		/* allow OOB data to slightly overcommit the send buffer */
		if (sbspace(so, &so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream(so, &so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		/* force out a segment even if the window is closed */
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
	return (error);
}

/*
 * Handle [gs]etsockopt(2) for the IPPROTO_TCP level; other levels are
 * passed down to ip_ctloutput()/ip6_ctloutput().
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				/* clearing NOPUSH may release queued data */
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* the MSS may only be lowered, never raised */
			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* may only be changed before the connection exists */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* SACK and TCP MD5 signatures are mutually exclusive */
			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* may only be changed before the connection exists */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				/* MD5 signatures disable SACK */
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	/* reserve default buffer space unless the user already set sizes */
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	/* SO_LINGER with zero time gets the default linger interval */
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, (caddr_t)0, PRU_ATTACH, 0);
	return (0);
}

/*
 * Detach the TCP protocol from a socket, tearing the connection down
 * (or starting an orderly shutdown) via tcp_disconnect().
 */
int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int error = 0;
	short ostate;

	soassertlocked(so);

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		return (error);
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		return (0);
	ostate = tp->t_state;

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_disconnect(tp);

	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, PRU_DETACH, 0);
	return (error);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_disconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 * dodrop == 0: sysctl read path, copies ruid/euid of the matching
 * connection out to userland.  dodrop != 0: sysctl write path, aborts
 * the matching (non-listening) connection with ECONNABORTED.
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/* validate the sysctl old/new buffers for the chosen direction */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	/* exact 4-tuple lookup in the connection hash */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		/* listening sockets may not be dropped this way */
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		return (error);
	}

	/* for ident, fall back to a wildcard listen-socket lookup */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	/* only report uids for connections we initiated ourselves */
	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}

/*
 * Snapshot the per-CPU TCP counters into a struct tcpstat and copy it
 * out for the net.inet.tcp.stats sysctl.  The ASSIGN() order must match
 * the tcps_* counter enumeration.
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);

#undef ASSIGN

	/* the syn cache gauges are read live, overriding the counters */
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_SACK:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_sack);
		NET_UNLOCK();
		return (error);

	case TCPCTL_SLOWHZ:
		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_ALWAYS_KEEPALIVE:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_always_keepalive);
		NET_UNLOCK();
		return (error);

#ifdef TCP_ECN
	case TCPCTL_ECN:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		   &tcp_do_ecn);
		NET_UNLOCK();
		return (error);
#endif
	case TCPCTL_REASS_LIMIT:
		/* update the pool hard limit before committing the value */
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_syn_hash_size) {
			if (nval < 1 || nval > 100000) {
				error = EINVAL;
			} else {
				/*
				 * If global hash size has been changed,
				 * switch sets as soon as possible.  Then
				 * the actual hash array will be reallocated.
				 */
				if (tcp_syn_cache[0].scs_size != nval)
					tcp_syn_cache[0].scs_use = 0;
				if (tcp_syn_cache[1].scs_size != nval)
					tcp_syn_cache[1].scs_use = 0;
				tcp_syn_hash_size = nval;
			}
		}
		NET_UNLOCK();
		return (error);

	default:
		/* remaining integer variables are handled via the table */
		if (name[0] < TCPCTL_MAXID) {
			NET_LOCK();
			error = sysctl_int_arr(tcpctl_vars, name, namelen,
			    oldp, oldlenp, newp, newlen);
			NET_UNLOCK();
			return (error);
		}
		return (ENOPROTOOPT);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window, if the
 * receiver stops acking data the window will shrink and therefore
 * the buffer size will shrink as well.
1115 * In low memory situation try to shrink the buffer to the initial size 1116 * disabling the send buffer scaling as long as the situation persists. 1117 */ 1118 void 1119 tcp_update_sndspace(struct tcpcb *tp) 1120 { 1121 struct socket *so = tp->t_inpcb->inp_socket; 1122 u_long nmax = so->so_snd.sb_hiwat; 1123 1124 if (sbchecklowmem()) { 1125 /* low on memory try to get rid of some */ 1126 if (tcp_sendspace < nmax) 1127 nmax = tcp_sendspace; 1128 } else if (so->so_snd.sb_wat != tcp_sendspace) 1129 /* user requested buffer size, auto-scaling disabled */ 1130 nmax = so->so_snd.sb_wat; 1131 else 1132 /* automatic buffer scaling */ 1133 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1134 tp->snd_una); 1135 1136 /* a writable socket must be preserved because of poll(2) semantics */ 1137 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1138 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1139 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1140 if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1141 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2; 1142 } 1143 1144 /* round to MSS boundary */ 1145 nmax = roundup(nmax, tp->t_maxseg); 1146 1147 if (nmax != so->so_snd.sb_hiwat) 1148 sbreserve(so, &so->so_snd, nmax); 1149 } 1150 1151 /* 1152 * Scale the recv buffer by looking at how much data was transferred in 1153 * on approximated RTT. If more than a big part of the recv buffer was 1154 * transferred during that time we increase the buffer by a constant. 1155 * In low memory situation try to shrink the buffer to the initial size. 
1156 */ 1157 void 1158 tcp_update_rcvspace(struct tcpcb *tp) 1159 { 1160 struct socket *so = tp->t_inpcb->inp_socket; 1161 u_long nmax = so->so_rcv.sb_hiwat; 1162 1163 if (sbchecklowmem()) { 1164 /* low on memory try to get rid of some */ 1165 if (tcp_recvspace < nmax) 1166 nmax = tcp_recvspace; 1167 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1168 /* user requested buffer size, auto-scaling disabled */ 1169 nmax = so->so_rcv.sb_wat; 1170 else { 1171 /* automatic buffer scaling */ 1172 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1173 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1174 tcp_autorcvbuf_inc); 1175 } 1176 1177 /* a readable socket must be preserved because of poll(2) semantics */ 1178 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1179 nmax < so->so_snd.sb_lowat) 1180 nmax = so->so_snd.sb_lowat; 1181 1182 if (nmax == so->so_rcv.sb_hiwat) 1183 return; 1184 1185 /* round to MSS boundary */ 1186 nmax = roundup(nmax, tp->t_maxseg); 1187 sbreserve(so, &so->so_rcv, nmax); 1188 } 1189