1 /* $OpenBSD: tcp_usrreq.c,v 1.155 2017/09/05 07:59:11 mpi Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcp_debug.h> 98 99 #ifdef INET6 100 #include <netinet6/in6_var.h> 101 #endif 102 103 #ifndef TCP_SENDSPACE 104 #define TCP_SENDSPACE 1024*16 105 #endif 106 u_int tcp_sendspace = TCP_SENDSPACE; 107 #ifndef TCP_RECVSPACE 108 #define TCP_RECVSPACE 1024*16 109 #endif 110 u_int tcp_recvspace = TCP_RECVSPACE; 111 u_int tcp_autorcvbuf_inc = 16 * 1024; 112 113 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 114 115 struct inpcbtable tcbtable; 116 117 int tcp_ident(void *, size_t *, void *, size_t, int); 118 119 /* 120 * Process a TCP user request for TCP tb. If this is a send request 121 * then m is the mbuf chain of send data. If this is a timer expiration 122 * (called from the software clock routine), then timertype tells which timer. 123 */ 124 /*ARGSUSED*/ 125 int 126 tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 127 struct mbuf *control, struct proc *p) 128 { 129 struct inpcb *inp; 130 struct tcpcb *tp = NULL; 131 int error = 0; 132 short ostate; 133 134 soassertlocked(so); 135 136 if (req == PRU_CONTROL) { 137 #ifdef INET6 138 if (sotopf(so) == PF_INET6) 139 return in6_control(so, (u_long)m, (caddr_t)nam, 140 (struct ifnet *)control); 141 else 142 #endif /* INET6 */ 143 return (in_control(so, (u_long)m, (caddr_t)nam, 144 (struct ifnet *)control)); 145 } 146 if (control && control->m_len) { 147 m_freem(control); 148 m_freem(m); 149 return (EINVAL); 150 } 151 152 inp = sotoinpcb(so); 153 /* 154 * When a TCP is attached to a socket, then there will be 155 * a (struct inpcb) pointed at by the socket, and this 156 * structure will point at a subsidiary (struct tcpcb). 157 */ 158 if (inp == NULL) { 159 error = so->so_error; 160 if (error == 0) 161 error = EINVAL; 162 /* 163 * The following corrects an mbuf leak under rare 164 * circumstances 165 */ 166 if (req == PRU_SEND || req == PRU_SENDOOB) 167 m_freem(m); 168 return (error); 169 } 170 if (inp) { 171 tp = intotcpcb(inp); 172 /* tp might get 0 when using socket splicing */ 173 if (tp == NULL) { 174 return (0); 175 } 176 #ifdef KPROF 177 tcp_acounts[tp->t_state][req]++; 178 #endif 179 ostate = tp->t_state; 180 } else 181 ostate = 0; 182 switch (req) { 183 184 /* 185 * PRU_DETACH detaches the TCP protocol from the socket. 186 * If the protocol state is non-embryonic, then can't 187 * do this directly: have to initiate a PRU_DISCONNECT, 188 * which may finish later; embryonic TCB's can just 189 * be discarded here. 190 */ 191 case PRU_DETACH: 192 tp = tcp_disconnect(tp); 193 break; 194 195 /* 196 * Give the socket an address. 197 */ 198 case PRU_BIND: 199 error = in_pcbbind(inp, nam, p); 200 break; 201 202 /* 203 * Prepare to accept connections. 204 */ 205 case PRU_LISTEN: 206 if (inp->inp_lport == 0) 207 error = in_pcbbind(inp, NULL, p); 208 /* If the in_pcbbind() above is called, the tp->pf 209 should still be whatever it was before. */ 210 if (error == 0) 211 tp->t_state = TCPS_LISTEN; 212 break; 213 214 /* 215 * Initiate connection to peer. 216 * Create a template for use in transmissions on this connection. 217 * Enter SYN_SENT state, and mark socket as connecting. 218 * Start keep-alive timer, and seed output sequence space. 219 * Send initial segment on connection. 220 */ 221 case PRU_CONNECT: 222 #ifdef INET6 223 if (inp->inp_flags & INP_IPV6) { 224 struct sockaddr_in6 *sin6; 225 226 if ((error = in6_nam2sin6(nam, &sin6))) 227 break; 228 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 229 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 230 error = EINVAL; 231 break; 232 } 233 error = in6_pcbconnect(inp, nam); 234 } else 235 #endif /* INET6 */ 236 { 237 struct sockaddr_in *sin; 238 239 if ((error = in_nam2sin(nam, &sin))) 240 break; 241 if ((sin->sin_addr.s_addr == INADDR_ANY) || 242 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 243 IN_MULTICAST(sin->sin_addr.s_addr) || 244 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 245 error = EINVAL; 246 break; 247 } 248 error = in_pcbconnect(inp, nam); 249 } 250 if (error) 251 break; 252 253 tp->t_template = tcp_template(tp); 254 if (tp->t_template == 0) { 255 in_pcbdisconnect(inp); 256 error = ENOBUFS; 257 break; 258 } 259 260 so->so_state |= SS_CONNECTOUT; 261 262 /* Compute window scaling to request. */ 263 tcp_rscale(tp, sb_max); 264 265 soisconnecting(so); 266 tcpstat_inc(tcps_connattempt); 267 tp->t_state = TCPS_SYN_SENT; 268 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 269 tcp_set_iss_tsm(tp); 270 tcp_sendseqinit(tp); 271 #if defined(TCP_SACK) 272 tp->snd_last = tp->snd_una; 273 #endif 274 #if defined(TCP_SACK) && defined(TCP_FACK) 275 tp->snd_fack = tp->snd_una; 276 tp->retran_data = 0; 277 tp->snd_awnd = 0; 278 #endif 279 error = tcp_output(tp); 280 break; 281 282 /* 283 * Create a TCP connection between two sockets. 284 */ 285 case PRU_CONNECT2: 286 error = EOPNOTSUPP; 287 break; 288 289 /* 290 * Initiate disconnect from peer. 291 * If connection never passed embryonic stage, just drop; 292 * else if don't need to let data drain, then can just drop anyways, 293 * else have to begin TCP shutdown process: mark socket disconnecting, 294 * drain unread data, state switch to reflect user close, and 295 * send segment (e.g. FIN) to peer. Socket will be really disconnected 296 * when peer sends FIN and acks ours. 297 * 298 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 299 */ 300 case PRU_DISCONNECT: 301 tp = tcp_disconnect(tp); 302 break; 303 304 /* 305 * Accept a connection. Essentially all the work is 306 * done at higher levels; just return the address 307 * of the peer, storing through addr. 308 */ 309 case PRU_ACCEPT: 310 #ifdef INET6 311 if (inp->inp_flags & INP_IPV6) 312 in6_setpeeraddr(inp, nam); 313 else 314 #endif 315 in_setpeeraddr(inp, nam); 316 break; 317 318 /* 319 * Mark the connection as being incapable of further output. 320 */ 321 case PRU_SHUTDOWN: 322 if (so->so_state & SS_CANTSENDMORE) 323 break; 324 socantsendmore(so); 325 tp = tcp_usrclosed(tp); 326 if (tp) 327 error = tcp_output(tp); 328 break; 329 330 /* 331 * After a receive, possibly send window update to peer. 332 */ 333 case PRU_RCVD: 334 /* 335 * soreceive() calls this function when a user receives 336 * ancillary data on a listening socket. We don't call 337 * tcp_output in such a case, since there is no header 338 * template for a listening socket and hence the kernel 339 * will panic. 340 */ 341 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 342 (void) tcp_output(tp); 343 break; 344 345 /* 346 * Do a send by putting data in output queue and updating urgent 347 * marker if URG set. Possibly send more data. 348 */ 349 case PRU_SEND: 350 sbappendstream(so, &so->so_snd, m); 351 error = tcp_output(tp); 352 break; 353 354 /* 355 * Abort the TCP. 356 */ 357 case PRU_ABORT: 358 tp = tcp_drop(tp, ECONNABORTED); 359 break; 360 361 case PRU_SENSE: 362 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 363 return (0); 364 365 case PRU_RCVOOB: 366 if ((so->so_oobmark == 0 && 367 (so->so_state & SS_RCVATMARK) == 0) || 368 so->so_options & SO_OOBINLINE || 369 tp->t_oobflags & TCPOOB_HADDATA) { 370 error = EINVAL; 371 break; 372 } 373 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 374 error = EWOULDBLOCK; 375 break; 376 } 377 m->m_len = 1; 378 *mtod(m, caddr_t) = tp->t_iobc; 379 if (((long)nam & MSG_PEEK) == 0) 380 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 381 break; 382 383 case PRU_SENDOOB: 384 if (sbspace(so, &so->so_snd) < -512) { 385 m_freem(m); 386 error = ENOBUFS; 387 break; 388 } 389 /* 390 * According to RFC961 (Assigned Protocols), 391 * the urgent pointer points to the last octet 392 * of urgent data. We continue, however, 393 * to consider it to indicate the first octet 394 * of data past the urgent section. 395 * Otherwise, snd_up should be one lower. 396 */ 397 sbappendstream(so, &so->so_snd, m); 398 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 399 tp->t_force = 1; 400 error = tcp_output(tp); 401 tp->t_force = 0; 402 break; 403 404 case PRU_SOCKADDR: 405 #ifdef INET6 406 if (inp->inp_flags & INP_IPV6) 407 in6_setsockaddr(inp, nam); 408 else 409 #endif 410 in_setsockaddr(inp, nam); 411 break; 412 413 case PRU_PEERADDR: 414 #ifdef INET6 415 if (inp->inp_flags & INP_IPV6) 416 in6_setpeeraddr(inp, nam); 417 else 418 #endif 419 in_setpeeraddr(inp, nam); 420 break; 421 422 default: 423 panic("tcp_usrreq"); 424 } 425 if (tp && (so->so_options & SO_DEBUG)) 426 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0); 427 return (error); 428 } 429 430 int 431 tcp_ctloutput(int op, struct socket *so, int level, int optname, 432 struct mbuf *m) 433 { 434 int error = 0; 435 struct inpcb *inp; 436 struct tcpcb *tp; 437 int i; 438 439 inp = sotoinpcb(so); 440 if (inp == NULL) 441 return (ECONNRESET); 442 if (level != IPPROTO_TCP) { 443 switch (so->so_proto->pr_domain->dom_family) { 444 #ifdef INET6 445 case PF_INET6: 446 error = ip6_ctloutput(op, so, level, optname, m); 447 break; 448 #endif /* INET6 */ 449 case PF_INET: 450 error = ip_ctloutput(op, so, level, optname, m); 451 break; 452 default: 453 error = EAFNOSUPPORT; /*?*/ 454 break; 455 } 456 return (error); 457 } 458 tp = intotcpcb(inp); 459 460 switch (op) { 461 462 case PRCO_SETOPT: 463 switch (optname) { 464 465 case TCP_NODELAY: 466 if (m == NULL || m->m_len < sizeof (int)) 467 error = EINVAL; 468 else if (*mtod(m, int *)) 469 tp->t_flags |= TF_NODELAY; 470 else 471 tp->t_flags &= ~TF_NODELAY; 472 break; 473 474 case TCP_NOPUSH: 475 if (m == NULL || m->m_len < sizeof (int)) 476 error = EINVAL; 477 else if (*mtod(m, int *)) 478 tp->t_flags |= TF_NOPUSH; 479 else if (tp->t_flags & TF_NOPUSH) { 480 tp->t_flags &= ~TF_NOPUSH; 481 if (TCPS_HAVEESTABLISHED(tp->t_state)) 482 error = tcp_output(tp); 483 } 484 break; 485 486 case TCP_MAXSEG: 487 if (m == NULL || m->m_len < sizeof (int)) { 488 error = EINVAL; 489 break; 490 } 491 492 i = *mtod(m, int *); 493 if (i > 0 && i <= tp->t_maxseg) 494 tp->t_maxseg = i; 495 else 496 error = EINVAL; 497 break; 498 499 #ifdef TCP_SACK 500 case TCP_SACK_ENABLE: 501 if (m == NULL || m->m_len < sizeof (int)) { 502 error = EINVAL; 503 break; 504 } 505 506 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 507 error = EPERM; 508 break; 509 } 510 511 if (tp->t_flags & TF_SIGNATURE) { 512 error = EPERM; 513 break; 514 } 515 516 if (*mtod(m, int *)) 517 tp->sack_enable = 1; 518 else 519 tp->sack_enable = 0; 520 break; 521 #endif 522 #ifdef TCP_SIGNATURE 523 case TCP_MD5SIG: 524 if (m == NULL || m->m_len < sizeof (int)) { 525 error = EINVAL; 526 break; 527 } 528 529 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 530 error = EPERM; 531 break; 532 } 533 534 if (*mtod(m, int *)) { 535 tp->t_flags |= TF_SIGNATURE; 536 #ifdef TCP_SACK 537 tp->sack_enable = 0; 538 #endif /* TCP_SACK */ 539 } else 540 tp->t_flags &= ~TF_SIGNATURE; 541 break; 542 #endif /* TCP_SIGNATURE */ 543 default: 544 error = ENOPROTOOPT; 545 break; 546 } 547 break; 548 549 case PRCO_GETOPT: 550 m->m_len = sizeof(int); 551 552 switch (optname) { 553 case TCP_NODELAY: 554 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 555 break; 556 case TCP_NOPUSH: 557 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 558 break; 559 case TCP_MAXSEG: 560 *mtod(m, int *) = tp->t_maxseg; 561 break; 562 #ifdef TCP_SACK 563 case TCP_SACK_ENABLE: 564 *mtod(m, int *) = tp->sack_enable; 565 break; 566 #endif 567 #ifdef TCP_SIGNATURE 568 case TCP_MD5SIG: 569 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 570 break; 571 #endif 572 default: 573 error = ENOPROTOOPT; 574 break; 575 } 576 break; 577 } 578 return (error); 579 } 580 581 /* 582 * Attach TCP protocol to socket, allocating 583 * internet protocol control block, tcp control block, 584 * bufer space, and entering LISTEN state if to accept connections. 585 */ 586 int 587 tcp_attach(struct socket *so, int proto) 588 { 589 struct tcpcb *tp; 590 struct inpcb *inp; 591 int error; 592 593 if (so->so_pcb) 594 return EISCONN; 595 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 596 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 597 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 598 error = soreserve(so, tcp_sendspace, tcp_recvspace); 599 if (error) 600 return (error); 601 } 602 603 error = in_pcballoc(so, &tcbtable); 604 if (error) 605 return (error); 606 inp = sotoinpcb(so); 607 tp = tcp_newtcpcb(inp); 608 if (tp == NULL) { 609 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 610 611 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 612 in_pcbdetach(inp); 613 so->so_state |= nofd; 614 return (ENOBUFS); 615 } 616 tp->t_state = TCPS_CLOSED; 617 #ifdef INET6 618 /* we disallow IPv4 mapped address completely. */ 619 if (inp->inp_flags & INP_IPV6) 620 tp->pf = PF_INET6; 621 else 622 tp->pf = PF_INET; 623 #else 624 tp->pf = PF_INET; 625 #endif 626 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 627 so->so_linger = TCP_LINGERTIME; 628 629 if (tp && (so->so_options & SO_DEBUG)) 630 tcp_trace(TA_USER, 0, tp, (caddr_t)0, 0 /* XXX */, 0); 631 return (0); 632 } 633 634 /* 635 * Initiate (or continue) disconnect. 636 * If embryonic state, just send reset (once). 637 * If in ``let data drain'' option and linger null, just drop. 638 * Otherwise (hard), mark socket disconnecting and drop 639 * current input data; switch states based on user close, and 640 * send segment to peer (with FIN). 641 */ 642 struct tcpcb * 643 tcp_disconnect(struct tcpcb *tp) 644 { 645 struct socket *so = tp->t_inpcb->inp_socket; 646 647 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 648 tp = tcp_close(tp); 649 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 650 tp = tcp_drop(tp, 0); 651 else { 652 soisdisconnecting(so); 653 sbflush(so, &so->so_rcv); 654 tp = tcp_usrclosed(tp); 655 if (tp) 656 (void) tcp_output(tp); 657 } 658 return (tp); 659 } 660 661 /* 662 * User issued close, and wish to trail through shutdown states: 663 * if never received SYN, just forget it. If got a SYN from peer, 664 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 665 * If already got a FIN from peer, then almost done; go to LAST_ACK 666 * state. In all other cases, have already sent FIN to peer (e.g. 667 * after PRU_SHUTDOWN), and just have to play tedious game waiting 668 * for peer to send FIN or not respond to keep-alives, etc. 669 * We can let the user exit from the close as soon as the FIN is acked. 670 */ 671 struct tcpcb * 672 tcp_usrclosed(struct tcpcb *tp) 673 { 674 675 switch (tp->t_state) { 676 677 case TCPS_CLOSED: 678 case TCPS_LISTEN: 679 case TCPS_SYN_SENT: 680 tp->t_state = TCPS_CLOSED; 681 tp = tcp_close(tp); 682 break; 683 684 case TCPS_SYN_RECEIVED: 685 case TCPS_ESTABLISHED: 686 tp->t_state = TCPS_FIN_WAIT_1; 687 break; 688 689 case TCPS_CLOSE_WAIT: 690 tp->t_state = TCPS_LAST_ACK; 691 break; 692 } 693 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 694 soisdisconnected(tp->t_inpcb->inp_socket); 695 /* 696 * If we are in FIN_WAIT_2, we arrived here because the 697 * application did a shutdown of the send side. Like the 698 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 699 * a full close, we start a timer to make sure sockets are 700 * not left in FIN_WAIT_2 forever. 701 */ 702 if (tp->t_state == TCPS_FIN_WAIT_2) 703 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 704 } 705 return (tp); 706 } 707 708 /* 709 * Look up a socket for ident or tcpdrop, ... 710 */ 711 int 712 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 713 { 714 int error = 0; 715 struct tcp_ident_mapping tir; 716 struct inpcb *inp; 717 struct tcpcb *tp = NULL; 718 struct sockaddr_in *fin, *lin; 719 #ifdef INET6 720 struct sockaddr_in6 *fin6, *lin6; 721 struct in6_addr f6, l6; 722 #endif 723 724 NET_ASSERT_LOCKED(); 725 726 if (dodrop) { 727 if (oldp != NULL || *oldlenp != 0) 728 return (EINVAL); 729 if (newp == NULL) 730 return (EPERM); 731 if (newlen < sizeof(tir)) 732 return (ENOMEM); 733 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 734 return (error); 735 } else { 736 if (oldp == NULL) 737 return (EINVAL); 738 if (*oldlenp < sizeof(tir)) 739 return (ENOMEM); 740 if (newp != NULL || newlen != 0) 741 return (EINVAL); 742 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 743 return (error); 744 } 745 switch (tir.faddr.ss_family) { 746 #ifdef INET6 747 case AF_INET6: 748 fin6 = (struct sockaddr_in6 *)&tir.faddr; 749 error = in6_embedscope(&f6, fin6, NULL); 750 if (error) 751 return EINVAL; /*?*/ 752 lin6 = (struct sockaddr_in6 *)&tir.laddr; 753 error = in6_embedscope(&l6, lin6, NULL); 754 if (error) 755 return EINVAL; /*?*/ 756 break; 757 #endif 758 case AF_INET: 759 fin = (struct sockaddr_in *)&tir.faddr; 760 lin = (struct sockaddr_in *)&tir.laddr; 761 break; 762 default: 763 return (EINVAL); 764 } 765 766 switch (tir.faddr.ss_family) { 767 #ifdef INET6 768 case AF_INET6: 769 inp = in6_pcbhashlookup(&tcbtable, &f6, 770 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 771 break; 772 #endif 773 case AF_INET: 774 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 775 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 776 break; 777 default: 778 unhandled_af(tir.faddr.ss_family); 779 } 780 781 if (dodrop) { 782 if (inp && (tp = intotcpcb(inp)) && 783 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 784 tp = tcp_drop(tp, ECONNABORTED); 785 else 786 error = ESRCH; 787 return (error); 788 } 789 790 if (inp == NULL) { 791 tcpstat_inc(tcps_pcbhashmiss); 792 switch (tir.faddr.ss_family) { 793 #ifdef INET6 794 case AF_INET6: 795 inp = in6_pcblookup_listen(&tcbtable, 796 &l6, lin6->sin6_port, 0, NULL, tir.rdomain); 797 break; 798 #endif 799 case AF_INET: 800 inp = in_pcblookup_listen(&tcbtable, 801 lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain); 802 break; 803 } 804 } 805 806 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 807 tir.ruid = inp->inp_socket->so_ruid; 808 tir.euid = inp->inp_socket->so_euid; 809 } else { 810 tir.ruid = -1; 811 tir.euid = -1; 812 } 813 814 *oldlenp = sizeof (tir); 815 error = copyout((void *)&tir, oldp, sizeof (tir)); 816 return (error); 817 } 818 819 int 820 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 821 { 822 uint64_t counters[tcps_ncounters]; 823 struct tcpstat tcpstat; 824 struct syn_cache_set *set; 825 int i = 0; 826 827 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 828 829 memset(&tcpstat, 0, sizeof tcpstat); 830 counters_read(tcpcounters, counters, nitems(counters)); 831 ASSIGN(tcps_connattempt); 832 ASSIGN(tcps_accepts); 833 ASSIGN(tcps_connects); 834 ASSIGN(tcps_drops); 835 ASSIGN(tcps_conndrops); 836 ASSIGN(tcps_closed); 837 ASSIGN(tcps_segstimed); 838 ASSIGN(tcps_rttupdated); 839 ASSIGN(tcps_delack); 840 ASSIGN(tcps_timeoutdrop); 841 ASSIGN(tcps_rexmttimeo); 842 ASSIGN(tcps_persisttimeo); 843 ASSIGN(tcps_persistdrop); 844 ASSIGN(tcps_keeptimeo); 845 ASSIGN(tcps_keepprobe); 846 ASSIGN(tcps_keepdrops); 847 ASSIGN(tcps_sndtotal); 848 ASSIGN(tcps_sndpack); 849 ASSIGN(tcps_sndbyte); 850 ASSIGN(tcps_sndrexmitpack); 851 ASSIGN(tcps_sndrexmitbyte); 852 ASSIGN(tcps_sndrexmitfast); 853 ASSIGN(tcps_sndacks); 854 ASSIGN(tcps_sndprobe); 855 ASSIGN(tcps_sndurg); 856 ASSIGN(tcps_sndwinup); 857 ASSIGN(tcps_sndctrl); 858 ASSIGN(tcps_rcvtotal); 859 ASSIGN(tcps_rcvpack); 860 ASSIGN(tcps_rcvbyte); 861 ASSIGN(tcps_rcvbadsum); 862 ASSIGN(tcps_rcvbadoff); 863 ASSIGN(tcps_rcvmemdrop); 864 ASSIGN(tcps_rcvnosec); 865 ASSIGN(tcps_rcvshort); 866 ASSIGN(tcps_rcvduppack); 867 ASSIGN(tcps_rcvdupbyte); 868 ASSIGN(tcps_rcvpartduppack); 869 ASSIGN(tcps_rcvpartdupbyte); 870 ASSIGN(tcps_rcvoopack); 871 ASSIGN(tcps_rcvoobyte); 872 ASSIGN(tcps_rcvpackafterwin); 873 ASSIGN(tcps_rcvbyteafterwin); 874 ASSIGN(tcps_rcvafterclose); 875 ASSIGN(tcps_rcvwinprobe); 876 ASSIGN(tcps_rcvdupack); 877 ASSIGN(tcps_rcvacktoomuch); 878 ASSIGN(tcps_rcvacktooold); 879 ASSIGN(tcps_rcvackpack); 880 ASSIGN(tcps_rcvackbyte); 881 ASSIGN(tcps_rcvwinupd); 882 ASSIGN(tcps_pawsdrop); 883 ASSIGN(tcps_predack); 884 ASSIGN(tcps_preddat); 885 ASSIGN(tcps_pcbhashmiss); 886 ASSIGN(tcps_noport); 887 ASSIGN(tcps_badsyn); 888 ASSIGN(tcps_dropsyn); 889 ASSIGN(tcps_rcvbadsig); 890 ASSIGN(tcps_rcvgoodsig); 891 ASSIGN(tcps_inswcsum); 892 ASSIGN(tcps_outswcsum); 893 ASSIGN(tcps_ecn_accepts); 894 ASSIGN(tcps_ecn_rcvece); 895 ASSIGN(tcps_ecn_rcvcwr); 896 ASSIGN(tcps_ecn_rcvce); 897 ASSIGN(tcps_ecn_sndect); 898 ASSIGN(tcps_ecn_sndece); 899 ASSIGN(tcps_ecn_sndcwr); 900 ASSIGN(tcps_cwr_ecn); 901 ASSIGN(tcps_cwr_frecovery); 902 ASSIGN(tcps_cwr_timeout); 903 ASSIGN(tcps_sc_added); 904 ASSIGN(tcps_sc_completed); 905 ASSIGN(tcps_sc_timed_out); 906 ASSIGN(tcps_sc_overflowed); 907 ASSIGN(tcps_sc_reset); 908 ASSIGN(tcps_sc_unreach); 909 ASSIGN(tcps_sc_bucketoverflow); 910 ASSIGN(tcps_sc_aborted); 911 ASSIGN(tcps_sc_dupesyn); 912 ASSIGN(tcps_sc_dropped); 913 ASSIGN(tcps_sc_collisions); 914 ASSIGN(tcps_sc_retransmitted); 915 ASSIGN(tcps_sc_seedrandom); 916 ASSIGN(tcps_sc_hash_size); 917 ASSIGN(tcps_sc_entry_count); 918 ASSIGN(tcps_sc_entry_limit); 919 ASSIGN(tcps_sc_bucket_maxlen); 920 ASSIGN(tcps_sc_bucket_limit); 921 ASSIGN(tcps_sc_uses_left); 922 ASSIGN(tcps_conndrained); 923 ASSIGN(tcps_sack_recovery_episode); 924 ASSIGN(tcps_sack_rexmits); 925 ASSIGN(tcps_sack_rexmit_bytes); 926 ASSIGN(tcps_sack_rcv_opts); 927 ASSIGN(tcps_sack_snd_opts); 928 929 #undef ASSIGN 930 931 set = &tcp_syn_cache[tcp_syn_cache_active]; 932 tcpstat.tcps_sc_hash_size = set->scs_size; 933 tcpstat.tcps_sc_entry_count = set->scs_count; 934 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 935 tcpstat.tcps_sc_bucket_maxlen = 0; 936 for (i = 0; i < set->scs_size; i++) { 937 if (tcpstat.tcps_sc_bucket_maxlen < 938 set->scs_buckethead[i].sch_length) 939 tcpstat.tcps_sc_bucket_maxlen = 940 set->scs_buckethead[i].sch_length; 941 } 942 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 943 tcpstat.tcps_sc_uses_left = set->scs_use; 944 945 return (sysctl_rdstruct(oldp, oldlenp, newp, 946 &tcpstat, sizeof(tcpstat))); 947 } 948 949 /* 950 * Sysctl for tcp variables. 951 */ 952 int 953 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 954 size_t newlen) 955 { 956 int error, nval; 957 958 NET_ASSERT_LOCKED(); 959 960 /* All sysctl names at this level are terminal. */ 961 if (namelen != 1) 962 return (ENOTDIR); 963 964 switch (name[0]) { 965 #ifdef TCP_SACK 966 case TCPCTL_SACK: 967 return (sysctl_int(oldp, oldlenp, newp, newlen, 968 &tcp_do_sack)); 969 #endif 970 case TCPCTL_SLOWHZ: 971 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 972 973 case TCPCTL_BADDYNAMIC: 974 return (sysctl_struct(oldp, oldlenp, newp, newlen, 975 baddynamicports.tcp, sizeof(baddynamicports.tcp))); 976 977 case TCPCTL_ROOTONLY: 978 if (newp && securelevel > 0) 979 return (EPERM); 980 return (sysctl_struct(oldp, oldlenp, newp, newlen, 981 rootonlyports.tcp, sizeof(rootonlyports.tcp))); 982 983 case TCPCTL_IDENT: 984 return (tcp_ident(oldp, oldlenp, newp, newlen, 0)); 985 986 case TCPCTL_DROP: 987 return (tcp_ident(oldp, oldlenp, newp, newlen, 1)); 988 989 case TCPCTL_ALWAYS_KEEPALIVE: 990 return (sysctl_int(oldp, oldlenp, newp, newlen, 991 &tcp_always_keepalive)); 992 993 #ifdef TCP_ECN 994 case TCPCTL_ECN: 995 return (sysctl_int(oldp, oldlenp, newp, newlen, 996 &tcp_do_ecn)); 997 #endif 998 case TCPCTL_REASS_LIMIT: 999 nval = tcp_reass_limit; 1000 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1001 if (error) 1002 return (error); 1003 if (nval != tcp_reass_limit) { 1004 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1005 if (error) 1006 return (error); 1007 tcp_reass_limit = nval; 1008 } 1009 return (0); 1010 #ifdef TCP_SACK 1011 case TCPCTL_SACKHOLE_LIMIT: 1012 nval = tcp_sackhole_limit; 1013 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1014 if (error) 1015 return (error); 1016 if (nval != tcp_sackhole_limit) { 1017 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1018 if (error) 1019 return (error); 1020 tcp_sackhole_limit = nval; 1021 } 1022 return (0); 1023 #endif 1024 1025 case TCPCTL_STATS: 1026 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1027 1028 case TCPCTL_SYN_USE_LIMIT: 1029 error = sysctl_int(oldp, oldlenp, newp, newlen, 1030 &tcp_syn_use_limit); 1031 if (error) 1032 return (error); 1033 if (newp != NULL) { 1034 /* 1035 * Global tcp_syn_use_limit is used when reseeding a 1036 * new cache. Also update the value in active cache. 1037 */ 1038 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1039 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1040 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1041 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1042 } 1043 return (0); 1044 1045 case TCPCTL_SYN_HASH_SIZE: 1046 nval = tcp_syn_hash_size; 1047 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1048 if (error) 1049 return (error); 1050 if (nval != tcp_syn_hash_size) { 1051 if (nval < 1 || nval > 100000) 1052 return (EINVAL); 1053 /* 1054 * If global hash size has been changed, switch sets as 1055 * soon as possible. Then the actual hash array will 1056 * be reallocated. 1057 */ 1058 if (tcp_syn_cache[0].scs_size != nval) 1059 tcp_syn_cache[0].scs_use = 0; 1060 if (tcp_syn_cache[1].scs_size != nval) 1061 tcp_syn_cache[1].scs_use = 0; 1062 tcp_syn_hash_size = nval; 1063 } 1064 return (0); 1065 1066 default: 1067 if (name[0] < TCPCTL_MAXID) 1068 return (sysctl_int_arr(tcpctl_vars, name, namelen, 1069 oldp, oldlenp, newp, newlen)); 1070 return (ENOPROTOOPT); 1071 } 1072 /* NOTREACHED */ 1073 } 1074 1075 /* 1076 * Scale the send buffer so that inflight data is not accounted against 1077 * the limit. The buffer will scale with the congestion window, if the 1078 * the receiver stops acking data the window will shrink and therefor 1079 * the buffer size will shrink as well. 1080 * In low memory situation try to shrink the buffer to the initial size 1081 * disabling the send buffer scaling as long as the situation persists. 1082 */ 1083 void 1084 tcp_update_sndspace(struct tcpcb *tp) 1085 { 1086 struct socket *so = tp->t_inpcb->inp_socket; 1087 u_long nmax = so->so_snd.sb_hiwat; 1088 1089 if (sbchecklowmem()) { 1090 /* low on memory try to get rid of some */ 1091 if (tcp_sendspace < nmax) 1092 nmax = tcp_sendspace; 1093 } else if (so->so_snd.sb_wat != tcp_sendspace) 1094 /* user requested buffer size, auto-scaling disabled */ 1095 nmax = so->so_snd.sb_wat; 1096 else 1097 /* automatic buffer scaling */ 1098 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1099 tp->snd_una); 1100 1101 /* a writable socket must be preserved because of poll(2) semantics */ 1102 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1103 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1104 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1105 if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1106 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2; 1107 } 1108 1109 /* round to MSS boundary */ 1110 nmax = roundup(nmax, tp->t_maxseg); 1111 1112 if (nmax != so->so_snd.sb_hiwat) 1113 sbreserve(so, &so->so_snd, nmax); 1114 } 1115 1116 /* 1117 * Scale the recv buffer by looking at how much data was transferred in 1118 * on approximated RTT. If more than a big part of the recv buffer was 1119 * transferred during that time we increase the buffer by a constant. 1120 * In low memory situation try to shrink the buffer to the initial size. 1121 */ 1122 void 1123 tcp_update_rcvspace(struct tcpcb *tp) 1124 { 1125 struct socket *so = tp->t_inpcb->inp_socket; 1126 u_long nmax = so->so_rcv.sb_hiwat; 1127 1128 if (sbchecklowmem()) { 1129 /* low on memory try to get rid of some */ 1130 if (tcp_recvspace < nmax) 1131 nmax = tcp_recvspace; 1132 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1133 /* user requested buffer size, auto-scaling disabled */ 1134 nmax = so->so_rcv.sb_wat; 1135 else { 1136 /* automatic buffer scaling */ 1137 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1138 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1139 tcp_autorcvbuf_inc); 1140 } 1141 1142 /* a readable socket must be preserved because of poll(2) semantics */ 1143 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1144 nmax < so->so_snd.sb_lowat) 1145 nmax = so->so_snd.sb_lowat; 1146 1147 if (nmax == so->so_rcv.sb_hiwat) 1148 return; 1149 1150 /* round to MSS boundary */ 1151 nmax = roundup(nmax, tp->t_maxseg); 1152 sbreserve(so, &so->so_rcv, nmax); 1153 } 1154