/*	$OpenBSD: tcp_usrreq.c,v 1.163 2018/01/09 15:14:23 mpi Exp $	*/
/*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;

int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;

struct	inpcbtable tcbtable;

int	tcp_ident(void *, size_t *, void *, size_t, int);

/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 */
/*ARGSUSED*/
int
tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int error = 0;
	short ostate;

	soassertlocked(so);

	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}
	if (control && control->m_len) {
		m_freem(control);
		m_freem(m);
		return (EINVAL);
	}

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		/*
		 * The following corrects an mbuf leak under rare
		 * circumstances
		 */
		if (req == PRU_SEND || req == PRU_SENDOOB)
			m_freem(m);
		return (error);
	}
	if (inp) {
		tp = intotcpcb(inp);
		/* tp might get 0 when using socket splicing */
		if (tp == NULL) {
			return (0);
		}
#ifdef KPROF
		tcp_acounts[tp->t_state][req]++;
#endif
		ostate = tp->t_state;
	} else
		ostate = 0;
	switch (req) {

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6) {
			struct sockaddr_in6 *sin6;

			if ((error = in6_nam2sin6(nam, &sin6)))
				break;
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
				error = EINVAL;
				break;
			}
			error = in6_pcbconnect(inp, nam);
		} else
#endif /* INET6 */
		{
			struct sockaddr_in *sin;

			if ((error = in_nam2sin(nam, &sin)))
				break;
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}
			error = in_pcbconnect(inp, nam);
		}
		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/* Compute window scaling to request. */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat_inc(tcps_connattempt);
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
		tp->snd_last = tp->snd_una;
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket.  We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(so, &so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		return (0);

	case PRU_RCVOOB:
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		if (sbspace(so, &so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
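		/*
		 * The t_force toggle around the tcp_output() call below
		 * forces the urgent byte out even if the peer's receive
		 * window is currently closed (the same mechanism the
		 * persist timer uses for window probes).
		 */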
		sbappendstream(so, &so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
	return (error);
}

int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, 0, tp, (caddr_t)0, 0 /* XXX */, 0);
	return (0);
}

int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int error = 0;
	short ostate;

	soassertlocked(so);

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;

		return (error);
	}
	if (inp) {
		tp = intotcpcb(inp);
		/* tp might get 0 when using socket splicing */
		if (tp == NULL) {
			return (0);
		}
#ifdef KPROF
		tcp_acounts[tp->t_state][req]++;
#endif
		ostate = tp->t_state;
	} else
		ostate = 0;

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tcp_disconnect(tp);

	return (error);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_disconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0)
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0)
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		return (error);
	}

	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}

int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);

#undef ASSIGN

	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_SACK:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_sack);
		NET_UNLOCK();
		return (error);

	case TCPCTL_SLOWHZ:
		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_ALWAYS_KEEPALIVE:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_always_keepalive);
		NET_UNLOCK();
		return (error);

#ifdef TCP_ECN
	case TCPCTL_ECN:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_ecn);
		NET_UNLOCK();
		return (error);
#endif
	case TCPCTL_REASS_LIMIT:
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_syn_hash_size) {
			if (nval < 1 || nval > 100000) {
				error = EINVAL;
			} else {
				/*
				 * If global hash size has been changed,
				 * switch sets as soon as possible.  Then
				 * the actual hash array will be reallocated.
				 */
				if (tcp_syn_cache[0].scs_size != nval)
					tcp_syn_cache[0].scs_use = 0;
				if (tcp_syn_cache[1].scs_size != nval)
					tcp_syn_cache[1].scs_use = 0;
				tcp_syn_hash_size = nval;
			}
		}
		NET_UNLOCK();
		return (error);

	default:
		if (name[0] < TCPCTL_MAXID) {
			NET_LOCK();
			error = sysctl_int_arr(tcpctl_vars, name, namelen,
			    oldp, oldlenp, newp, newlen);
			NET_UNLOCK();
			return (error);
		}
		return (ENOPROTOOPT);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window; if the
 * receiver stops acking data, the window will shrink and therefore
 * the buffer size will shrink as well.
 * In low memory situations, try to shrink the buffer to the initial size,
 * disabling the send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(so, &so->so_snd, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If more than a large part of the recv buffer was
 * transferred during that time, we increase the buffer by a constant.
 * In low memory situations, try to shrink the buffer to the initial size.
 */
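/*
 * For example, with the defaults above (tcp_recvspace and tcp_autorcvbuf_inc
 * both 16 KB), the buffer grows by another 16 KB whenever more than 7/8 of
 * the current high-water mark was received within that interval, bounded
 * by sb_max.
 */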
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(so, &so->so_rcv, nmax);
}