1 /* $OpenBSD: tcp_usrreq.c,v 1.142 2017/01/10 09:01:18 mpi Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcpip.h> 98 #include <netinet/tcp_debug.h> 99 100 #ifdef INET6 101 #include <netinet6/in6_var.h> 102 #endif 103 104 #ifndef TCP_SENDSPACE 105 #define TCP_SENDSPACE 1024*16 106 #endif 107 u_int tcp_sendspace = TCP_SENDSPACE; 108 #ifndef TCP_RECVSPACE 109 #define TCP_RECVSPACE 1024*16 110 #endif 111 u_int tcp_recvspace = TCP_RECVSPACE; 112 u_int tcp_autorcvbuf_inc = 16 * 1024; 113 114 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 115 116 struct inpcbtable tcbtable; 117 118 int tcp_ident(void *, size_t *, void *, size_t, int); 119 120 /* 121 * Process a TCP user request for TCP tb. If this is a send request 122 * then m is the mbuf chain of send data. If this is a timer expiration 123 * (called from the software clock routine), then timertype tells which timer. 124 */ 125 /*ARGSUSED*/ 126 int 127 tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 128 struct mbuf *control, struct proc *p) 129 { 130 struct sockaddr_in *sin; 131 struct inpcb *inp; 132 struct tcpcb *tp = NULL; 133 int error = 0; 134 short ostate; 135 136 NET_ASSERT_LOCKED(); 137 138 if (req == PRU_CONTROL) { 139 #ifdef INET6 140 if (sotopf(so) == PF_INET6) 141 return in6_control(so, (u_long)m, (caddr_t)nam, 142 (struct ifnet *)control); 143 else 144 #endif /* INET6 */ 145 return (in_control(so, (u_long)m, (caddr_t)nam, 146 (struct ifnet *)control)); 147 } 148 if (control && control->m_len) { 149 m_freem(control); 150 m_freem(m); 151 return (EINVAL); 152 } 153 154 inp = sotoinpcb(so); 155 /* 156 * When a TCP is attached to a socket, then there will be 157 * a (struct inpcb) pointed at by the socket, and this 158 * structure will point at a subsidiary (struct tcpcb). 159 */ 160 if (inp == NULL && req != PRU_ATTACH) { 161 error = so->so_error; 162 if (error == 0) 163 error = EINVAL; 164 /* 165 * The following corrects an mbuf leak under rare 166 * circumstances 167 */ 168 if (req == PRU_SEND || req == PRU_SENDOOB) 169 m_freem(m); 170 return (error); 171 } 172 if (inp) { 173 tp = intotcpcb(inp); 174 /* tp might get 0 when using socket splicing */ 175 if (tp == NULL) { 176 return (0); 177 } 178 #ifdef KPROF 179 tcp_acounts[tp->t_state][req]++; 180 #endif 181 ostate = tp->t_state; 182 } else 183 ostate = 0; 184 switch (req) { 185 186 /* 187 * TCP attaches to socket via PRU_ATTACH, reserving space, 188 * and an internet control block. 189 */ 190 case PRU_ATTACH: 191 if (inp) { 192 error = EISCONN; 193 break; 194 } 195 error = tcp_attach(so); 196 if (error) 197 break; 198 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 199 so->so_linger = TCP_LINGERTIME; 200 tp = sototcpcb(so); 201 break; 202 203 /* 204 * PRU_DETACH detaches the TCP protocol from the socket. 205 * If the protocol state is non-embryonic, then can't 206 * do this directly: have to initiate a PRU_DISCONNECT, 207 * which may finish later; embryonic TCB's can just 208 * be discarded here. 209 */ 210 case PRU_DETACH: 211 tp = tcp_disconnect(tp); 212 break; 213 214 /* 215 * Give the socket an address. 216 */ 217 case PRU_BIND: 218 error = in_pcbbind(inp, nam, p); 219 break; 220 221 /* 222 * Prepare to accept connections. 223 */ 224 case PRU_LISTEN: 225 if (inp->inp_lport == 0) 226 error = in_pcbbind(inp, NULL, p); 227 /* If the in_pcbbind() above is called, the tp->pf 228 should still be whatever it was before. */ 229 if (error == 0) 230 tp->t_state = TCPS_LISTEN; 231 break; 232 233 /* 234 * Initiate connection to peer. 235 * Create a template for use in transmissions on this connection. 236 * Enter SYN_SENT state, and mark socket as connecting. 237 * Start keep-alive timer, and seed output sequence space. 238 * Send initial segment on connection. 239 */ 240 case PRU_CONNECT: 241 sin = mtod(nam, struct sockaddr_in *); 242 243 #ifdef INET6 244 if (sin->sin_family == AF_INET6) { 245 struct in6_addr *in6_addr = &mtod(nam, 246 struct sockaddr_in6 *)->sin6_addr; 247 248 if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) || 249 IN6_IS_ADDR_MULTICAST(in6_addr) || 250 IN6_IS_ADDR_V4MAPPED(in6_addr)) { 251 error = EINVAL; 252 break; 253 } 254 255 error = in6_pcbconnect(inp, nam); 256 } else if (sin->sin_family == AF_INET) 257 #endif /* INET6 */ 258 { 259 if ((sin->sin_addr.s_addr == INADDR_ANY) || 260 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 261 IN_MULTICAST(sin->sin_addr.s_addr) || 262 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 263 error = EINVAL; 264 break; 265 } 266 267 error = in_pcbconnect(inp, nam); 268 } 269 270 if (error) 271 break; 272 273 tp->t_template = tcp_template(tp); 274 if (tp->t_template == 0) { 275 in_pcbdisconnect(inp); 276 error = ENOBUFS; 277 break; 278 } 279 280 so->so_state |= SS_CONNECTOUT; 281 282 /* Compute window scaling to request. */ 283 tcp_rscale(tp, sb_max); 284 285 soisconnecting(so); 286 tcpstat.tcps_connattempt++; 287 tp->t_state = TCPS_SYN_SENT; 288 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 289 tcp_set_iss_tsm(tp); 290 tcp_sendseqinit(tp); 291 #if defined(TCP_SACK) 292 tp->snd_last = tp->snd_una; 293 #endif 294 #if defined(TCP_SACK) && defined(TCP_FACK) 295 tp->snd_fack = tp->snd_una; 296 tp->retran_data = 0; 297 tp->snd_awnd = 0; 298 #endif 299 error = tcp_output(tp); 300 break; 301 302 /* 303 * Create a TCP connection between two sockets. 304 */ 305 case PRU_CONNECT2: 306 error = EOPNOTSUPP; 307 break; 308 309 /* 310 * Initiate disconnect from peer. 311 * If connection never passed embryonic stage, just drop; 312 * else if don't need to let data drain, then can just drop anyways, 313 * else have to begin TCP shutdown process: mark socket disconnecting, 314 * drain unread data, state switch to reflect user close, and 315 * send segment (e.g. FIN) to peer. Socket will be really disconnected 316 * when peer sends FIN and acks ours. 317 * 318 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 319 */ 320 case PRU_DISCONNECT: 321 tp = tcp_disconnect(tp); 322 break; 323 324 /* 325 * Accept a connection. Essentially all the work is 326 * done at higher levels; just return the address 327 * of the peer, storing through addr. 328 */ 329 case PRU_ACCEPT: 330 #ifdef INET6 331 if (inp->inp_flags & INP_IPV6) 332 in6_setpeeraddr(inp, nam); 333 else 334 #endif 335 in_setpeeraddr(inp, nam); 336 break; 337 338 /* 339 * Mark the connection as being incapable of further output. 340 */ 341 case PRU_SHUTDOWN: 342 if (so->so_state & SS_CANTSENDMORE) 343 break; 344 socantsendmore(so); 345 tp = tcp_usrclosed(tp); 346 if (tp) 347 error = tcp_output(tp); 348 break; 349 350 /* 351 * After a receive, possibly send window update to peer. 352 */ 353 case PRU_RCVD: 354 /* 355 * soreceive() calls this function when a user receives 356 * ancillary data on a listening socket. We don't call 357 * tcp_output in such a case, since there is no header 358 * template for a listening socket and hence the kernel 359 * will panic. 360 */ 361 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 362 (void) tcp_output(tp); 363 break; 364 365 /* 366 * Do a send by putting data in output queue and updating urgent 367 * marker if URG set. Possibly send more data. 368 */ 369 case PRU_SEND: 370 sbappendstream(&so->so_snd, m); 371 error = tcp_output(tp); 372 break; 373 374 /* 375 * Abort the TCP. 376 */ 377 case PRU_ABORT: 378 tp = tcp_drop(tp, ECONNABORTED); 379 break; 380 381 case PRU_SENSE: 382 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 383 return (0); 384 385 case PRU_RCVOOB: 386 if ((so->so_oobmark == 0 && 387 (so->so_state & SS_RCVATMARK) == 0) || 388 so->so_options & SO_OOBINLINE || 389 tp->t_oobflags & TCPOOB_HADDATA) { 390 error = EINVAL; 391 break; 392 } 393 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 394 error = EWOULDBLOCK; 395 break; 396 } 397 m->m_len = 1; 398 *mtod(m, caddr_t) = tp->t_iobc; 399 if (((long)nam & MSG_PEEK) == 0) 400 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 401 break; 402 403 case PRU_SENDOOB: 404 if (sbspace(&so->so_snd) < -512) { 405 m_freem(m); 406 error = ENOBUFS; 407 break; 408 } 409 /* 410 * According to RFC961 (Assigned Protocols), 411 * the urgent pointer points to the last octet 412 * of urgent data. We continue, however, 413 * to consider it to indicate the first octet 414 * of data past the urgent section. 415 * Otherwise, snd_up should be one lower. 416 */ 417 sbappendstream(&so->so_snd, m); 418 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 419 tp->t_force = 1; 420 error = tcp_output(tp); 421 tp->t_force = 0; 422 break; 423 424 case PRU_SOCKADDR: 425 #ifdef INET6 426 if (inp->inp_flags & INP_IPV6) 427 in6_setsockaddr(inp, nam); 428 else 429 #endif 430 in_setsockaddr(inp, nam); 431 break; 432 433 case PRU_PEERADDR: 434 #ifdef INET6 435 if (inp->inp_flags & INP_IPV6) 436 in6_setpeeraddr(inp, nam); 437 else 438 #endif 439 in_setpeeraddr(inp, nam); 440 break; 441 442 default: 443 panic("tcp_usrreq"); 444 } 445 if (tp && (so->so_options & SO_DEBUG)) 446 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0); 447 return (error); 448 } 449 450 int 451 tcp_ctloutput(int op, struct socket *so, int level, int optname, 452 struct mbuf **mp) 453 { 454 int error = 0; 455 struct inpcb *inp; 456 struct tcpcb *tp; 457 struct mbuf *m; 458 int i; 459 460 inp = sotoinpcb(so); 461 if (inp == NULL) { 462 if (op == PRCO_SETOPT) 463 (void) m_free(*mp); 464 return (ECONNRESET); 465 } 466 if (level != IPPROTO_TCP) { 467 switch (so->so_proto->pr_domain->dom_family) { 468 #ifdef INET6 469 case PF_INET6: 470 error = ip6_ctloutput(op, so, level, optname, mp); 471 break; 472 #endif /* INET6 */ 473 case PF_INET: 474 error = ip_ctloutput(op, so, level, optname, mp); 475 break; 476 default: 477 error = EAFNOSUPPORT; /*?*/ 478 break; 479 } 480 return (error); 481 } 482 tp = intotcpcb(inp); 483 484 switch (op) { 485 486 case PRCO_SETOPT: 487 m = *mp; 488 switch (optname) { 489 490 case TCP_NODELAY: 491 if (m == NULL || m->m_len < sizeof (int)) 492 error = EINVAL; 493 else if (*mtod(m, int *)) 494 tp->t_flags |= TF_NODELAY; 495 else 496 tp->t_flags &= ~TF_NODELAY; 497 break; 498 499 case TCP_NOPUSH: 500 if (m == NULL || m->m_len < sizeof (int)) 501 error = EINVAL; 502 else if (*mtod(m, int *)) 503 tp->t_flags |= TF_NOPUSH; 504 else if (tp->t_flags & TF_NOPUSH) { 505 tp->t_flags &= ~TF_NOPUSH; 506 if (TCPS_HAVEESTABLISHED(tp->t_state)) 507 error = tcp_output(tp); 508 } 509 break; 510 511 case TCP_MAXSEG: 512 if (m == NULL || m->m_len < sizeof (int)) { 513 error = EINVAL; 514 break; 515 } 516 517 i = *mtod(m, int *); 518 if (i > 0 && i <= tp->t_maxseg) 519 tp->t_maxseg = i; 520 else 521 error = EINVAL; 522 break; 523 524 #ifdef TCP_SACK 525 case TCP_SACK_ENABLE: 526 if (m == NULL || m->m_len < sizeof (int)) { 527 error = EINVAL; 528 break; 529 } 530 531 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 532 error = EPERM; 533 break; 534 } 535 536 if (tp->t_flags & TF_SIGNATURE) { 537 error = EPERM; 538 break; 539 } 540 541 if (*mtod(m, int *)) 542 tp->sack_enable = 1; 543 else 544 tp->sack_enable = 0; 545 break; 546 #endif 547 #ifdef TCP_SIGNATURE 548 case TCP_MD5SIG: 549 if (m == NULL || m->m_len < sizeof (int)) { 550 error = EINVAL; 551 break; 552 } 553 554 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 555 error = EPERM; 556 break; 557 } 558 559 if (*mtod(m, int *)) { 560 tp->t_flags |= TF_SIGNATURE; 561 #ifdef TCP_SACK 562 tp->sack_enable = 0; 563 #endif /* TCP_SACK */ 564 } else 565 tp->t_flags &= ~TF_SIGNATURE; 566 break; 567 #endif /* TCP_SIGNATURE */ 568 default: 569 error = ENOPROTOOPT; 570 break; 571 } 572 m_free(m); 573 break; 574 575 case PRCO_GETOPT: 576 *mp = m = m_get(M_WAIT, MT_SOOPTS); 577 m->m_len = sizeof(int); 578 579 switch (optname) { 580 case TCP_NODELAY: 581 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 582 break; 583 case TCP_NOPUSH: 584 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 585 break; 586 case TCP_MAXSEG: 587 *mtod(m, int *) = tp->t_maxseg; 588 break; 589 #ifdef TCP_SACK 590 case TCP_SACK_ENABLE: 591 *mtod(m, int *) = tp->sack_enable; 592 break; 593 #endif 594 #ifdef TCP_SIGNATURE 595 case TCP_MD5SIG: 596 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 597 break; 598 #endif 599 default: 600 error = ENOPROTOOPT; 601 break; 602 } 603 break; 604 } 605 return (error); 606 } 607 608 /* 609 * Attach TCP protocol to socket, allocating 610 * internet protocol control block, tcp control block, 611 * bufer space, and entering LISTEN state if to accept connections. 612 */ 613 int 614 tcp_attach(struct socket *so) 615 { 616 struct tcpcb *tp; 617 struct inpcb *inp; 618 int error; 619 620 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 621 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 622 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 623 error = soreserve(so, tcp_sendspace, tcp_recvspace); 624 if (error) 625 return (error); 626 } 627 628 error = in_pcballoc(so, &tcbtable); 629 if (error) 630 return (error); 631 inp = sotoinpcb(so); 632 tp = tcp_newtcpcb(inp); 633 if (tp == NULL) { 634 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 635 636 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 637 in_pcbdetach(inp); 638 so->so_state |= nofd; 639 return (ENOBUFS); 640 } 641 tp->t_state = TCPS_CLOSED; 642 #ifdef INET6 643 /* we disallow IPv4 mapped address completely. */ 644 if (inp->inp_flags & INP_IPV6) 645 tp->pf = PF_INET6; 646 else 647 tp->pf = PF_INET; 648 #else 649 tp->pf = PF_INET; 650 #endif 651 return (0); 652 } 653 654 /* 655 * Initiate (or continue) disconnect. 656 * If embryonic state, just send reset (once). 657 * If in ``let data drain'' option and linger null, just drop. 658 * Otherwise (hard), mark socket disconnecting and drop 659 * current input data; switch states based on user close, and 660 * send segment to peer (with FIN). 661 */ 662 struct tcpcb * 663 tcp_disconnect(struct tcpcb *tp) 664 { 665 struct socket *so = tp->t_inpcb->inp_socket; 666 667 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 668 tp = tcp_close(tp); 669 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 670 tp = tcp_drop(tp, 0); 671 else { 672 soisdisconnecting(so); 673 sbflush(&so->so_rcv); 674 tp = tcp_usrclosed(tp); 675 if (tp) 676 (void) tcp_output(tp); 677 } 678 return (tp); 679 } 680 681 /* 682 * User issued close, and wish to trail through shutdown states: 683 * if never received SYN, just forget it. If got a SYN from peer, 684 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 685 * If already got a FIN from peer, then almost done; go to LAST_ACK 686 * state. In all other cases, have already sent FIN to peer (e.g. 687 * after PRU_SHUTDOWN), and just have to play tedious game waiting 688 * for peer to send FIN or not respond to keep-alives, etc. 689 * We can let the user exit from the close as soon as the FIN is acked. 690 */ 691 struct tcpcb * 692 tcp_usrclosed(struct tcpcb *tp) 693 { 694 695 switch (tp->t_state) { 696 697 case TCPS_CLOSED: 698 case TCPS_LISTEN: 699 case TCPS_SYN_SENT: 700 tp->t_state = TCPS_CLOSED; 701 tp = tcp_close(tp); 702 break; 703 704 case TCPS_SYN_RECEIVED: 705 case TCPS_ESTABLISHED: 706 tp->t_state = TCPS_FIN_WAIT_1; 707 break; 708 709 case TCPS_CLOSE_WAIT: 710 tp->t_state = TCPS_LAST_ACK; 711 break; 712 } 713 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 714 soisdisconnected(tp->t_inpcb->inp_socket); 715 /* 716 * If we are in FIN_WAIT_2, we arrived here because the 717 * application did a shutdown of the send side. Like the 718 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 719 * a full close, we start a timer to make sure sockets are 720 * not left in FIN_WAIT_2 forever. 721 */ 722 if (tp->t_state == TCPS_FIN_WAIT_2) 723 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 724 } 725 return (tp); 726 } 727 728 /* 729 * Look up a socket for ident or tcpdrop, ... 730 */ 731 int 732 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 733 { 734 int error = 0; 735 struct tcp_ident_mapping tir; 736 struct inpcb *inp; 737 struct tcpcb *tp = NULL; 738 struct sockaddr_in *fin, *lin; 739 #ifdef INET6 740 struct sockaddr_in6 *fin6, *lin6; 741 struct in6_addr f6, l6; 742 #endif 743 744 splsoftassert(IPL_SOFTNET); 745 746 if (dodrop) { 747 if (oldp != NULL || *oldlenp != 0) 748 return (EINVAL); 749 if (newp == NULL) 750 return (EPERM); 751 if (newlen < sizeof(tir)) 752 return (ENOMEM); 753 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 754 return (error); 755 } else { 756 if (oldp == NULL) 757 return (EINVAL); 758 if (*oldlenp < sizeof(tir)) 759 return (ENOMEM); 760 if (newp != NULL || newlen != 0) 761 return (EINVAL); 762 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 763 return (error); 764 } 765 switch (tir.faddr.ss_family) { 766 #ifdef INET6 767 case AF_INET6: 768 fin6 = (struct sockaddr_in6 *)&tir.faddr; 769 error = in6_embedscope(&f6, fin6, NULL); 770 if (error) 771 return EINVAL; /*?*/ 772 lin6 = (struct sockaddr_in6 *)&tir.laddr; 773 error = in6_embedscope(&l6, lin6, NULL); 774 if (error) 775 return EINVAL; /*?*/ 776 break; 777 #endif 778 case AF_INET: 779 fin = (struct sockaddr_in *)&tir.faddr; 780 lin = (struct sockaddr_in *)&tir.laddr; 781 break; 782 default: 783 return (EINVAL); 784 } 785 786 switch (tir.faddr.ss_family) { 787 #ifdef INET6 788 case AF_INET6: 789 inp = in6_pcbhashlookup(&tcbtable, &f6, 790 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 791 break; 792 #endif 793 case AF_INET: 794 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 795 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 796 break; 797 default: 798 unhandled_af(tir.faddr.ss_family); 799 } 800 801 if (dodrop) { 802 if (inp && (tp = intotcpcb(inp)) && 803 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 804 tp = tcp_drop(tp, ECONNABORTED); 805 else 806 error = ESRCH; 807 return (error); 808 } 809 810 if (inp == NULL) { 811 ++tcpstat.tcps_pcbhashmiss; 812 switch (tir.faddr.ss_family) { 813 #ifdef INET6 814 case AF_INET6: 815 inp = in6_pcblookup_listen(&tcbtable, 816 &l6, lin6->sin6_port, 0, NULL, tir.rdomain); 817 break; 818 #endif 819 case AF_INET: 820 inp = in_pcblookup_listen(&tcbtable, 821 lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain); 822 break; 823 } 824 } 825 826 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 827 tir.ruid = inp->inp_socket->so_ruid; 828 tir.euid = inp->inp_socket->so_euid; 829 } else { 830 tir.ruid = -1; 831 tir.euid = -1; 832 } 833 834 *oldlenp = sizeof (tir); 835 error = copyout((void *)&tir, oldp, sizeof (tir)); 836 return (error); 837 } 838 839 /* 840 * Sysctl for tcp variables. 841 */ 842 int 843 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 844 size_t newlen) 845 { 846 int error, nval; 847 848 NET_ASSERT_LOCKED(); 849 850 /* All sysctl names at this level are terminal. */ 851 if (namelen != 1) 852 return (ENOTDIR); 853 854 switch (name[0]) { 855 #ifdef TCP_SACK 856 case TCPCTL_SACK: 857 return (sysctl_int(oldp, oldlenp, newp, newlen, 858 &tcp_do_sack)); 859 #endif 860 case TCPCTL_SLOWHZ: 861 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 862 863 case TCPCTL_BADDYNAMIC: 864 return (sysctl_struct(oldp, oldlenp, newp, newlen, 865 baddynamicports.tcp, sizeof(baddynamicports.tcp))); 866 867 case TCPCTL_ROOTONLY: 868 if (newp && securelevel > 0) 869 return (EPERM); 870 return (sysctl_struct(oldp, oldlenp, newp, newlen, 871 rootonlyports.tcp, sizeof(rootonlyports.tcp))); 872 873 case TCPCTL_IDENT: 874 return (tcp_ident(oldp, oldlenp, newp, newlen, 0)); 875 876 case TCPCTL_DROP: 877 return (tcp_ident(oldp, oldlenp, newp, newlen, 1)); 878 879 case TCPCTL_ALWAYS_KEEPALIVE: 880 return (sysctl_int(oldp, oldlenp, newp, newlen, 881 &tcp_always_keepalive)); 882 883 #ifdef TCP_ECN 884 case TCPCTL_ECN: 885 return (sysctl_int(oldp, oldlenp, newp, newlen, 886 &tcp_do_ecn)); 887 #endif 888 case TCPCTL_REASS_LIMIT: 889 nval = tcp_reass_limit; 890 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 891 if (error) 892 return (error); 893 if (nval != tcp_reass_limit) { 894 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 895 if (error) 896 return (error); 897 tcp_reass_limit = nval; 898 } 899 return (0); 900 #ifdef TCP_SACK 901 case TCPCTL_SACKHOLE_LIMIT: 902 nval = tcp_sackhole_limit; 903 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 904 if (error) 905 return (error); 906 if (nval != tcp_sackhole_limit) { 907 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 908 if (error) 909 return (error); 910 tcp_sackhole_limit = nval; 911 } 912 return (0); 913 #endif 914 915 case TCPCTL_STATS: 916 if (newp != NULL) 917 return (EPERM); 918 { 919 struct syn_cache_set *set; 920 int i; 921 922 set = &tcp_syn_cache[tcp_syn_cache_active]; 923 tcpstat.tcps_sc_hash_size = set->scs_size; 924 tcpstat.tcps_sc_entry_count = set->scs_count; 925 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 926 tcpstat.tcps_sc_bucket_maxlen = 0; 927 for (i = 0; i < set->scs_size; i++) { 928 if (tcpstat.tcps_sc_bucket_maxlen < 929 set->scs_buckethead[i].sch_length) 930 tcpstat.tcps_sc_bucket_maxlen = 931 set->scs_buckethead[i].sch_length; 932 } 933 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 934 tcpstat.tcps_sc_uses_left = set->scs_use; 935 } 936 return (sysctl_struct(oldp, oldlenp, newp, newlen, 937 &tcpstat, sizeof(tcpstat))); 938 939 case TCPCTL_SYN_USE_LIMIT: 940 error = sysctl_int(oldp, oldlenp, newp, newlen, 941 &tcp_syn_use_limit); 942 if (error) 943 return (error); 944 if (newp != NULL) { 945 /* 946 * Global tcp_syn_use_limit is used when reseeding a 947 * new cache. Also update the value in active cache. 948 */ 949 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 950 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 951 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 952 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 953 } 954 return (0); 955 956 case TCPCTL_SYN_HASH_SIZE: 957 nval = tcp_syn_hash_size; 958 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 959 if (error) 960 return (error); 961 if (nval != tcp_syn_hash_size) { 962 if (nval < 1 || nval > 100000) 963 return (EINVAL); 964 /* 965 * If global hash size has been changed, switch sets as 966 * soon as possible. Then the actual hash array will 967 * be reallocated. 968 */ 969 if (tcp_syn_cache[0].scs_size != nval) 970 tcp_syn_cache[0].scs_use = 0; 971 if (tcp_syn_cache[1].scs_size != nval) 972 tcp_syn_cache[1].scs_use = 0; 973 tcp_syn_hash_size = nval; 974 } 975 return (0); 976 977 default: 978 if (name[0] < TCPCTL_MAXID) 979 return (sysctl_int_arr(tcpctl_vars, name, namelen, 980 oldp, oldlenp, newp, newlen)); 981 return (ENOPROTOOPT); 982 } 983 /* NOTREACHED */ 984 } 985 986 /* 987 * Scale the send buffer so that inflight data is not accounted against 988 * the limit. The buffer will scale with the congestion window, if the 989 * the receiver stops acking data the window will shrink and therefor 990 * the buffer size will shrink as well. 991 * In low memory situation try to shrink the buffer to the initial size 992 * disabling the send buffer scaling as long as the situation persists. 993 */ 994 void 995 tcp_update_sndspace(struct tcpcb *tp) 996 { 997 struct socket *so = tp->t_inpcb->inp_socket; 998 u_long nmax = so->so_snd.sb_hiwat; 999 1000 if (sbchecklowmem()) { 1001 /* low on memory try to get rid of some */ 1002 if (tcp_sendspace < nmax) 1003 nmax = tcp_sendspace; 1004 } else if (so->so_snd.sb_wat != tcp_sendspace) 1005 /* user requested buffer size, auto-scaling disabled */ 1006 nmax = so->so_snd.sb_wat; 1007 else 1008 /* automatic buffer scaling */ 1009 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1010 tp->snd_una); 1011 1012 /* a writable socket must be preserved because of poll(2) semantics */ 1013 if (sbspace(&so->so_snd) >= so->so_snd.sb_lowat) { 1014 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1015 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1016 if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1017 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2; 1018 } 1019 1020 /* round to MSS boundary */ 1021 nmax = roundup(nmax, tp->t_maxseg); 1022 1023 if (nmax != so->so_snd.sb_hiwat) 1024 sbreserve(&so->so_snd, nmax); 1025 } 1026 1027 /* 1028 * Scale the recv buffer by looking at how much data was transferred in 1029 * on approximated RTT. If more than a big part of the recv buffer was 1030 * transferred during that time we increase the buffer by a constant. 1031 * In low memory situation try to shrink the buffer to the initial size. 1032 */ 1033 void 1034 tcp_update_rcvspace(struct tcpcb *tp) 1035 { 1036 struct socket *so = tp->t_inpcb->inp_socket; 1037 u_long nmax = so->so_rcv.sb_hiwat; 1038 1039 if (sbchecklowmem()) { 1040 /* low on memory try to get rid of some */ 1041 if (tcp_recvspace < nmax) 1042 nmax = tcp_recvspace; 1043 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1044 /* user requested buffer size, auto-scaling disabled */ 1045 nmax = so->so_rcv.sb_wat; 1046 else { 1047 /* automatic buffer scaling */ 1048 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1049 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1050 tcp_autorcvbuf_inc); 1051 } 1052 1053 /* a readable socket must be preserved because of poll(2) semantics */ 1054 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1055 nmax < so->so_snd.sb_lowat) 1056 nmax = so->so_snd.sb_lowat; 1057 1058 if (nmax == so->so_rcv.sb_hiwat) 1059 return; 1060 1061 /* round to MSS boundary */ 1062 nmax = roundup(nmax, tp->t_maxseg); 1063 sbreserve(&so->so_rcv, nmax); 1064 } 1065