1 /* $OpenBSD: tcp_usrreq.c,v 1.176 2020/08/18 18:19:30 gnezdo Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

/* Default send/receive socket buffer sizes, overridable at build time. */
#ifndef	TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef	TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;	/* step for receive buffer scaling */

/* Simple integer sysctl variables with their permitted [min, max] bounds. */
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT },
	{ TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE },
	{ TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
};

struct	inpcbtable tcbtable;

int tcp_ident(void *, size_t *, void *, size_t, int);

/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 *
 * Returns 0 or an errno.  On the error paths taken before the PRU switch
 * (the "release" label) both m and control are freed unless the request
 * is one that does not pass mbuf ownership (PRU_RCVD/PRU_RCVOOB/PRU_SENSE).
 */
/*ARGSUSED*/
int
tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp = NULL;
	int error = 0;
	short ostate;

	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}

	soassertlocked(so);

	if (control && control->m_len) {
		error = EINVAL;
		goto release;
	}

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		goto release;
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		goto release;
	if (so->so_options & SO_DEBUG) {
		/* remember pre-request state for the tcp_trace() call below */
		otp = tp;
		ostate = tp->t_state;
	}

	switch (req) {

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6) {
			struct sockaddr_in6 *sin6;

			if ((error = in6_nam2sin6(nam, &sin6)))
				break;
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
				error = EINVAL;
				break;
			}
			error = in6_pcbconnect(inp, nam);
		} else
#endif /* INET6 */
		{
			struct sockaddr_in *sin;

			if ((error = in_nam2sin(nam, &sin)))
				break;
			/* reject wildcard, broadcast and multicast peers */
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}
			error = in_pcbconnect(inp, nam);
		}
		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			/* undo the connect; header template allocation failed */
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/* Compute window scaling to request.  */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat_inc(tcps_connattempt);
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
		tp->snd_last = tp->snd_una;
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket. We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(so, &so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	/* fstat(2): m actually carries a struct stat to fill in here */
	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		break;

	/* note: for PRU_RCVOOB, nam carries the recv flags (MSG_PEEK) */
	case PRU_RCVOOB:
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		if (sbspace(so, &so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream(so, &so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, req, 0);
	return (error);

release:
	/* these requests do not pass mbuf ownership to us; don't free */
	if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) {
		m_freem(control);
		m_freem(m);
	}
	return (error);
}

/*
 * TCP socket option processing (setsockopt/getsockopt).  Options at a
 * level other than IPPROTO_TCP are handed down to the IP layer.
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				/* clearing NOPUSH; flush any data held back */
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* the MSS may only be lowered, never raised */
			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				/* MD5 signatures and SACK are incompatible */
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	/* reserve default buffer space unless the user already tuned it */
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}

/*
 * Detach TCP from the socket; the disconnect may linger if data
 * still has to drain (see tcp_disconnect() below).
 */
int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp = NULL;
	int error = 0;
	short ostate;

	soassertlocked(so);

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		return (error);
	}
	tp = intotcpcb(inp);
	/* tp might get 0 when using socket splicing */
	if (tp == NULL)
		return (0);
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_disconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (error);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 *
 * Returns the (possibly freed-and-NULL) tcpcb.
 */
struct tcpcb *
tcp_disconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 *
 * Returns the (possibly freed-and-NULL) tcpcb.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		/* no SYN exchanged yet; just discard the connection */
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 *
 * Backend for the TCPCTL_IDENT and TCPCTL_DROP sysctls: a userland
 * supplied struct tcp_ident_mapping names a connection by its address
 * pair and rdomain.  With dodrop the matching non-listening connection
 * is aborted; otherwise the owning socket's real/effective uids are
 * copied back out (-1 if no outbound connection matches).
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/* validate the sysctl buffer shapes before touching user memory */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		/* never drop listening sockets, only live connections */
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		return (error);
	}

	/* exact match failed; fall back to a listening socket on laddr */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}

/*
 * TCPCTL_STATS: export a struct tcpstat snapshot built from the
 * per-CPU tcpcounters plus live syn cache figures.
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

/* copy the next counter into its struct field; order must match the enum */
#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
ASSIGN(tcps_rcvnosec); 900 ASSIGN(tcps_rcvshort); 901 ASSIGN(tcps_rcvduppack); 902 ASSIGN(tcps_rcvdupbyte); 903 ASSIGN(tcps_rcvpartduppack); 904 ASSIGN(tcps_rcvpartdupbyte); 905 ASSIGN(tcps_rcvoopack); 906 ASSIGN(tcps_rcvoobyte); 907 ASSIGN(tcps_rcvpackafterwin); 908 ASSIGN(tcps_rcvbyteafterwin); 909 ASSIGN(tcps_rcvafterclose); 910 ASSIGN(tcps_rcvwinprobe); 911 ASSIGN(tcps_rcvdupack); 912 ASSIGN(tcps_rcvacktoomuch); 913 ASSIGN(tcps_rcvacktooold); 914 ASSIGN(tcps_rcvackpack); 915 ASSIGN(tcps_rcvackbyte); 916 ASSIGN(tcps_rcvwinupd); 917 ASSIGN(tcps_pawsdrop); 918 ASSIGN(tcps_predack); 919 ASSIGN(tcps_preddat); 920 ASSIGN(tcps_pcbhashmiss); 921 ASSIGN(tcps_noport); 922 ASSIGN(tcps_badsyn); 923 ASSIGN(tcps_dropsyn); 924 ASSIGN(tcps_rcvbadsig); 925 ASSIGN(tcps_rcvgoodsig); 926 ASSIGN(tcps_inswcsum); 927 ASSIGN(tcps_outswcsum); 928 ASSIGN(tcps_ecn_accepts); 929 ASSIGN(tcps_ecn_rcvece); 930 ASSIGN(tcps_ecn_rcvcwr); 931 ASSIGN(tcps_ecn_rcvce); 932 ASSIGN(tcps_ecn_sndect); 933 ASSIGN(tcps_ecn_sndece); 934 ASSIGN(tcps_ecn_sndcwr); 935 ASSIGN(tcps_cwr_ecn); 936 ASSIGN(tcps_cwr_frecovery); 937 ASSIGN(tcps_cwr_timeout); 938 ASSIGN(tcps_sc_added); 939 ASSIGN(tcps_sc_completed); 940 ASSIGN(tcps_sc_timed_out); 941 ASSIGN(tcps_sc_overflowed); 942 ASSIGN(tcps_sc_reset); 943 ASSIGN(tcps_sc_unreach); 944 ASSIGN(tcps_sc_bucketoverflow); 945 ASSIGN(tcps_sc_aborted); 946 ASSIGN(tcps_sc_dupesyn); 947 ASSIGN(tcps_sc_dropped); 948 ASSIGN(tcps_sc_collisions); 949 ASSIGN(tcps_sc_retransmitted); 950 ASSIGN(tcps_sc_seedrandom); 951 ASSIGN(tcps_sc_hash_size); 952 ASSIGN(tcps_sc_entry_count); 953 ASSIGN(tcps_sc_entry_limit); 954 ASSIGN(tcps_sc_bucket_maxlen); 955 ASSIGN(tcps_sc_bucket_limit); 956 ASSIGN(tcps_sc_uses_left); 957 ASSIGN(tcps_conndrained); 958 ASSIGN(tcps_sack_recovery_episode); 959 ASSIGN(tcps_sack_rexmits); 960 ASSIGN(tcps_sack_rexmit_bytes); 961 ASSIGN(tcps_sack_rcv_opts); 962 ASSIGN(tcps_sack_snd_opts); 963 ASSIGN(tcps_sack_drop_opts); 964 965 #undef ASSIGN 966 967 set = 
&tcp_syn_cache[tcp_syn_cache_active]; 968 tcpstat.tcps_sc_hash_size = set->scs_size; 969 tcpstat.tcps_sc_entry_count = set->scs_count; 970 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 971 tcpstat.tcps_sc_bucket_maxlen = 0; 972 for (i = 0; i < set->scs_size; i++) { 973 if (tcpstat.tcps_sc_bucket_maxlen < 974 set->scs_buckethead[i].sch_length) 975 tcpstat.tcps_sc_bucket_maxlen = 976 set->scs_buckethead[i].sch_length; 977 } 978 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 979 tcpstat.tcps_sc_uses_left = set->scs_use; 980 981 return (sysctl_rdstruct(oldp, oldlenp, newp, 982 &tcpstat, sizeof(tcpstat))); 983 } 984 985 /* 986 * Sysctl for tcp variables. 987 */ 988 int 989 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 990 size_t newlen) 991 { 992 int error, nval; 993 994 /* All sysctl names at this level are terminal. */ 995 if (namelen != 1) 996 return (ENOTDIR); 997 998 switch (name[0]) { 999 case TCPCTL_SLOWHZ: 1000 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 1001 1002 case TCPCTL_BADDYNAMIC: 1003 NET_LOCK(); 1004 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1005 baddynamicports.tcp, sizeof(baddynamicports.tcp)); 1006 NET_UNLOCK(); 1007 return (error); 1008 1009 case TCPCTL_ROOTONLY: 1010 if (newp && securelevel > 0) 1011 return (EPERM); 1012 NET_LOCK(); 1013 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1014 rootonlyports.tcp, sizeof(rootonlyports.tcp)); 1015 NET_UNLOCK(); 1016 return (error); 1017 1018 case TCPCTL_IDENT: 1019 NET_LOCK(); 1020 error = tcp_ident(oldp, oldlenp, newp, newlen, 0); 1021 NET_UNLOCK(); 1022 return (error); 1023 1024 case TCPCTL_DROP: 1025 NET_LOCK(); 1026 error = tcp_ident(oldp, oldlenp, newp, newlen, 1); 1027 NET_UNLOCK(); 1028 return (error); 1029 1030 case TCPCTL_ALWAYS_KEEPALIVE: 1031 NET_LOCK(); 1032 error = sysctl_int(oldp, oldlenp, newp, newlen, 1033 &tcp_always_keepalive); 1034 NET_UNLOCK(); 1035 return (error); 1036 1037 case TCPCTL_REASS_LIMIT: 1038 NET_LOCK(); 1039 
nval = tcp_reass_limit; 1040 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1041 if (!error && nval != tcp_reass_limit) { 1042 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1043 if (!error) 1044 tcp_reass_limit = nval; 1045 } 1046 NET_UNLOCK(); 1047 return (error); 1048 1049 case TCPCTL_SACKHOLE_LIMIT: 1050 NET_LOCK(); 1051 nval = tcp_sackhole_limit; 1052 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1053 if (!error && nval != tcp_sackhole_limit) { 1054 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1055 if (!error) 1056 tcp_sackhole_limit = nval; 1057 } 1058 NET_UNLOCK(); 1059 return (error); 1060 1061 case TCPCTL_STATS: 1062 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1063 1064 case TCPCTL_SYN_USE_LIMIT: 1065 NET_LOCK(); 1066 error = sysctl_int(oldp, oldlenp, newp, newlen, 1067 &tcp_syn_use_limit); 1068 if (!error && newp != NULL) { 1069 /* 1070 * Global tcp_syn_use_limit is used when reseeding a 1071 * new cache. Also update the value in active cache. 1072 */ 1073 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1074 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1075 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1076 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1077 } 1078 NET_UNLOCK(); 1079 return (error); 1080 1081 case TCPCTL_SYN_HASH_SIZE: 1082 NET_LOCK(); 1083 nval = tcp_syn_hash_size; 1084 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1085 if (!error && nval != tcp_syn_hash_size) { 1086 if (nval < 1 || nval > 100000) { 1087 error = EINVAL; 1088 } else { 1089 /* 1090 * If global hash size has been changed, 1091 * switch sets as soon as possible. Then 1092 * the actual hash array will be reallocated. 
1093 */ 1094 if (tcp_syn_cache[0].scs_size != nval) 1095 tcp_syn_cache[0].scs_use = 0; 1096 if (tcp_syn_cache[1].scs_size != nval) 1097 tcp_syn_cache[1].scs_use = 0; 1098 tcp_syn_hash_size = nval; 1099 } 1100 } 1101 NET_UNLOCK(); 1102 return (error); 1103 1104 default: 1105 NET_LOCK(); 1106 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name, 1107 namelen, oldp, oldlenp, newp, newlen); 1108 NET_UNLOCK(); 1109 return (error); 1110 } 1111 /* NOTREACHED */ 1112 } 1113 1114 /* 1115 * Scale the send buffer so that inflight data is not accounted against 1116 * the limit. The buffer will scale with the congestion window, if the 1117 * the receiver stops acking data the window will shrink and therefor 1118 * the buffer size will shrink as well. 1119 * In low memory situation try to shrink the buffer to the initial size 1120 * disabling the send buffer scaling as long as the situation persists. 1121 */ 1122 void 1123 tcp_update_sndspace(struct tcpcb *tp) 1124 { 1125 struct socket *so = tp->t_inpcb->inp_socket; 1126 u_long nmax = so->so_snd.sb_hiwat; 1127 1128 if (sbchecklowmem()) { 1129 /* low on memory try to get rid of some */ 1130 if (tcp_sendspace < nmax) 1131 nmax = tcp_sendspace; 1132 } else if (so->so_snd.sb_wat != tcp_sendspace) 1133 /* user requested buffer size, auto-scaling disabled */ 1134 nmax = so->so_snd.sb_wat; 1135 else 1136 /* automatic buffer scaling */ 1137 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1138 tp->snd_una); 1139 1140 /* a writable socket must be preserved because of poll(2) semantics */ 1141 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1142 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1143 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1144 /* keep in sync with sbreserve() calculation */ 1145 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1146 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8; 1147 } 1148 1149 /* round to MSS boundary */ 1150 nmax = roundup(nmax, tp->t_maxseg); 1151 1152 
if (nmax != so->so_snd.sb_hiwat) 1153 sbreserve(so, &so->so_snd, nmax); 1154 } 1155 1156 /* 1157 * Scale the recv buffer by looking at how much data was transferred in 1158 * on approximated RTT. If more than a big part of the recv buffer was 1159 * transferred during that time we increase the buffer by a constant. 1160 * In low memory situation try to shrink the buffer to the initial size. 1161 */ 1162 void 1163 tcp_update_rcvspace(struct tcpcb *tp) 1164 { 1165 struct socket *so = tp->t_inpcb->inp_socket; 1166 u_long nmax = so->so_rcv.sb_hiwat; 1167 1168 if (sbchecklowmem()) { 1169 /* low on memory try to get rid of some */ 1170 if (tcp_recvspace < nmax) 1171 nmax = tcp_recvspace; 1172 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1173 /* user requested buffer size, auto-scaling disabled */ 1174 nmax = so->so_rcv.sb_wat; 1175 else { 1176 /* automatic buffer scaling */ 1177 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1178 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1179 tcp_autorcvbuf_inc); 1180 } 1181 1182 /* a readable socket must be preserved because of poll(2) semantics */ 1183 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1184 nmax < so->so_snd.sb_lowat) 1185 nmax = so->so_snd.sb_lowat; 1186 1187 if (nmax == so->so_rcv.sb_hiwat) 1188 return; 1189 1190 /* round to MSS boundary */ 1191 nmax = roundup(nmax, tp->t_maxseg); 1192 sbreserve(so, &so->so_rcv, nmax); 1193 } 1194