/*	$OpenBSD: tcp_usrreq.c,v 1.158 2017/10/25 12:38:21 job Exp $	*/
/*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;

int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;

struct	inpcbtable tcbtable;

int	tcp_ident(void *, size_t *, void *, size_t, int);

/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 */
/*ARGSUSED*/
int
tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int error = 0;
	short ostate;

	soassertlocked(so);

	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}
	if (control && control->m_len) {
		m_freem(control);
		m_freem(m);
		return (EINVAL);
	}

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		/*
		 * The following corrects an mbuf leak under rare
		 * circumstances
		 */
		if (req == PRU_SEND || req == PRU_SENDOOB)
			m_freem(m);
		return (error);
	}
	if (inp) {
		tp = intotcpcb(inp);
		/* tp might get 0 when using socket splicing */
		if (tp == NULL) {
			return (0);
		}
#ifdef KPROF
		tcp_acounts[tp->t_state][req]++;
#endif
		ostate = tp->t_state;
	} else
		ostate = 0;
	switch (req) {

	/*
	 * PRU_DETACH detaches the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	case PRU_DETACH:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6) {
			struct sockaddr_in6 *sin6;

			if ((error = in6_nam2sin6(nam, &sin6)))
				break;
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
			    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
				error = EINVAL;
				break;
			}
			error = in6_pcbconnect(inp, nam);
		} else
#endif /* INET6 */
		{
			struct sockaddr_in *sin;

			if ((error = in_nam2sin(nam, &sin)))
				break;
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}
			error = in_pcbconnect(inp, nam);
		}
		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/* Compute window scaling to request.  */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat_inc(tcps_connattempt);
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
		tp->snd_last = tp->snd_una;
		error = tcp_output(tp);
		break;

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket.  We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(so, &so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		return (0);

	case PRU_RCVOOB:
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;

	case PRU_SENDOOB:
		if (sbspace(so, &so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream(so, &so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
	return (error);
}
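
/*
 * Illustrative sketch (not part of the kernel build): the PRU_* requests
 * handled above are driven by ordinary socket system calls from userland.
 * Roughly, socket(2) leads to tcp_attach(), bind(2) to PRU_BIND, listen(2)
 * to PRU_LISTEN, connect(2) to PRU_CONNECT, send(2) to PRU_SEND,
 * shutdown(2) to PRU_SHUTDOWN and close(2) eventually to PRU_DETACH, e.g.:
 *
 *	int s = socket(AF_INET, SOCK_STREAM, 0);	// tcp_attach()
 *	connect(s, (struct sockaddr *)&sin, sizeof(sin));	// PRU_CONNECT
 *	send(s, buf, len, 0);				// PRU_SEND
 *	shutdown(s, SHUT_WR);				// PRU_SHUTDOWN
 *	close(s);					// PRU_DETACH
 */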

int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
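
/*
 * Illustrative sketch (not part of the kernel build): the PRCO_SETOPT and
 * PRCO_GETOPT cases above are reached from userland through setsockopt(2)
 * and getsockopt(2) at level IPPROTO_TCP, e.g. to toggle TCP_NODELAY:
 *
 *	int on = 1, cur;
 *	socklen_t len = sizeof(cur);
 *
 *	setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
 *	getsockopt(s, IPPROTO_TCP, TCP_NODELAY, &cur, &len);
 */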

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state if to accept connections.
 */
int
tcp_attach(struct socket *so, int proto)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, 0, tp, (caddr_t)0, 0 /* XXX */, 0);
	return (0);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_disconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		return (error);
	}

	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, 0, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}
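
/*
 * Illustrative sketch (not part of the kernel build): tcp_ident() is
 * reached via sysctl(2) using the TCPCTL_IDENT (lookup) or TCPCTL_DROP
 * (drop, as used by tcpdrop(8)) names.  A userland caller fills in a
 * struct tcp_ident_mapping with the foreign and local addresses (and the
 * routing domain) of the connection, then does roughly:
 *
 *	struct tcp_ident_mapping tir;	// faddr/laddr/rdomain set by caller
 *	int mib[] = { CTL_NET, AF_INET, IPPROTO_TCP, TCPCTL_DROP };
 *
 *	sysctl(mib, 4, NULL, NULL, &tir, sizeof(tir));	// drop connection
 */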

int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);

#undef ASSIGN

	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}
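
/*
 * Illustrative sketch (not part of the kernel build): the statistics
 * assembled above are exported read-only through the TCPCTL_STATS sysctl,
 * which is how netstat(1) typically obtains them.  A userland reader would
 * do roughly:
 *
 *	struct tcpstat stats;
 *	size_t len = sizeof(stats);
 *	int mib[] = { CTL_NET, AF_INET, IPPROTO_TCP, TCPCTL_STATS };
 *
 *	sysctl(mib, 4, &stats, &len, NULL, 0);
 */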

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_SACK:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_sack);
		NET_UNLOCK();
		return (error);

	case TCPCTL_SLOWHZ:
		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_ALWAYS_KEEPALIVE:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_always_keepalive);
		NET_UNLOCK();
		return (error);

#ifdef TCP_ECN
	case TCPCTL_ECN:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_ecn);
		NET_UNLOCK();
		return (error);
#endif
	case TCPCTL_REASS_LIMIT:
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_syn_hash_size) {
			if (nval < 1 || nval > 100000) {
				error = EINVAL;
			} else {
				/*
				 * If global hash size has been changed,
				 * switch sets as soon as possible.  Then
				 * the actual hash array will be reallocated.
				 */
				if (tcp_syn_cache[0].scs_size != nval)
					tcp_syn_cache[0].scs_use = 0;
				if (tcp_syn_cache[1].scs_size != nval)
					tcp_syn_cache[1].scs_use = 0;
				tcp_syn_hash_size = nval;
			}
		}
		NET_UNLOCK();
		return (error);

	default:
		if (name[0] < TCPCTL_MAXID) {
			NET_LOCK();
			error = sysctl_int_arr(tcpctl_vars, name, namelen,
			    oldp, oldlenp, newp, newlen);
			NET_UNLOCK();
			return (error);
		}
		return (ENOPROTOOPT);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window; if the
 * receiver stops acking data the window will shrink and therefore the
 * buffer size will shrink as well.
 * In a low memory situation try to shrink the buffer to the initial size,
 * disabling send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory, try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(so, &so->so_snd, nmax);
}
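
/*
 * Worked example of the automatic scaling above (assuming the default
 * sb_wat of tcp_sendspace = 16384 bytes and a typical t_maxseg of 1460):
 * with 64k of data in flight (snd_max - snd_una = 65536), nmax becomes
 * MIN(sb_max, 16384 + 65536) = 81920, which roundup() then raises to the
 * next multiple of 1460, i.e. 57 * 1460 = 83220 bytes.  As the in-flight
 * data is acked, snd_max - snd_una drops back toward zero and the buffer
 * shrinks again.
 */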

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If a large part of the recv buffer was
 * transferred during that time we increase the buffer by a constant.
 * In a low memory situation try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory, try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(so, &so->so_rcv, nmax);
}
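
/*
 * Worked example of the autoscaling above (assuming the defaults defined
 * earlier in this file, tcp_recvspace = 16384 and tcp_autorcvbuf_inc =
 * 16384): if more than 7/8 of the current 16k high-water mark (14336
 * bytes) was received within one approximated RTT, the mark grows to 32k,
 * then 48k on the next trigger, and so on, capped at sb_max.
 */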