1 /* $OpenBSD: tcp_usrreq.c,v 1.164 2018/01/22 20:27:28 bluhm Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 #include <net/route.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_var.h> 89 #include <netinet/ip.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/ip_var.h> 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcp_debug.h> 98 99 #ifdef INET6 100 #include <netinet6/in6_var.h> 101 #endif 102 103 #ifndef TCP_SENDSPACE 104 #define TCP_SENDSPACE 1024*16 105 #endif 106 u_int tcp_sendspace = TCP_SENDSPACE; 107 #ifndef TCP_RECVSPACE 108 #define TCP_RECVSPACE 1024*16 109 #endif 110 u_int tcp_recvspace = TCP_RECVSPACE; 111 u_int tcp_autorcvbuf_inc = 16 * 1024; 112 113 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 114 115 struct inpcbtable tcbtable; 116 117 int tcp_ident(void *, size_t *, void *, size_t, int); 118 119 /* 120 * Process a TCP user request for TCP tb. If this is a send request 121 * then m is the mbuf chain of send data. If this is a timer expiration 122 * (called from the software clock routine), then timertype tells which timer. 123 */ 124 /*ARGSUSED*/ 125 int 126 tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, 127 struct mbuf *control, struct proc *p) 128 { 129 struct inpcb *inp; 130 struct tcpcb *tp = NULL; 131 int error = 0; 132 short ostate; 133 134 soassertlocked(so); 135 136 if (req == PRU_CONTROL) { 137 #ifdef INET6 138 if (sotopf(so) == PF_INET6) 139 return in6_control(so, (u_long)m, (caddr_t)nam, 140 (struct ifnet *)control); 141 else 142 #endif /* INET6 */ 143 return (in_control(so, (u_long)m, (caddr_t)nam, 144 (struct ifnet *)control)); 145 } 146 if (control && control->m_len) { 147 m_freem(control); 148 m_freem(m); 149 return (EINVAL); 150 } 151 152 inp = sotoinpcb(so); 153 /* 154 * When a TCP is attached to a socket, then there will be 155 * a (struct inpcb) pointed at by the socket, and this 156 * structure will point at a subsidiary (struct tcpcb). 157 */ 158 if (inp == NULL) { 159 error = so->so_error; 160 if (error == 0) 161 error = EINVAL; 162 /* 163 * The following corrects an mbuf leak under rare 164 * circumstances 165 */ 166 if (req == PRU_SEND || req == PRU_SENDOOB) 167 m_freem(m); 168 return (error); 169 } 170 if (inp) { 171 tp = intotcpcb(inp); 172 /* tp might get 0 when using socket splicing */ 173 if (tp == NULL) 174 return (0); 175 #ifdef KPROF 176 tcp_acounts[tp->t_state][req]++; 177 #endif 178 ostate = tp->t_state; 179 } else 180 ostate = 0; 181 switch (req) { 182 183 /* 184 * Give the socket an address. 185 */ 186 case PRU_BIND: 187 error = in_pcbbind(inp, nam, p); 188 break; 189 190 /* 191 * Prepare to accept connections. 192 */ 193 case PRU_LISTEN: 194 if (inp->inp_lport == 0) 195 error = in_pcbbind(inp, NULL, p); 196 /* If the in_pcbbind() above is called, the tp->pf 197 should still be whatever it was before. */ 198 if (error == 0) 199 tp->t_state = TCPS_LISTEN; 200 break; 201 202 /* 203 * Initiate connection to peer. 204 * Create a template for use in transmissions on this connection. 205 * Enter SYN_SENT state, and mark socket as connecting. 206 * Start keep-alive timer, and seed output sequence space. 207 * Send initial segment on connection. 208 */ 209 case PRU_CONNECT: 210 #ifdef INET6 211 if (inp->inp_flags & INP_IPV6) { 212 struct sockaddr_in6 *sin6; 213 214 if ((error = in6_nam2sin6(nam, &sin6))) 215 break; 216 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 217 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 218 error = EINVAL; 219 break; 220 } 221 error = in6_pcbconnect(inp, nam); 222 } else 223 #endif /* INET6 */ 224 { 225 struct sockaddr_in *sin; 226 227 if ((error = in_nam2sin(nam, &sin))) 228 break; 229 if ((sin->sin_addr.s_addr == INADDR_ANY) || 230 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 231 IN_MULTICAST(sin->sin_addr.s_addr) || 232 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 233 error = EINVAL; 234 break; 235 } 236 error = in_pcbconnect(inp, nam); 237 } 238 if (error) 239 break; 240 241 tp->t_template = tcp_template(tp); 242 if (tp->t_template == 0) { 243 in_pcbdisconnect(inp); 244 error = ENOBUFS; 245 break; 246 } 247 248 so->so_state |= SS_CONNECTOUT; 249 250 /* Compute window scaling to request. */ 251 tcp_rscale(tp, sb_max); 252 253 soisconnecting(so); 254 tcpstat_inc(tcps_connattempt); 255 tp->t_state = TCPS_SYN_SENT; 256 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 257 tcp_set_iss_tsm(tp); 258 tcp_sendseqinit(tp); 259 tp->snd_last = tp->snd_una; 260 error = tcp_output(tp); 261 break; 262 263 /* 264 * Create a TCP connection between two sockets. 265 */ 266 case PRU_CONNECT2: 267 error = EOPNOTSUPP; 268 break; 269 270 /* 271 * Initiate disconnect from peer. 272 * If connection never passed embryonic stage, just drop; 273 * else if don't need to let data drain, then can just drop anyways, 274 * else have to begin TCP shutdown process: mark socket disconnecting, 275 * drain unread data, state switch to reflect user close, and 276 * send segment (e.g. FIN) to peer. Socket will be really disconnected 277 * when peer sends FIN and acks ours. 278 * 279 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 280 */ 281 case PRU_DISCONNECT: 282 tp = tcp_disconnect(tp); 283 break; 284 285 /* 286 * Accept a connection. Essentially all the work is 287 * done at higher levels; just return the address 288 * of the peer, storing through addr. 289 */ 290 case PRU_ACCEPT: 291 #ifdef INET6 292 if (inp->inp_flags & INP_IPV6) 293 in6_setpeeraddr(inp, nam); 294 else 295 #endif 296 in_setpeeraddr(inp, nam); 297 break; 298 299 /* 300 * Mark the connection as being incapable of further output. 301 */ 302 case PRU_SHUTDOWN: 303 if (so->so_state & SS_CANTSENDMORE) 304 break; 305 socantsendmore(so); 306 tp = tcp_usrclosed(tp); 307 if (tp) 308 error = tcp_output(tp); 309 break; 310 311 /* 312 * After a receive, possibly send window update to peer. 313 */ 314 case PRU_RCVD: 315 /* 316 * soreceive() calls this function when a user receives 317 * ancillary data on a listening socket. We don't call 318 * tcp_output in such a case, since there is no header 319 * template for a listening socket and hence the kernel 320 * will panic. 321 */ 322 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 323 (void) tcp_output(tp); 324 break; 325 326 /* 327 * Do a send by putting data in output queue and updating urgent 328 * marker if URG set. Possibly send more data. 329 */ 330 case PRU_SEND: 331 sbappendstream(so, &so->so_snd, m); 332 error = tcp_output(tp); 333 break; 334 335 /* 336 * Abort the TCP. 337 */ 338 case PRU_ABORT: 339 tp = tcp_drop(tp, ECONNABORTED); 340 break; 341 342 case PRU_SENSE: 343 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 344 return (0); 345 346 case PRU_RCVOOB: 347 if ((so->so_oobmark == 0 && 348 (so->so_state & SS_RCVATMARK) == 0) || 349 so->so_options & SO_OOBINLINE || 350 tp->t_oobflags & TCPOOB_HADDATA) { 351 error = EINVAL; 352 break; 353 } 354 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 355 error = EWOULDBLOCK; 356 break; 357 } 358 m->m_len = 1; 359 *mtod(m, caddr_t) = tp->t_iobc; 360 if (((long)nam & MSG_PEEK) == 0) 361 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 362 break; 363 364 case PRU_SENDOOB: 365 if (sbspace(so, &so->so_snd) < -512) { 366 m_freem(m); 367 error = ENOBUFS; 368 break; 369 } 370 /* 371 * According to RFC961 (Assigned Protocols), 372 * the urgent pointer points to the last octet 373 * of urgent data. We continue, however, 374 * to consider it to indicate the first octet 375 * of data past the urgent section. 376 * Otherwise, snd_up should be one lower. 377 */ 378 sbappendstream(so, &so->so_snd, m); 379 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 380 tp->t_force = 1; 381 error = tcp_output(tp); 382 tp->t_force = 0; 383 break; 384 385 case PRU_SOCKADDR: 386 #ifdef INET6 387 if (inp->inp_flags & INP_IPV6) 388 in6_setsockaddr(inp, nam); 389 else 390 #endif 391 in_setsockaddr(inp, nam); 392 break; 393 394 case PRU_PEERADDR: 395 #ifdef INET6 396 if (inp->inp_flags & INP_IPV6) 397 in6_setpeeraddr(inp, nam); 398 else 399 #endif 400 in_setpeeraddr(inp, nam); 401 break; 402 403 default: 404 panic("tcp_usrreq"); 405 } 406 if (tp && (so->so_options & SO_DEBUG)) 407 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0); 408 return (error); 409 } 410 411 int 412 tcp_ctloutput(int op, struct socket *so, int level, int optname, 413 struct mbuf *m) 414 { 415 int error = 0; 416 struct inpcb *inp; 417 struct tcpcb *tp; 418 int i; 419 420 inp = sotoinpcb(so); 421 if (inp == NULL) 422 return (ECONNRESET); 423 if (level != IPPROTO_TCP) { 424 switch (so->so_proto->pr_domain->dom_family) { 425 #ifdef INET6 426 case PF_INET6: 427 error = ip6_ctloutput(op, so, level, optname, m); 428 break; 429 #endif /* INET6 */ 430 case PF_INET: 431 error = ip_ctloutput(op, so, level, optname, m); 432 break; 433 default: 434 error = EAFNOSUPPORT; /*?*/ 435 break; 436 } 437 return (error); 438 } 439 tp = intotcpcb(inp); 440 441 switch (op) { 442 443 case PRCO_SETOPT: 444 switch (optname) { 445 446 case TCP_NODELAY: 447 if (m == NULL || m->m_len < sizeof (int)) 448 error = EINVAL; 449 else if (*mtod(m, int *)) 450 tp->t_flags |= TF_NODELAY; 451 else 452 tp->t_flags &= ~TF_NODELAY; 453 break; 454 455 case TCP_NOPUSH: 456 if (m == NULL || m->m_len < sizeof (int)) 457 error = EINVAL; 458 else if (*mtod(m, int *)) 459 tp->t_flags |= TF_NOPUSH; 460 else if (tp->t_flags & TF_NOPUSH) { 461 tp->t_flags &= ~TF_NOPUSH; 462 if (TCPS_HAVEESTABLISHED(tp->t_state)) 463 error = tcp_output(tp); 464 } 465 break; 466 467 case TCP_MAXSEG: 468 if (m == NULL || m->m_len < sizeof (int)) { 469 error = EINVAL; 470 break; 471 } 472 473 i = *mtod(m, int *); 474 if (i > 0 && i <= tp->t_maxseg) 475 tp->t_maxseg = i; 476 else 477 error = EINVAL; 478 break; 479 480 case TCP_SACK_ENABLE: 481 if (m == NULL || m->m_len < sizeof (int)) { 482 error = EINVAL; 483 break; 484 } 485 486 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 487 error = EPERM; 488 break; 489 } 490 491 if (tp->t_flags & TF_SIGNATURE) { 492 error = EPERM; 493 break; 494 } 495 496 if (*mtod(m, int *)) 497 tp->sack_enable = 1; 498 else 499 tp->sack_enable = 0; 500 break; 501 #ifdef TCP_SIGNATURE 502 case TCP_MD5SIG: 503 if (m == NULL || m->m_len < sizeof (int)) { 504 error = EINVAL; 505 break; 506 } 507 508 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 509 error = EPERM; 510 break; 511 } 512 513 if (*mtod(m, int *)) { 514 tp->t_flags |= TF_SIGNATURE; 515 tp->sack_enable = 0; 516 } else 517 tp->t_flags &= ~TF_SIGNATURE; 518 break; 519 #endif /* TCP_SIGNATURE */ 520 default: 521 error = ENOPROTOOPT; 522 break; 523 } 524 break; 525 526 case PRCO_GETOPT: 527 m->m_len = sizeof(int); 528 529 switch (optname) { 530 case TCP_NODELAY: 531 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 532 break; 533 case TCP_NOPUSH: 534 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 535 break; 536 case TCP_MAXSEG: 537 *mtod(m, int *) = tp->t_maxseg; 538 break; 539 case TCP_SACK_ENABLE: 540 *mtod(m, int *) = tp->sack_enable; 541 break; 542 #ifdef TCP_SIGNATURE 543 case TCP_MD5SIG: 544 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 545 break; 546 #endif 547 default: 548 error = ENOPROTOOPT; 549 break; 550 } 551 break; 552 } 553 return (error); 554 } 555 556 /* 557 * Attach TCP protocol to socket, allocating 558 * internet protocol control block, tcp control block, 559 * buffer space, and entering LISTEN state to accept connections. 560 */ 561 int 562 tcp_attach(struct socket *so, int proto) 563 { 564 struct tcpcb *tp; 565 struct inpcb *inp; 566 int error; 567 568 if (so->so_pcb) 569 return EISCONN; 570 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 571 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 572 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 573 error = soreserve(so, tcp_sendspace, tcp_recvspace); 574 if (error) 575 return (error); 576 } 577 578 NET_ASSERT_LOCKED(); 579 error = in_pcballoc(so, &tcbtable); 580 if (error) 581 return (error); 582 inp = sotoinpcb(so); 583 tp = tcp_newtcpcb(inp); 584 if (tp == NULL) { 585 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */ 586 587 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 588 in_pcbdetach(inp); 589 so->so_state |= nofd; 590 return (ENOBUFS); 591 } 592 tp->t_state = TCPS_CLOSED; 593 #ifdef INET6 594 /* we disallow IPv4 mapped address completely. */ 595 if (inp->inp_flags & INP_IPV6) 596 tp->pf = PF_INET6; 597 else 598 tp->pf = PF_INET; 599 #else 600 tp->pf = PF_INET; 601 #endif 602 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 603 so->so_linger = TCP_LINGERTIME; 604 605 if (tp && (so->so_options & SO_DEBUG)) 606 tcp_trace(TA_USER, 0, tp, (caddr_t)0, 0 /* XXX */, 0); 607 return (0); 608 } 609 610 int 611 tcp_detach(struct socket *so) 612 { 613 struct inpcb *inp; 614 struct tcpcb *tp = NULL; 615 int error = 0; 616 short ostate; 617 618 soassertlocked(so); 619 620 inp = sotoinpcb(so); 621 /* 622 * When a TCP is attached to a socket, then there will be 623 * a (struct inpcb) pointed at by the socket, and this 624 * structure will point at a subsidiary (struct tcpcb). 625 */ 626 if (inp == NULL) { 627 error = so->so_error; 628 if (error == 0) 629 error = EINVAL; 630 return (error); 631 } 632 tp = intotcpcb(inp); 633 /* tp might get 0 when using socket splicing */ 634 if (tp == NULL) 635 return (0); 636 #ifdef KPROF 637 tcp_acounts[tp->t_state][req]++; 638 #endif 639 ostate = tp->t_state; 640 641 /* 642 * Detach the TCP protocol from the socket. 643 * If the protocol state is non-embryonic, then can't 644 * do this directly: have to initiate a PRU_DISCONNECT, 645 * which may finish later; embryonic TCB's can just 646 * be discarded here. 647 */ 648 tp = tcp_disconnect(tp); 649 650 if (tp && (so->so_options & SO_DEBUG)) 651 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, PRU_DETACH, 0); 652 return (error); 653 } 654 655 /* 656 * Initiate (or continue) disconnect. 657 * If embryonic state, just send reset (once). 658 * If in ``let data drain'' option and linger null, just drop. 659 * Otherwise (hard), mark socket disconnecting and drop 660 * current input data; switch states based on user close, and 661 * send segment to peer (with FIN). 662 */ 663 struct tcpcb * 664 tcp_disconnect(struct tcpcb *tp) 665 { 666 struct socket *so = tp->t_inpcb->inp_socket; 667 668 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 669 tp = tcp_close(tp); 670 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 671 tp = tcp_drop(tp, 0); 672 else { 673 soisdisconnecting(so); 674 sbflush(so, &so->so_rcv); 675 tp = tcp_usrclosed(tp); 676 if (tp) 677 (void) tcp_output(tp); 678 } 679 return (tp); 680 } 681 682 /* 683 * User issued close, and wish to trail through shutdown states: 684 * if never received SYN, just forget it. If got a SYN from peer, 685 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 686 * If already got a FIN from peer, then almost done; go to LAST_ACK 687 * state. In all other cases, have already sent FIN to peer (e.g. 688 * after PRU_SHUTDOWN), and just have to play tedious game waiting 689 * for peer to send FIN or not respond to keep-alives, etc. 690 * We can let the user exit from the close as soon as the FIN is acked. 691 */ 692 struct tcpcb * 693 tcp_usrclosed(struct tcpcb *tp) 694 { 695 696 switch (tp->t_state) { 697 698 case TCPS_CLOSED: 699 case TCPS_LISTEN: 700 case TCPS_SYN_SENT: 701 tp->t_state = TCPS_CLOSED; 702 tp = tcp_close(tp); 703 break; 704 705 case TCPS_SYN_RECEIVED: 706 case TCPS_ESTABLISHED: 707 tp->t_state = TCPS_FIN_WAIT_1; 708 break; 709 710 case TCPS_CLOSE_WAIT: 711 tp->t_state = TCPS_LAST_ACK; 712 break; 713 } 714 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 715 soisdisconnected(tp->t_inpcb->inp_socket); 716 /* 717 * If we are in FIN_WAIT_2, we arrived here because the 718 * application did a shutdown of the send side. Like the 719 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 720 * a full close, we start a timer to make sure sockets are 721 * not left in FIN_WAIT_2 forever. 722 */ 723 if (tp->t_state == TCPS_FIN_WAIT_2) 724 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 725 } 726 return (tp); 727 } 728 729 /* 730 * Look up a socket for ident or tcpdrop, ... 731 */ 732 int 733 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 734 { 735 int error = 0; 736 struct tcp_ident_mapping tir; 737 struct inpcb *inp; 738 struct tcpcb *tp = NULL; 739 struct sockaddr_in *fin, *lin; 740 #ifdef INET6 741 struct sockaddr_in6 *fin6, *lin6; 742 struct in6_addr f6, l6; 743 #endif 744 745 NET_ASSERT_LOCKED(); 746 747 if (dodrop) { 748 if (oldp != NULL || *oldlenp != 0) 749 return (EINVAL); 750 if (newp == NULL) 751 return (EPERM); 752 if (newlen < sizeof(tir)) 753 return (ENOMEM); 754 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 755 return (error); 756 } else { 757 if (oldp == NULL) 758 return (EINVAL); 759 if (*oldlenp < sizeof(tir)) 760 return (ENOMEM); 761 if (newp != NULL || newlen != 0) 762 return (EINVAL); 763 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 764 return (error); 765 } 766 switch (tir.faddr.ss_family) { 767 #ifdef INET6 768 case AF_INET6: 769 fin6 = (struct sockaddr_in6 *)&tir.faddr; 770 error = in6_embedscope(&f6, fin6, NULL); 771 if (error) 772 return EINVAL; /*?*/ 773 lin6 = (struct sockaddr_in6 *)&tir.laddr; 774 error = in6_embedscope(&l6, lin6, NULL); 775 if (error) 776 return EINVAL; /*?*/ 777 break; 778 #endif 779 case AF_INET: 780 fin = (struct sockaddr_in *)&tir.faddr; 781 lin = (struct sockaddr_in *)&tir.laddr; 782 break; 783 default: 784 return (EINVAL); 785 } 786 787 switch (tir.faddr.ss_family) { 788 #ifdef INET6 789 case AF_INET6: 790 inp = in6_pcbhashlookup(&tcbtable, &f6, 791 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 792 break; 793 #endif 794 case AF_INET: 795 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 796 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 797 break; 798 default: 799 unhandled_af(tir.faddr.ss_family); 800 } 801 802 if (dodrop) { 803 if (inp && (tp = intotcpcb(inp)) && 804 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 805 tp = tcp_drop(tp, ECONNABORTED); 806 else 807 error = ESRCH; 808 return (error); 809 } 810 811 if (inp == NULL) { 812 tcpstat_inc(tcps_pcbhashmiss); 813 switch (tir.faddr.ss_family) { 814 #ifdef INET6 815 case AF_INET6: 816 inp = in6_pcblookup_listen(&tcbtable, 817 &l6, lin6->sin6_port, NULL, tir.rdomain); 818 break; 819 #endif 820 case AF_INET: 821 inp = in_pcblookup_listen(&tcbtable, 822 lin->sin_addr, lin->sin_port, NULL, tir.rdomain); 823 break; 824 } 825 } 826 827 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 828 tir.ruid = inp->inp_socket->so_ruid; 829 tir.euid = inp->inp_socket->so_euid; 830 } else { 831 tir.ruid = -1; 832 tir.euid = -1; 833 } 834 835 *oldlenp = sizeof (tir); 836 error = copyout((void *)&tir, oldp, sizeof (tir)); 837 return (error); 838 } 839 840 int 841 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 842 { 843 uint64_t counters[tcps_ncounters]; 844 struct tcpstat tcpstat; 845 struct syn_cache_set *set; 846 int i = 0; 847 848 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 849 850 memset(&tcpstat, 0, sizeof tcpstat); 851 counters_read(tcpcounters, counters, nitems(counters)); 852 ASSIGN(tcps_connattempt); 853 ASSIGN(tcps_accepts); 854 ASSIGN(tcps_connects); 855 ASSIGN(tcps_drops); 856 ASSIGN(tcps_conndrops); 857 ASSIGN(tcps_closed); 858 ASSIGN(tcps_segstimed); 859 ASSIGN(tcps_rttupdated); 860 ASSIGN(tcps_delack); 861 ASSIGN(tcps_timeoutdrop); 862 ASSIGN(tcps_rexmttimeo); 863 ASSIGN(tcps_persisttimeo); 864 ASSIGN(tcps_persistdrop); 865 ASSIGN(tcps_keeptimeo); 866 ASSIGN(tcps_keepprobe); 867 ASSIGN(tcps_keepdrops); 868 ASSIGN(tcps_sndtotal); 869 ASSIGN(tcps_sndpack); 870 ASSIGN(tcps_sndbyte); 871 ASSIGN(tcps_sndrexmitpack); 872 ASSIGN(tcps_sndrexmitbyte); 873 ASSIGN(tcps_sndrexmitfast); 874 ASSIGN(tcps_sndacks); 875 ASSIGN(tcps_sndprobe); 876 ASSIGN(tcps_sndurg); 877 ASSIGN(tcps_sndwinup); 878 ASSIGN(tcps_sndctrl); 879 ASSIGN(tcps_rcvtotal); 880 ASSIGN(tcps_rcvpack); 881 ASSIGN(tcps_rcvbyte); 882 ASSIGN(tcps_rcvbadsum); 883 ASSIGN(tcps_rcvbadoff); 884 ASSIGN(tcps_rcvmemdrop); 885 ASSIGN(tcps_rcvnosec); 886 ASSIGN(tcps_rcvshort); 887 ASSIGN(tcps_rcvduppack); 888 ASSIGN(tcps_rcvdupbyte); 889 ASSIGN(tcps_rcvpartduppack); 890 ASSIGN(tcps_rcvpartdupbyte); 891 ASSIGN(tcps_rcvoopack); 892 ASSIGN(tcps_rcvoobyte); 893 ASSIGN(tcps_rcvpackafterwin); 894 ASSIGN(tcps_rcvbyteafterwin); 895 ASSIGN(tcps_rcvafterclose); 896 ASSIGN(tcps_rcvwinprobe); 897 ASSIGN(tcps_rcvdupack); 898 ASSIGN(tcps_rcvacktoomuch); 899 ASSIGN(tcps_rcvacktooold); 900 ASSIGN(tcps_rcvackpack); 901 ASSIGN(tcps_rcvackbyte); 902 ASSIGN(tcps_rcvwinupd); 903 ASSIGN(tcps_pawsdrop); 904 ASSIGN(tcps_predack); 905 ASSIGN(tcps_preddat); 906 ASSIGN(tcps_pcbhashmiss); 907 ASSIGN(tcps_noport); 908 ASSIGN(tcps_badsyn); 909 ASSIGN(tcps_dropsyn); 910 ASSIGN(tcps_rcvbadsig); 911 ASSIGN(tcps_rcvgoodsig); 912 ASSIGN(tcps_inswcsum); 913 ASSIGN(tcps_outswcsum); 914 ASSIGN(tcps_ecn_accepts); 915 ASSIGN(tcps_ecn_rcvece); 916 ASSIGN(tcps_ecn_rcvcwr); 917 ASSIGN(tcps_ecn_rcvce); 918 ASSIGN(tcps_ecn_sndect); 919 ASSIGN(tcps_ecn_sndece); 920 ASSIGN(tcps_ecn_sndcwr); 921 ASSIGN(tcps_cwr_ecn); 922 ASSIGN(tcps_cwr_frecovery); 923 ASSIGN(tcps_cwr_timeout); 924 ASSIGN(tcps_sc_added); 925 ASSIGN(tcps_sc_completed); 926 ASSIGN(tcps_sc_timed_out); 927 ASSIGN(tcps_sc_overflowed); 928 ASSIGN(tcps_sc_reset); 929 ASSIGN(tcps_sc_unreach); 930 ASSIGN(tcps_sc_bucketoverflow); 931 ASSIGN(tcps_sc_aborted); 932 ASSIGN(tcps_sc_dupesyn); 933 ASSIGN(tcps_sc_dropped); 934 ASSIGN(tcps_sc_collisions); 935 ASSIGN(tcps_sc_retransmitted); 936 ASSIGN(tcps_sc_seedrandom); 937 ASSIGN(tcps_sc_hash_size); 938 ASSIGN(tcps_sc_entry_count); 939 ASSIGN(tcps_sc_entry_limit); 940 ASSIGN(tcps_sc_bucket_maxlen); 941 ASSIGN(tcps_sc_bucket_limit); 942 ASSIGN(tcps_sc_uses_left); 943 ASSIGN(tcps_conndrained); 944 ASSIGN(tcps_sack_recovery_episode); 945 ASSIGN(tcps_sack_rexmits); 946 ASSIGN(tcps_sack_rexmit_bytes); 947 ASSIGN(tcps_sack_rcv_opts); 948 ASSIGN(tcps_sack_snd_opts); 949 950 #undef ASSIGN 951 952 set = &tcp_syn_cache[tcp_syn_cache_active]; 953 tcpstat.tcps_sc_hash_size = set->scs_size; 954 tcpstat.tcps_sc_entry_count = set->scs_count; 955 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 956 tcpstat.tcps_sc_bucket_maxlen = 0; 957 for (i = 0; i < set->scs_size; i++) { 958 if (tcpstat.tcps_sc_bucket_maxlen < 959 set->scs_buckethead[i].sch_length) 960 tcpstat.tcps_sc_bucket_maxlen = 961 set->scs_buckethead[i].sch_length; 962 } 963 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 964 tcpstat.tcps_sc_uses_left = set->scs_use; 965 966 return (sysctl_rdstruct(oldp, oldlenp, newp, 967 &tcpstat, sizeof(tcpstat))); 968 } 969 970 /* 971 * Sysctl for tcp variables. 972 */ 973 int 974 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 975 size_t newlen) 976 { 977 int error, nval; 978 979 /* All sysctl names at this level are terminal. */ 980 if (namelen != 1) 981 return (ENOTDIR); 982 983 switch (name[0]) { 984 case TCPCTL_SACK: 985 NET_LOCK(); 986 error = sysctl_int(oldp, oldlenp, newp, newlen, 987 &tcp_do_sack); 988 NET_UNLOCK(); 989 return (error); 990 991 case TCPCTL_SLOWHZ: 992 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 993 994 case TCPCTL_BADDYNAMIC: 995 NET_LOCK(); 996 error = sysctl_struct(oldp, oldlenp, newp, newlen, 997 baddynamicports.tcp, sizeof(baddynamicports.tcp)); 998 NET_UNLOCK(); 999 return (error); 1000 1001 case TCPCTL_ROOTONLY: 1002 if (newp && securelevel > 0) 1003 return (EPERM); 1004 NET_LOCK(); 1005 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1006 rootonlyports.tcp, sizeof(rootonlyports.tcp)); 1007 NET_UNLOCK(); 1008 return (error); 1009 1010 case TCPCTL_IDENT: 1011 NET_LOCK(); 1012 error = tcp_ident(oldp, oldlenp, newp, newlen, 0); 1013 NET_UNLOCK(); 1014 return (error); 1015 1016 case TCPCTL_DROP: 1017 NET_LOCK(); 1018 error = tcp_ident(oldp, oldlenp, newp, newlen, 1); 1019 NET_UNLOCK(); 1020 return (error); 1021 1022 case TCPCTL_ALWAYS_KEEPALIVE: 1023 NET_LOCK(); 1024 error = sysctl_int(oldp, oldlenp, newp, newlen, 1025 &tcp_always_keepalive); 1026 NET_UNLOCK(); 1027 return (error); 1028 1029 #ifdef TCP_ECN 1030 case TCPCTL_ECN: 1031 NET_LOCK(); 1032 error = sysctl_int(oldp, oldlenp, newp, newlen, 1033 &tcp_do_ecn); 1034 NET_UNLOCK(); 1035 return (error); 1036 #endif 1037 case TCPCTL_REASS_LIMIT: 1038 NET_LOCK(); 1039 nval = tcp_reass_limit; 1040 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1041 if (!error && nval != tcp_reass_limit) { 1042 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1043 if (!error) 1044 tcp_reass_limit = nval; 1045 } 1046 NET_UNLOCK(); 1047 return (error); 1048 1049 case TCPCTL_SACKHOLE_LIMIT: 1050 NET_LOCK(); 1051 nval = tcp_sackhole_limit; 1052 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1053 if (!error && nval != tcp_sackhole_limit) { 1054 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1055 if (!error) 1056 tcp_sackhole_limit = nval; 1057 } 1058 NET_UNLOCK(); 1059 return (error); 1060 1061 case TCPCTL_STATS: 1062 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1063 1064 case TCPCTL_SYN_USE_LIMIT: 1065 NET_LOCK(); 1066 error = sysctl_int(oldp, oldlenp, newp, newlen, 1067 &tcp_syn_use_limit); 1068 if (!error && newp != NULL) { 1069 /* 1070 * Global tcp_syn_use_limit is used when reseeding a 1071 * new cache. Also update the value in active cache. 1072 */ 1073 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1074 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1075 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1076 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1077 } 1078 NET_UNLOCK(); 1079 return (error); 1080 1081 case TCPCTL_SYN_HASH_SIZE: 1082 NET_LOCK(); 1083 nval = tcp_syn_hash_size; 1084 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1085 if (!error && nval != tcp_syn_hash_size) { 1086 if (nval < 1 || nval > 100000) { 1087 error = EINVAL; 1088 } else { 1089 /* 1090 * If global hash size has been changed, 1091 * switch sets as soon as possible. Then 1092 * the actual hash array will be reallocated. 1093 */ 1094 if (tcp_syn_cache[0].scs_size != nval) 1095 tcp_syn_cache[0].scs_use = 0; 1096 if (tcp_syn_cache[1].scs_size != nval) 1097 tcp_syn_cache[1].scs_use = 0; 1098 tcp_syn_hash_size = nval; 1099 } 1100 } 1101 NET_UNLOCK(); 1102 return (error); 1103 1104 default: 1105 if (name[0] < TCPCTL_MAXID) { 1106 NET_LOCK(); 1107 error = sysctl_int_arr(tcpctl_vars, name, namelen, 1108 oldp, oldlenp, newp, newlen); 1109 NET_UNLOCK(); 1110 return (error); 1111 } 1112 return (ENOPROTOOPT); 1113 } 1114 /* NOTREACHED */ 1115 } 1116 1117 /* 1118 * Scale the send buffer so that inflight data is not accounted against 1119 * the limit. The buffer will scale with the congestion window, if the 1120 * the receiver stops acking data the window will shrink and therefor 1121 * the buffer size will shrink as well. 1122 * In low memory situation try to shrink the buffer to the initial size 1123 * disabling the send buffer scaling as long as the situation persists. 1124 */ 1125 void 1126 tcp_update_sndspace(struct tcpcb *tp) 1127 { 1128 struct socket *so = tp->t_inpcb->inp_socket; 1129 u_long nmax = so->so_snd.sb_hiwat; 1130 1131 if (sbchecklowmem()) { 1132 /* low on memory try to get rid of some */ 1133 if (tcp_sendspace < nmax) 1134 nmax = tcp_sendspace; 1135 } else if (so->so_snd.sb_wat != tcp_sendspace) 1136 /* user requested buffer size, auto-scaling disabled */ 1137 nmax = so->so_snd.sb_wat; 1138 else 1139 /* automatic buffer scaling */ 1140 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1141 tp->snd_una); 1142 1143 /* a writable socket must be preserved because of poll(2) semantics */ 1144 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1145 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1146 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1147 if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1148 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2; 1149 } 1150 1151 /* round to MSS boundary */ 1152 nmax = roundup(nmax, tp->t_maxseg); 1153 1154 if (nmax != so->so_snd.sb_hiwat) 1155 sbreserve(so, &so->so_snd, nmax); 1156 } 1157 1158 /* 1159 * Scale the recv buffer by looking at how much data was transferred in 1160 * on approximated RTT. If more than a big part of the recv buffer was 1161 * transferred during that time we increase the buffer by a constant. 1162 * In low memory situation try to shrink the buffer to the initial size. 1163 */ 1164 void 1165 tcp_update_rcvspace(struct tcpcb *tp) 1166 { 1167 struct socket *so = tp->t_inpcb->inp_socket; 1168 u_long nmax = so->so_rcv.sb_hiwat; 1169 1170 if (sbchecklowmem()) { 1171 /* low on memory try to get rid of some */ 1172 if (tcp_recvspace < nmax) 1173 nmax = tcp_recvspace; 1174 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1175 /* user requested buffer size, auto-scaling disabled */ 1176 nmax = so->so_rcv.sb_wat; 1177 else { 1178 /* automatic buffer scaling */ 1179 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1180 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1181 tcp_autorcvbuf_inc); 1182 } 1183 1184 /* a readable socket must be preserved because of poll(2) semantics */ 1185 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1186 nmax < so->so_snd.sb_lowat) 1187 nmax = so->so_snd.sb_lowat; 1188 1189 if (nmax == so->so_rcv.sb_hiwat) 1190 return; 1191 1192 /* round to MSS boundary */ 1193 nmax = roundup(nmax, tp->t_maxseg); 1194 sbreserve(so, &so->so_rcv, nmax); 1195 } 1196