/*	$OpenBSD: tcp_usrreq.c,v 1.136 2016/11/21 09:09:06 mpi Exp $	*/
/*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;

int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS;

struct	inpcbtable tcbtable;

int tcp_ident(void *, size_t *, void *, size_t, int);
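
/*
 * Illustrative userland sketch (not part of the original sources; kept in
 * a comment so it does not affect the kernel build, and the helper name is
 * invented): roughly how the familiar socket system calls map onto the
 * PRU_* requests that tcp_usrreq() below dispatches on.  The mapping
 * annotations are approximate and error handling is omitted.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	tcp_server_sketch(void)
 *	{
 *		struct sockaddr_in sin;
 *		int s, c;
 *
 *		s = socket(AF_INET, SOCK_STREAM, 0);	// PRU_ATTACH
 *		memset(&sin, 0, sizeof(sin));
 *		sin.sin_family = AF_INET;
 *		sin.sin_port = htons(12345);
 *		sin.sin_addr.s_addr = htonl(INADDR_ANY);
 *		bind(s, (struct sockaddr *)&sin, sizeof(sin));	// PRU_BIND
 *		listen(s, 5);				// PRU_LISTEN
 *		c = accept(s, NULL, NULL);		// PRU_ACCEPT
 *		write(c, "hi\n", 3);			// PRU_SEND
 *		shutdown(c, SHUT_WR);			// PRU_SHUTDOWN
 *		close(c);				// PRU_DETACH
 *		close(s);
 *		return (0);
 *	}
 */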

/*
 * Process a TCP user request for TCP tb.  If this is a send request
 * then m is the mbuf chain of send data.  If this is a timer expiration
 * (called from the software clock routine), then timertype tells which timer.
 */
/*ARGSUSED*/
int
tcp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control, struct proc *p)
{
	struct sockaddr_in *sin;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int error = 0;
	short ostate;

	splsoftassert(IPL_SOFTNET);

	if (req == PRU_CONTROL) {
#ifdef INET6
		if (sotopf(so) == PF_INET6)
			return in6_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control);
		else
#endif /* INET6 */
			return (in_control(so, (u_long)m, (caddr_t)nam,
			    (struct ifnet *)control));
	}
	if (control && control->m_len) {
		m_freem(control);
		m_freem(m);
		return (EINVAL);
	}

	inp = sotoinpcb(so);
	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if (inp == NULL && req != PRU_ATTACH) {
		error = so->so_error;
		if (error == 0)
			error = EINVAL;
		/*
		 * The following corrects an mbuf leak under rare
		 * circumstances
		 */
		if (req == PRU_SEND || req == PRU_SENDOOB)
			m_freem(m);
		return (error);
	}
	if (inp) {
		tp = intotcpcb(inp);
		/* tp might get 0 when using socket splicing */
		if (tp == NULL) {
			return (0);
		}
#ifdef KPROF
		tcp_acounts[tp->t_state][req]++;
#endif
		ostate = tp->t_state;
	} else
		ostate = 0;
	switch (req) {

	/*
	 * TCP attaches to socket via PRU_ATTACH, reserving space,
	 * and an internet control block.
	 */
	case PRU_ATTACH:
		if (inp) {
			error = EISCONN;
			break;
		}
		error = tcp_attach(so);
		if (error)
			break;
		if ((so->so_options & SO_LINGER) && so->so_linger == 0)
			so->so_linger = TCP_LINGERTIME;
		tp = sototcpcb(so);
		break;

	/*
	 * PRU_DETACH detaches the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	case PRU_DETACH:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Give the socket an address.
	 */
	case PRU_BIND:
		error = in_pcbbind(inp, nam, p);
		break;

	/*
	 * Prepare to accept connections.
	 */
	case PRU_LISTEN:
		if (inp->inp_lport == 0)
			error = in_pcbbind(inp, NULL, p);
		/* If the in_pcbbind() above is called, the tp->pf
		   should still be whatever it was before. */
		if (error == 0)
			tp->t_state = TCPS_LISTEN;
		break;

	/*
	 * Initiate connection to peer.
	 * Create a template for use in transmissions on this connection.
	 * Enter SYN_SENT state, and mark socket as connecting.
	 * Start keep-alive timer, and seed output sequence space.
	 * Send initial segment on connection.
	 */
	case PRU_CONNECT:
		sin = mtod(nam, struct sockaddr_in *);

#ifdef INET6
		if (sin->sin_family == AF_INET6) {
			struct in6_addr *in6_addr = &mtod(nam,
			    struct sockaddr_in6 *)->sin6_addr;

			if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) ||
			    IN6_IS_ADDR_MULTICAST(in6_addr) ||
			    IN6_IS_ADDR_V4MAPPED(in6_addr)) {
				error = EINVAL;
				break;
			}

			error = in6_pcbconnect(inp, nam);
		} else if (sin->sin_family == AF_INET)
#endif /* INET6 */
		{
			if ((sin->sin_addr.s_addr == INADDR_ANY) ||
			    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
			    IN_MULTICAST(sin->sin_addr.s_addr) ||
			    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
				error = EINVAL;
				break;
			}

			error = in_pcbconnect(inp, nam);
		}

		if (error)
			break;

		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			in_pcbdisconnect(inp);
			error = ENOBUFS;
			break;
		}

		so->so_state |= SS_CONNECTOUT;

		/* Compute window scaling to request.  */
		tcp_rscale(tp, sb_max);

		soisconnecting(so);
		tcpstat.tcps_connattempt++;
		tp->t_state = TCPS_SYN_SENT;
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
		tcp_set_iss_tsm(tp);
		tcp_sendseqinit(tp);
#if defined(TCP_SACK)
		tp->snd_last = tp->snd_una;
#endif
#if defined(TCP_SACK) && defined(TCP_FACK)
		tp->snd_fack = tp->snd_una;
		tp->retran_data = 0;
		tp->snd_awnd = 0;
#endif
		error = tcp_output(tp);
		break;
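
	/*
	 * Illustrative userland sketch (not part of the original sources;
	 * kept in a comment so the kernel build is unaffected, and the
	 * helper name is invented): a non-blocking connect(2) exercises
	 * the PRU_CONNECT case above.  The call normally fails with
	 * EINPROGRESS while this code sends the SYN and leaves the socket
	 * in SYN_SENT; completion is reported as writability and the final
	 * status is read back with SO_ERROR.  Error handling is trimmed.
	 *
	 *	#include <sys/socket.h>
	 *	#include <netinet/in.h>
	 *	#include <arpa/inet.h>
	 *	#include <fcntl.h>
	 *	#include <poll.h>
	 *	#include <string.h>
	 *
	 *	int
	 *	nonblocking_connect_sketch(const char *ip, int port)
	 *	{
	 *		struct sockaddr_in sin;
	 *		struct pollfd pfd;
	 *		int s, err;
	 *		socklen_t len = sizeof(err);
	 *
	 *		s = socket(AF_INET, SOCK_STREAM, 0);
	 *		fcntl(s, F_SETFL, O_NONBLOCK);
	 *		memset(&sin, 0, sizeof(sin));
	 *		sin.sin_family = AF_INET;
	 *		sin.sin_port = htons(port);
	 *		inet_pton(AF_INET, ip, &sin.sin_addr);
	 *		// Usually returns -1 with errno == EINPROGRESS.
	 *		connect(s, (struct sockaddr *)&sin, sizeof(sin));
	 *		pfd.fd = s;
	 *		pfd.events = POLLOUT;
	 *		poll(&pfd, 1, -1);	// writable once connected
	 *		getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);
	 *		return (err);		// 0 on success
	 *	}
	 */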

	/*
	 * Create a TCP connection between two sockets.
	 */
	case PRU_CONNECT2:
		error = EOPNOTSUPP;
		break;

	/*
	 * Initiate disconnect from peer.
	 * If connection never passed embryonic stage, just drop;
	 * else if don't need to let data drain, then can just drop anyways,
	 * else have to begin TCP shutdown process: mark socket disconnecting,
	 * drain unread data, state switch to reflect user close, and
	 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
	 * when peer sends FIN and acks ours.
	 *
	 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
	 */
	case PRU_DISCONNECT:
		tp = tcp_disconnect(tp);
		break;

	/*
	 * Accept a connection.  Essentially all the work is
	 * done at higher levels; just return the address
	 * of the peer, storing through addr.
	 */
	case PRU_ACCEPT:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	/*
	 * Mark the connection as being incapable of further output.
	 */
	case PRU_SHUTDOWN:
		if (so->so_state & SS_CANTSENDMORE)
			break;
		socantsendmore(so);
		tp = tcp_usrclosed(tp);
		if (tp)
			error = tcp_output(tp);
		break;

	/*
	 * After a receive, possibly send window update to peer.
	 */
	case PRU_RCVD:
		/*
		 * soreceive() calls this function when a user receives
		 * ancillary data on a listening socket.  We don't call
		 * tcp_output in such a case, since there is no header
		 * template for a listening socket and hence the kernel
		 * will panic.
		 */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
			(void) tcp_output(tp);
		break;

	/*
	 * Do a send by putting data in output queue and updating urgent
	 * marker if URG set.  Possibly send more data.
	 */
	case PRU_SEND:
		sbappendstream(&so->so_snd, m);
		error = tcp_output(tp);
		break;

	/*
	 * Abort the TCP.
	 */
	case PRU_ABORT:
		tp = tcp_drop(tp, ECONNABORTED);
		break;

	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		return (0);

	case PRU_RCVOOB:
		if ((so->so_oobmark == 0 &&
		    (so->so_state & SS_RCVATMARK) == 0) ||
		    so->so_options & SO_OOBINLINE ||
		    tp->t_oobflags & TCPOOB_HADDATA) {
			error = EINVAL;
			break;
		}
		if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
			error = EWOULDBLOCK;
			break;
		}
		m->m_len = 1;
		*mtod(m, caddr_t) = tp->t_iobc;
		if (((long)nam & MSG_PEEK) == 0)
			tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		break;
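
	/*
	 * Illustrative userland sketch (not part of the original sources;
	 * kept in a comment, helper name invented): TCP urgent
	 * ("out-of-band") data as seen from user space.  MSG_OOB on the
	 * sender reaches PRU_SENDOOB below; MSG_OOB on the receiver
	 * reaches PRU_RCVOOB above, which fails with EINVAL when
	 * SO_OOBINLINE is set, since the urgent byte is then delivered in
	 * the normal data stream instead.
	 *
	 *	#include <sys/socket.h>
	 *
	 *	void
	 *	oob_sketch(int sender, int receiver)
	 *	{
	 *		char c;
	 *
	 *		// One byte of urgent data; snd_up is advanced below
	 *		// and tcp_output() marks the segment URG.
	 *		send(sender, "!", 1, MSG_OOB);
	 *
	 *		// Fetch it out-of-band once it has arrived
	 *		// (TCPOOB_HAVEDATA set in the handler above).
	 *		recv(receiver, &c, 1, MSG_OOB);
	 *	}
	 */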

	case PRU_SENDOOB:
		if (sbspace(&so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			break;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream(&so->so_snd, m);
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
		break;

	case PRU_SOCKADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setsockaddr(inp, nam);
		else
#endif
			in_setsockaddr(inp, nam);
		break;

	case PRU_PEERADDR:
#ifdef INET6
		if (inp->inp_flags & INP_IPV6)
			in6_setpeeraddr(inp, nam);
		else
#endif
			in_setpeeraddr(inp, nam);
		break;

	default:
		panic("tcp_usrreq");
	}
	if (tp && (so->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0);
	return (error);
}

int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf **mp)
{
	int error = 0, s;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf *m;
	int i;

	s = splsoftnet();
	inp = sotoinpcb(so);
	if (inp == NULL) {
		splx(s);
		if (op == PRCO_SETOPT)
			(void) m_free(*mp);
		return (ECONNRESET);
	}
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, mp);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, mp);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		splx(s);
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		m = *mp;
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

#ifdef TCP_SACK
		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
#ifdef TCP_SACK
				tp->sack_enable = 0;
#endif /* TCP_SACK */
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (m)
			(void) m_free(m);
		break;
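
	/*
	 * Illustrative userland sketch (not part of the original sources;
	 * kept in a comment, helper name invented): the PRCO_SETOPT case
	 * above and the PRCO_GETOPT case below are what setsockopt(2) and
	 * getsockopt(2) at level IPPROTO_TCP ultimately invoke.
	 *
	 *	#include <sys/socket.h>
	 *	#include <netinet/in.h>
	 *	#include <netinet/tcp.h>
	 *
	 *	int
	 *	set_nodelay_sketch(int s)
	 *	{
	 *		int on = 1, cur;
	 *		socklen_t len = sizeof(cur);
	 *
	 *		// Sets TF_NODELAY through the TCP_NODELAY case above.
	 *		if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY,
	 *		    &on, sizeof(on)) == -1)
	 *			return (-1);
	 *		// Reads the flag back through PRCO_GETOPT below.
	 *		return (getsockopt(s, IPPROTO_TCP, TCP_NODELAY,
	 *		    &cur, &len));
	 *	}
	 */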

	case PRCO_GETOPT:
		*mp = m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {
		case TCP_NODELAY:
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			*mtod(m, int *) = tp->t_maxseg;
			break;
#ifdef TCP_SACK
		case TCP_SACK_ENABLE:
			*mtod(m, int *) = tp->sack_enable;
			break;
#endif
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	splx(s);
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state if to accept connections.
 */
int
tcp_attach(struct socket *so)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	return (0);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_disconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(&so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}
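
/*
 * Illustrative userland sketch (not part of the original sources; kept in
 * a comment, helper name invented): how close(2) behaviour maps onto
 * tcp_disconnect() above.  With SO_LINGER enabled and a zero linger time,
 * an established connection takes the tcp_drop() branch, which results in
 * a RST instead of the orderly FIN shutdown driven by tcp_usrclosed()
 * below.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	void
 *	abortive_close_sketch(int s)
 *	{
 *		struct linger l;
 *
 *		l.l_onoff = 1;		// SO_LINGER on ...
 *		l.l_linger = 0;		// ... with a zero timeout
 *		setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *		close(s);	// PRU_DETACH -> tcp_disconnect() -> tcp_drop()
 *	}
 */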

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0, s;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0)
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0)
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	s = splsoftnet();
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcbhashlookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcbhashlookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		splx(s);
		return (error);
	}

	if (inp == NULL) {
		++tcpstat.tcps_pcbhashmiss;
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, 0, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}
	splx(s);

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	return (error);
}
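
/*
 * Illustrative userland sketch (not part of the original sources; kept in
 * a comment): how an ident-style lookup reaches tcp_ident() above through
 * sysctl(3).  The caller passes a struct tcp_ident_mapping with the
 * foreign and local addresses (and routing domain) filled in via oldp;
 * the kernel fills in ruid/euid (or -1) and copies the structure back.
 * The exact set of headers needed for the structure definition is not
 * shown here.
 *
 *	int mib[] = { CTL_NET, PF_INET, IPPROTO_TCP, TCPCTL_IDENT };
 *	struct tcp_ident_mapping tir;
 *	size_t len = sizeof(tir);
 *
 *	memset(&tir, 0, sizeof(tir));
 *	// fill tir.faddr/tir.laddr with the peer and local sockaddr,
 *	// and tir.rdomain with the routing domain of interest
 *	if (sysctl(mib, 4, &tir, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 *	printf("effective uid: %d\n", (int)tir.euid);
 *
 * TCPCTL_DROP works the other way around: the filled-in structure is
 * passed via newp and the matching connection is dropped, which is what
 * tcpdrop(8) relies on.
 */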

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
#ifdef TCP_SACK
	case TCPCTL_SACK:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_sack));
#endif
	case TCPCTL_SLOWHZ:
		return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ));

	case TCPCTL_BADDYNAMIC:
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp)));

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp)));

	case TCPCTL_IDENT:
		return (tcp_ident(oldp, oldlenp, newp, newlen, 0));

	case TCPCTL_DROP:
		return (tcp_ident(oldp, oldlenp, newp, newlen, 1));

	case TCPCTL_ALWAYS_KEEPALIVE:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_always_keepalive));

#ifdef TCP_ECN
	case TCPCTL_ECN:
		return (sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_do_ecn));
#endif
	case TCPCTL_REASS_LIMIT:
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (error)
			return (error);
		if (nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (error)
				return (error);
			tcp_reass_limit = nval;
		}
		return (0);
#ifdef TCP_SACK
	case TCPCTL_SACKHOLE_LIMIT:
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (error)
			return (error);
		if (nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (error)
				return (error);
			tcp_sackhole_limit = nval;
		}
		return (0);
#endif

	case TCPCTL_STATS:
		if (newp != NULL)
			return (EPERM);
		{
			struct syn_cache_set *set;
			int i;

			set = &tcp_syn_cache[tcp_syn_cache_active];
			tcpstat.tcps_sc_hash_size = set->scs_size;
			tcpstat.tcps_sc_entry_count = set->scs_count;
			tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
			tcpstat.tcps_sc_bucket_maxlen = 0;
			for (i = 0; i < set->scs_size; i++) {
				if (tcpstat.tcps_sc_bucket_maxlen <
				    set->scs_buckethead[i].sch_length)
					tcpstat.tcps_sc_bucket_maxlen =
					    set->scs_buckethead[i].sch_length;
			}
			tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
			tcpstat.tcps_sc_uses_left = set->scs_use;
		}
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    &tcpstat, sizeof(tcpstat)));

	case TCPCTL_SYN_USE_LIMIT:
		error = sysctl_int(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit);
		if (error)
			return (error);
		if (newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		return (0);

	case TCPCTL_SYN_HASH_SIZE:
		nval = tcp_syn_hash_size;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (error)
			return (error);
		if (nval != tcp_syn_hash_size) {
			if (nval < 1 || nval > 100000)
				return (EINVAL);
			/*
			 * If the global hash size has been changed, switch
			 * sets as soon as possible.  Then the actual hash
			 * array will be reallocated.
			 */
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
		}
		return (0);

	default:
		if (name[0] < TCPCTL_MAXID)
			return (sysctl_int_arr(tcpctl_vars, name, namelen,
			    oldp, oldlenp, newp, newlen));
		return (ENOPROTOOPT);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window: if the
 * receiver stops acking data, the window will shrink and therefore the
 * buffer size will shrink as well.
 * In low memory situations, try to shrink the buffer to the initial size,
 * disabling send buffer scaling for as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(&so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		if (nmax * 2 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+1) / 2;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(&so->so_snd, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If a large part of the recv buffer was
 * transferred during that time, we increase the buffer by a constant.
 * In low memory situations, try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(&so->so_rcv, nmax);
}
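
/*
 * Illustrative userland sketch (not part of the original sources; kept in
 * a comment, helper name invented): the two functions above auto-scale
 * the socket buffers only while sb_wat still equals the global default
 * (tcp_sendspace/tcp_recvspace).  Explicitly sizing a buffer from user
 * space pins it, taking the "auto-scaling disabled" branches above.
 *
 *	#include <sys/socket.h>
 *
 *	void
 *	fixed_buffers_sketch(int s)
 *	{
 *		int sz = 64 * 1024;
 *
 *		// After these calls sb_wat differs from the defaults, so
 *		// tcp_update_sndspace()/tcp_update_rcvspace() leave the
 *		// buffer sizes alone.
 *		setsockopt(s, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
 *		setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
 *	}
 */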