1 /* $OpenBSD: tcp_usrreq.c,v 1.122 2014/11/25 15:35:10 mpi Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 83 #include <net/if.h> 84 #include <net/route.h> 85 86 #include <netinet/in.h> 87 #include <netinet/in_var.h> 88 #include <netinet/ip.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/ip_var.h> 91 #include <netinet/tcp.h> 92 #include <netinet/tcp_fsm.h> 93 #include <netinet/tcp_seq.h> 94 #include <netinet/tcp_timer.h> 95 #include <netinet/tcp_var.h> 96 #include <netinet/tcpip.h> 97 #include <netinet/tcp_debug.h> 98 99 #ifdef INET6 100 #include <netinet6/in6_var.h> 101 #endif 102 103 #ifndef TCP_SENDSPACE 104 #define TCP_SENDSPACE 1024*16 105 #endif 106 u_int tcp_sendspace = TCP_SENDSPACE; 107 #ifndef TCP_RECVSPACE 108 #define TCP_RECVSPACE 1024*16 109 #endif 110 u_int tcp_recvspace = TCP_RECVSPACE; 111 u_int tcp_autorcvbuf_inc = 16 * 1024; 112 113 int *tcpctl_vars[TCPCTL_MAXID] = TCPCTL_VARS; 114 115 struct inpcbtable tcbtable; 116 117 int tcp_ident(void *, size_t *, void *, size_t, int); 118 119 /* 120 * Process a TCP user request for TCP tb. If this is a send request 121 * then m is the mbuf chain of send data. If this is a timer expiration 122 * (called from the software clock routine), then timertype tells which timer. 123 */ 124 /*ARGSUSED*/ 125 int 126 tcp_usrreq(so, req, m, nam, control, p) 127 struct socket *so; 128 int req; 129 struct mbuf *m, *nam, *control; 130 struct proc *p; 131 { 132 struct sockaddr_in *sin; 133 struct inpcb *inp; 134 struct tcpcb *tp = NULL; 135 int s; 136 int error = 0; 137 short ostate; 138 139 if (req == PRU_CONTROL) { 140 #ifdef INET6 141 if (sotopf(so) == PF_INET6) 142 return in6_control(so, (u_long)m, (caddr_t)nam, 143 (struct ifnet *)control); 144 else 145 #endif /* INET6 */ 146 return (in_control(so, (u_long)m, (caddr_t)nam, 147 (struct ifnet *)control)); 148 } 149 if (control && control->m_len) { 150 m_freem(control); 151 if (m) 152 m_freem(m); 153 return (EINVAL); 154 } 155 156 s = splsoftnet(); 157 inp = sotoinpcb(so); 158 /* 159 * When a TCP is attached to a socket, then there will be 160 * a (struct inpcb) pointed at by the socket, and this 161 * structure will point at a subsidiary (struct tcpcb). 162 */ 163 if (inp == 0 && req != PRU_ATTACH) { 164 error = so->so_error; 165 if (error == 0) 166 error = EINVAL; 167 splx(s); 168 /* 169 * The following corrects an mbuf leak under rare 170 * circumstances 171 */ 172 if (m && (req == PRU_SEND || req == PRU_SENDOOB)) 173 m_freem(m); 174 return (error); 175 } 176 if (inp) { 177 tp = intotcpcb(inp); 178 /* tp might get 0 when using socket splicing */ 179 if (tp == NULL) { 180 splx(s); 181 return (0); 182 } 183 #ifdef KPROF 184 tcp_acounts[tp->t_state][req]++; 185 #endif 186 ostate = tp->t_state; 187 } else 188 ostate = 0; 189 switch (req) { 190 191 /* 192 * TCP attaches to socket via PRU_ATTACH, reserving space, 193 * and an internet control block. 194 */ 195 case PRU_ATTACH: 196 if (inp) { 197 error = EISCONN; 198 break; 199 } 200 error = tcp_attach(so); 201 if (error) 202 break; 203 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 204 so->so_linger = TCP_LINGERTIME; 205 tp = sototcpcb(so); 206 break; 207 208 /* 209 * PRU_DETACH detaches the TCP protocol from the socket. 210 * If the protocol state is non-embryonic, then can't 211 * do this directly: have to initiate a PRU_DISCONNECT, 212 * which may finish later; embryonic TCB's can just 213 * be discarded here. 214 */ 215 case PRU_DETACH: 216 tp = tcp_disconnect(tp); 217 break; 218 219 /* 220 * Give the socket an address. 221 */ 222 case PRU_BIND: 223 #ifdef INET6 224 if (inp->inp_flags & INP_IPV6) 225 error = in6_pcbbind(inp, nam, p); 226 else 227 #endif 228 error = in_pcbbind(inp, nam, p); 229 if (error) 230 break; 231 break; 232 233 /* 234 * Prepare to accept connections. 235 */ 236 case PRU_LISTEN: 237 if (inp->inp_lport == 0) { 238 #ifdef INET6 239 if (inp->inp_flags & INP_IPV6) 240 error = in6_pcbbind(inp, NULL, p); 241 else 242 #endif 243 error = in_pcbbind(inp, NULL, p); 244 } 245 /* If the in_pcbbind() above is called, the tp->pf 246 should still be whatever it was before. */ 247 if (error == 0) 248 tp->t_state = TCPS_LISTEN; 249 break; 250 251 /* 252 * Initiate connection to peer. 253 * Create a template for use in transmissions on this connection. 254 * Enter SYN_SENT state, and mark socket as connecting. 255 * Start keep-alive timer, and seed output sequence space. 256 * Send initial segment on connection. 257 */ 258 case PRU_CONNECT: 259 sin = mtod(nam, struct sockaddr_in *); 260 261 #ifdef INET6 262 if (sin->sin_family == AF_INET6) { 263 struct in6_addr *in6_addr = &mtod(nam, 264 struct sockaddr_in6 *)->sin6_addr; 265 266 if (IN6_IS_ADDR_UNSPECIFIED(in6_addr) || 267 IN6_IS_ADDR_MULTICAST(in6_addr) || 268 IN6_IS_ADDR_V4MAPPED(in6_addr)) { 269 error = EINVAL; 270 break; 271 } 272 273 error = in6_pcbconnect(inp, nam); 274 } else if (sin->sin_family == AF_INET) 275 #endif /* INET6 */ 276 { 277 if ((sin->sin_addr.s_addr == INADDR_ANY) || 278 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 279 IN_MULTICAST(sin->sin_addr.s_addr) || 280 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 281 error = EINVAL; 282 break; 283 } 284 285 error = in_pcbconnect(inp, nam); 286 } 287 288 if (error) 289 break; 290 291 tp->t_template = tcp_template(tp); 292 if (tp->t_template == 0) { 293 in_pcbdisconnect(inp); 294 error = ENOBUFS; 295 break; 296 } 297 298 so->so_state |= SS_CONNECTOUT; 299 300 /* Compute window scaling to request. */ 301 tcp_rscale(tp, sb_max); 302 303 soisconnecting(so); 304 tcpstat.tcps_connattempt++; 305 tp->t_state = TCPS_SYN_SENT; 306 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 307 tcp_set_iss_tsm(tp); 308 tcp_sendseqinit(tp); 309 #if defined(TCP_SACK) 310 tp->snd_last = tp->snd_una; 311 #endif 312 #if defined(TCP_SACK) && defined(TCP_FACK) 313 tp->snd_fack = tp->snd_una; 314 tp->retran_data = 0; 315 tp->snd_awnd = 0; 316 #endif 317 error = tcp_output(tp); 318 break; 319 320 /* 321 * Create a TCP connection between two sockets. 322 */ 323 case PRU_CONNECT2: 324 error = EOPNOTSUPP; 325 break; 326 327 /* 328 * Initiate disconnect from peer. 329 * If connection never passed embryonic stage, just drop; 330 * else if don't need to let data drain, then can just drop anyways, 331 * else have to begin TCP shutdown process: mark socket disconnecting, 332 * drain unread data, state switch to reflect user close, and 333 * send segment (e.g. FIN) to peer. Socket will be really disconnected 334 * when peer sends FIN and acks ours. 335 * 336 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 337 */ 338 case PRU_DISCONNECT: 339 tp = tcp_disconnect(tp); 340 break; 341 342 /* 343 * Accept a connection. Essentially all the work is 344 * done at higher levels; just return the address 345 * of the peer, storing through addr. 346 */ 347 case PRU_ACCEPT: 348 #ifdef INET6 349 if (inp->inp_flags & INP_IPV6) 350 in6_setpeeraddr(inp, nam); 351 else 352 #endif 353 in_setpeeraddr(inp, nam); 354 break; 355 356 /* 357 * Mark the connection as being incapable of further output. 358 */ 359 case PRU_SHUTDOWN: 360 if (so->so_state & SS_CANTSENDMORE) 361 break; 362 socantsendmore(so); 363 tp = tcp_usrclosed(tp); 364 if (tp) 365 error = tcp_output(tp); 366 break; 367 368 /* 369 * After a receive, possibly send window update to peer. 370 */ 371 case PRU_RCVD: 372 /* 373 * soreceive() calls this function when a user receives 374 * ancillary data on a listening socket. We don't call 375 * tcp_output in such a case, since there is no header 376 * template for a listening socket and hence the kernel 377 * will panic. 378 */ 379 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 380 (void) tcp_output(tp); 381 break; 382 383 /* 384 * Do a send by putting data in output queue and updating urgent 385 * marker if URG set. Possibly send more data. 386 */ 387 case PRU_SEND: 388 sbappendstream(&so->so_snd, m); 389 error = tcp_output(tp); 390 break; 391 392 /* 393 * Abort the TCP. 394 */ 395 case PRU_ABORT: 396 tp = tcp_drop(tp, ECONNABORTED); 397 break; 398 399 case PRU_SENSE: 400 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 401 splx(s); 402 return (0); 403 404 case PRU_RCVOOB: 405 if ((so->so_oobmark == 0 && 406 (so->so_state & SS_RCVATMARK) == 0) || 407 so->so_options & SO_OOBINLINE || 408 tp->t_oobflags & TCPOOB_HADDATA) { 409 error = EINVAL; 410 break; 411 } 412 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 413 error = EWOULDBLOCK; 414 break; 415 } 416 m->m_len = 1; 417 *mtod(m, caddr_t) = tp->t_iobc; 418 if (((long)nam & MSG_PEEK) == 0) 419 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 420 break; 421 422 case PRU_SENDOOB: 423 if (sbspace(&so->so_snd) < -512) { 424 m_freem(m); 425 error = ENOBUFS; 426 break; 427 } 428 /* 429 * According to RFC961 (Assigned Protocols), 430 * the urgent pointer points to the last octet 431 * of urgent data. We continue, however, 432 * to consider it to indicate the first octet 433 * of data past the urgent section. 434 * Otherwise, snd_up should be one lower. 435 */ 436 sbappendstream(&so->so_snd, m); 437 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 438 tp->t_force = 1; 439 error = tcp_output(tp); 440 tp->t_force = 0; 441 break; 442 443 case PRU_SOCKADDR: 444 #ifdef INET6 445 if (inp->inp_flags & INP_IPV6) 446 in6_setsockaddr(inp, nam); 447 else 448 #endif 449 in_setsockaddr(inp, nam); 450 break; 451 452 case PRU_PEERADDR: 453 #ifdef INET6 454 if (inp->inp_flags & INP_IPV6) 455 in6_setpeeraddr(inp, nam); 456 else 457 #endif 458 in_setpeeraddr(inp, nam); 459 break; 460 461 default: 462 panic("tcp_usrreq"); 463 } 464 if (tp && (so->so_options & SO_DEBUG)) 465 tcp_trace(TA_USER, ostate, tp, (caddr_t)0, req, 0); 466 splx(s); 467 return (error); 468 } 469 470 int 471 tcp_ctloutput(op, so, level, optname, mp) 472 int op; 473 struct socket *so; 474 int level, optname; 475 struct mbuf **mp; 476 { 477 int error = 0, s; 478 struct inpcb *inp; 479 struct tcpcb *tp; 480 struct mbuf *m; 481 int i; 482 483 s = splsoftnet(); 484 inp = sotoinpcb(so); 485 if (inp == NULL) { 486 splx(s); 487 if (op == PRCO_SETOPT && *mp) 488 (void) m_free(*mp); 489 return (ECONNRESET); 490 } 491 if (level != IPPROTO_TCP) { 492 switch (so->so_proto->pr_domain->dom_family) { 493 #ifdef INET6 494 case PF_INET6: 495 error = ip6_ctloutput(op, so, level, optname, mp); 496 break; 497 #endif /* INET6 */ 498 case PF_INET: 499 error = ip_ctloutput(op, so, level, optname, mp); 500 break; 501 default: 502 error = EAFNOSUPPORT; /*?*/ 503 break; 504 } 505 splx(s); 506 return (error); 507 } 508 tp = intotcpcb(inp); 509 510 switch (op) { 511 512 case PRCO_SETOPT: 513 m = *mp; 514 switch (optname) { 515 516 case TCP_NODELAY: 517 if (m == NULL || m->m_len < sizeof (int)) 518 error = EINVAL; 519 else if (*mtod(m, int *)) 520 tp->t_flags |= TF_NODELAY; 521 else 522 tp->t_flags &= ~TF_NODELAY; 523 break; 524 525 case TCP_NOPUSH: 526 if (m == NULL || m->m_len < sizeof (int)) 527 error = EINVAL; 528 else if (*mtod(m, int *)) 529 tp->t_flags |= TF_NOPUSH; 530 else if (tp->t_flags & TF_NOPUSH) { 531 tp->t_flags &= ~TF_NOPUSH; 532 if (TCPS_HAVEESTABLISHED(tp->t_state)) 533 error = tcp_output(tp); 534 } 535 break; 536 537 case TCP_MAXSEG: 538 if (m == NULL || m->m_len < sizeof (int)) { 539 error = EINVAL; 540 break; 541 } 542 543 i = *mtod(m, int *); 544 if (i > 0 && i <= tp->t_maxseg) 545 tp->t_maxseg = i; 546 else 547 error = EINVAL; 548 break; 549 550 #ifdef TCP_SACK 551 case TCP_SACK_ENABLE: 552 if (m == NULL || m->m_len < sizeof (int)) { 553 error = EINVAL; 554 break; 555 } 556 557 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 558 error = EPERM; 559 break; 560 } 561 562 if (tp->t_flags & TF_SIGNATURE) { 563 error = EPERM; 564 break; 565 } 566 567 if (*mtod(m, int *)) 568 tp->sack_enable = 1; 569 else 570 tp->sack_enable = 0; 571 break; 572 #endif 573 #ifdef TCP_SIGNATURE 574 case TCP_MD5SIG: 575 if (m == NULL || m->m_len < sizeof (int)) { 576 error = EINVAL; 577 break; 578 } 579 580 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 581 error = EPERM; 582 break; 583 } 584 585 if (*mtod(m, int *)) { 586 tp->t_flags |= TF_SIGNATURE; 587 #ifdef TCP_SACK 588 tp->sack_enable = 0; 589 #endif /* TCP_SACK */ 590 } else 591 tp->t_flags &= ~TF_SIGNATURE; 592 break; 593 #endif /* TCP_SIGNATURE */ 594 default: 595 error = ENOPROTOOPT; 596 break; 597 } 598 if (m) 599 (void) m_free(m); 600 break; 601 602 case PRCO_GETOPT: 603 *mp = m = m_get(M_WAIT, MT_SOOPTS); 604 m->m_len = sizeof(int); 605 606 switch (optname) { 607 case TCP_NODELAY: 608 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 609 break; 610 case TCP_NOPUSH: 611 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 612 break; 613 case TCP_MAXSEG: 614 *mtod(m, int *) = tp->t_maxseg; 615 break; 616 #ifdef TCP_SACK 617 case TCP_SACK_ENABLE: 618 *mtod(m, int *) = tp->sack_enable; 619 break; 620 #endif 621 #ifdef TCP_SIGNATURE 622 case TCP_MD5SIG: 623 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 624 break; 625 #endif 626 default: 627 error = ENOPROTOOPT; 628 break; 629 } 630 break; 631 } 632 splx(s); 633 return (error); 634 } 635 636 /* 637 * Attach TCP protocol to socket, allocating 638 * internet protocol control block, tcp control block, 639 * bufer space, and entering LISTEN state if to accept connections. 640 */ 641 int 642 tcp_attach(so) 643 struct socket *so; 644 { 645 struct tcpcb *tp; 646 struct inpcb *inp; 647 int error; 648 649 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 650 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 651 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 652 error = soreserve(so, tcp_sendspace, tcp_recvspace); 653 if (error) 654 return (error); 655 } 656 657 error = in_pcballoc(so, &tcbtable); 658 if (error) 659 return (error); 660 inp = sotoinpcb(so); 661 tp = tcp_newtcpcb(inp); 662 if (tp == NULL) { 663 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 664 665 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 666 in_pcbdetach(inp); 667 so->so_state |= nofd; 668 return (ENOBUFS); 669 } 670 tp->t_state = TCPS_CLOSED; 671 #ifdef INET6 672 /* we disallow IPv4 mapped address completely. */ 673 if (inp->inp_flags & INP_IPV6) 674 tp->pf = PF_INET6; 675 else 676 tp->pf = PF_INET; 677 #else 678 tp->pf = PF_INET; 679 #endif 680 return (0); 681 } 682 683 /* 684 * Initiate (or continue) disconnect. 685 * If embryonic state, just send reset (once). 686 * If in ``let data drain'' option and linger null, just drop. 687 * Otherwise (hard), mark socket disconnecting and drop 688 * current input data; switch states based on user close, and 689 * send segment to peer (with FIN). 690 */ 691 struct tcpcb * 692 tcp_disconnect(tp) 693 struct tcpcb *tp; 694 { 695 struct socket *so = tp->t_inpcb->inp_socket; 696 697 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 698 tp = tcp_close(tp); 699 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 700 tp = tcp_drop(tp, 0); 701 else { 702 soisdisconnecting(so); 703 sbflush(&so->so_rcv); 704 tp = tcp_usrclosed(tp); 705 if (tp) 706 (void) tcp_output(tp); 707 } 708 return (tp); 709 } 710 711 /* 712 * User issued close, and wish to trail through shutdown states: 713 * if never received SYN, just forget it. If got a SYN from peer, 714 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 715 * If already got a FIN from peer, then almost done; go to LAST_ACK 716 * state. In all other cases, have already sent FIN to peer (e.g. 717 * after PRU_SHUTDOWN), and just have to play tedious game waiting 718 * for peer to send FIN or not respond to keep-alives, etc. 719 * We can let the user exit from the close as soon as the FIN is acked. 720 */ 721 struct tcpcb * 722 tcp_usrclosed(tp) 723 struct tcpcb *tp; 724 { 725 726 switch (tp->t_state) { 727 728 case TCPS_CLOSED: 729 case TCPS_LISTEN: 730 case TCPS_SYN_SENT: 731 tp->t_state = TCPS_CLOSED; 732 tp = tcp_close(tp); 733 break; 734 735 case TCPS_SYN_RECEIVED: 736 case TCPS_ESTABLISHED: 737 tp->t_state = TCPS_FIN_WAIT_1; 738 break; 739 740 case TCPS_CLOSE_WAIT: 741 tp->t_state = TCPS_LAST_ACK; 742 break; 743 } 744 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 745 soisdisconnected(tp->t_inpcb->inp_socket); 746 /* 747 * If we are in FIN_WAIT_2, we arrived here because the 748 * application did a shutdown of the send side. Like the 749 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 750 * a full close, we start a timer to make sure sockets are 751 * not left in FIN_WAIT_2 forever. 752 */ 753 if (tp->t_state == TCPS_FIN_WAIT_2) 754 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 755 } 756 return (tp); 757 } 758 759 /* 760 * Look up a socket for ident or tcpdrop, ... 761 */ 762 int 763 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 764 { 765 int error = 0, s; 766 struct tcp_ident_mapping tir; 767 struct inpcb *inp; 768 struct tcpcb *tp = NULL; 769 struct sockaddr_in *fin, *lin; 770 #ifdef INET6 771 struct sockaddr_in6 *fin6, *lin6; 772 struct in6_addr f6, l6; 773 #endif 774 if (dodrop) { 775 if (oldp != NULL || *oldlenp != 0) 776 return (EINVAL); 777 if (newp == NULL) 778 return (EPERM); 779 if (newlen < sizeof(tir)) 780 return (ENOMEM); 781 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 782 return (error); 783 } else { 784 if (oldp == NULL) 785 return (EINVAL); 786 if (*oldlenp < sizeof(tir)) 787 return (ENOMEM); 788 if (newp != NULL || newlen != 0) 789 return (EINVAL); 790 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 791 return (error); 792 } 793 switch (tir.faddr.ss_family) { 794 #ifdef INET6 795 case AF_INET6: 796 fin6 = (struct sockaddr_in6 *)&tir.faddr; 797 error = in6_embedscope(&f6, fin6, NULL, NULL); 798 if (error) 799 return EINVAL; /*?*/ 800 lin6 = (struct sockaddr_in6 *)&tir.laddr; 801 error = in6_embedscope(&l6, lin6, NULL, NULL); 802 if (error) 803 return EINVAL; /*?*/ 804 break; 805 #endif 806 case AF_INET: 807 fin = (struct sockaddr_in *)&tir.faddr; 808 lin = (struct sockaddr_in *)&tir.laddr; 809 break; 810 default: 811 return (EINVAL); 812 } 813 814 s = splsoftnet(); 815 switch (tir.faddr.ss_family) { 816 #ifdef INET6 817 case AF_INET6: 818 inp = in6_pcbhashlookup(&tcbtable, &f6, 819 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 820 break; 821 #endif 822 case AF_INET: 823 inp = in_pcbhashlookup(&tcbtable, fin->sin_addr, 824 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 825 break; 826 } 827 828 if (dodrop) { 829 if (inp && (tp = intotcpcb(inp)) && 830 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 831 tp = tcp_drop(tp, ECONNABORTED); 832 else 833 error = ESRCH; 834 splx(s); 835 return (error); 836 } 837 838 if (inp == NULL) { 839 ++tcpstat.tcps_pcbhashmiss; 840 switch (tir.faddr.ss_family) { 841 #ifdef INET6 842 case AF_INET6: 843 inp = in6_pcblookup_listen(&tcbtable, 844 &l6, lin6->sin6_port, 0, NULL, tir.rdomain); 845 break; 846 #endif 847 case AF_INET: 848 inp = in_pcblookup_listen(&tcbtable, 849 lin->sin_addr, lin->sin_port, 0, NULL, tir.rdomain); 850 break; 851 } 852 } 853 854 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 855 tir.ruid = inp->inp_socket->so_ruid; 856 tir.euid = inp->inp_socket->so_euid; 857 } else { 858 tir.ruid = -1; 859 tir.euid = -1; 860 } 861 splx(s); 862 863 *oldlenp = sizeof (tir); 864 error = copyout((void *)&tir, oldp, sizeof (tir)); 865 return (error); 866 } 867 868 /* 869 * Sysctl for tcp variables. 870 */ 871 int 872 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) 873 int *name; 874 u_int namelen; 875 void *oldp; 876 size_t *oldlenp; 877 void *newp; 878 size_t newlen; 879 { 880 int error, nval; 881 882 /* All sysctl names at this level are terminal. */ 883 if (namelen != 1) 884 return (ENOTDIR); 885 886 switch (name[0]) { 887 #ifdef TCP_SACK 888 case TCPCTL_SACK: 889 return (sysctl_int(oldp, oldlenp, newp, newlen, 890 &tcp_do_sack)); 891 #endif 892 case TCPCTL_SLOWHZ: 893 return (sysctl_rdint(oldp, oldlenp, newp, PR_SLOWHZ)); 894 895 case TCPCTL_BADDYNAMIC: 896 return (sysctl_struct(oldp, oldlenp, newp, newlen, 897 baddynamicports.tcp, sizeof(baddynamicports.tcp))); 898 899 case TCPCTL_IDENT: 900 return (tcp_ident(oldp, oldlenp, newp, newlen, 0)); 901 902 case TCPCTL_DROP: 903 return (tcp_ident(oldp, oldlenp, newp, newlen, 1)); 904 905 case TCPCTL_ALWAYS_KEEPALIVE: 906 return (sysctl_int(oldp, oldlenp, newp, newlen, 907 &tcp_always_keepalive)); 908 909 #ifdef TCP_ECN 910 case TCPCTL_ECN: 911 return (sysctl_int(oldp, oldlenp, newp, newlen, 912 &tcp_do_ecn)); 913 #endif 914 case TCPCTL_REASS_LIMIT: 915 nval = tcp_reass_limit; 916 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 917 if (error) 918 return (error); 919 if (nval != tcp_reass_limit) { 920 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 921 if (error) 922 return (error); 923 tcp_reass_limit = nval; 924 } 925 return (0); 926 #ifdef TCP_SACK 927 case TCPCTL_SACKHOLE_LIMIT: 928 nval = tcp_sackhole_limit; 929 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 930 if (error) 931 return (error); 932 if (nval != tcp_sackhole_limit) { 933 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 934 if (error) 935 return (error); 936 tcp_sackhole_limit = nval; 937 } 938 return (0); 939 #endif 940 941 case TCPCTL_STATS: 942 if (newp != NULL) 943 return (EPERM); 944 return (sysctl_struct(oldp, oldlenp, newp, newlen, 945 &tcpstat, sizeof(tcpstat))); 946 947 default: 948 if (name[0] < TCPCTL_MAXID) 949 return (sysctl_int_arr(tcpctl_vars, name, namelen, 950 oldp, oldlenp, newp, newlen)); 951 return (ENOPROTOOPT); 952 } 953 /* NOTREACHED */ 954 } 955 956 /* 957 * Scale the send buffer so that inflight data is not accounted against 958 * the limit. The buffer will scale with the congestion window, if the 959 * the receiver stops acking data the window will shrink and therefor 960 * the buffer size will shrink as well. 961 * In low memory situation try to shrink the buffer to the initial size 962 * disabling the send buffer scaling as long as the situation persists. 963 */ 964 void 965 tcp_update_sndspace(struct tcpcb *tp) 966 { 967 struct socket *so = tp->t_inpcb->inp_socket; 968 u_long nmax; 969 970 if (sbchecklowmem()) 971 /* low on memory try to get rid of some */ 972 nmax = tcp_sendspace; 973 else if (so->so_snd.sb_wat != tcp_sendspace) 974 /* user requested buffer size, auto-scaling disabled */ 975 nmax = so->so_snd.sb_wat; 976 else 977 /* automatic buffer scaling */ 978 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 979 tp->snd_una); 980 981 /* round to MSS boundary */ 982 nmax = roundup(nmax, tp->t_maxseg); 983 984 if (nmax != so->so_snd.sb_hiwat) 985 sbreserve(&so->so_snd, nmax); 986 } 987 988 /* 989 * Scale the recv buffer by looking at how much data was transferred in 990 * on approximated RTT. If more then a big part of the recv buffer was 991 * transferred during that time we increase the buffer by a constant. 992 * In low memory situation try to shrink the buffer to the initial size. 993 */ 994 void 995 tcp_update_rcvspace(struct tcpcb *tp) 996 { 997 struct socket *so = tp->t_inpcb->inp_socket; 998 u_long nmax = so->so_rcv.sb_hiwat; 999 1000 if (sbchecklowmem()) 1001 /* low on memory try to get rid of some */ 1002 nmax = tcp_recvspace; 1003 else if (so->so_rcv.sb_wat != tcp_recvspace) 1004 /* user requested buffer size, auto-scaling disabled */ 1005 nmax = so->so_rcv.sb_wat; 1006 else { 1007 /* automatic buffer scaling */ 1008 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1009 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1010 tcp_autorcvbuf_inc); 1011 } 1012 1013 if (nmax == so->so_rcv.sb_hiwat) 1014 return; 1015 1016 /* round to MSS boundary */ 1017 nmax = roundup(nmax, tp->t_maxseg); 1018 sbreserve(&so->so_rcv, nmax); 1019 } 1020