1 /* $NetBSD: tcp_usrreq.c,v 1.129 2006/11/10 13:19:16 yamt Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1997, 1998, 2005, 2006 The NetBSD Foundation, Inc. 34 * All rights reserved. 35 * 36 * This code is derived from software contributed to The NetBSD Foundation 37 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 38 * Facility, NASA Ames Research Center. 39 * This code is derived from software contributed to The NetBSD Foundation 40 * by Charles M. Hannum. 41 * This code is derived from software contributed to The NetBSD Foundation 42 * by Rui Paulo. 43 * 44 * Redistribution and use in source and binary forms, with or without 45 * modification, are permitted provided that the following conditions 46 * are met: 47 * 1. Redistributions of source code must retain the above copyright 48 * notice, this list of conditions and the following disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 3. All advertising materials mentioning features or use of this software 53 * must display the following acknowledgement: 54 * This product includes software developed by the NetBSD 55 * Foundation, Inc. and its contributors. 56 * 4. Neither the name of The NetBSD Foundation nor the names of its 57 * contributors may be used to endorse or promote products derived 58 * from this software without specific prior written permission. 59 * 60 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 61 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 62 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 63 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 64 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 65 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 66 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 67 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 68 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 69 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 70 * POSSIBILITY OF SUCH DAMAGE. 71 */ 72 73 /* 74 * Copyright (c) 1982, 1986, 1988, 1993, 1995 75 * The Regents of the University of California. All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 3. Neither the name of the University nor the names of its contributors 86 * may be used to endorse or promote products derived from this software 87 * without specific prior written permission. 88 * 89 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 90 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 92 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 93 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 95 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 96 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 97 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 98 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 99 * SUCH DAMAGE. 100 * 101 * @(#)tcp_usrreq.c 8.5 (Berkeley) 6/21/95 102 */ 103 104 #include <sys/cdefs.h> 105 __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.129 2006/11/10 13:19:16 yamt Exp $"); 106 107 #include "opt_inet.h" 108 #include "opt_ipsec.h" 109 #include "opt_tcp_debug.h" 110 #include "opt_mbuftrace.h" 111 #include "rnd.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/kernel.h> 116 #include <sys/malloc.h> 117 #include <sys/mbuf.h> 118 #include <sys/socket.h> 119 #include <sys/socketvar.h> 120 #include <sys/protosw.h> 121 #include <sys/errno.h> 122 #include <sys/stat.h> 123 #include <sys/proc.h> 124 #include <sys/domain.h> 125 #include <sys/sysctl.h> 126 #include <sys/kauth.h> 127 128 #include <net/if.h> 129 #include <net/route.h> 130 131 #include <netinet/in.h> 132 #include <netinet/in_systm.h> 133 #include <netinet/in_var.h> 134 #include <netinet/ip.h> 135 #include <netinet/in_pcb.h> 136 #include <netinet/ip_var.h> 137 #include <netinet/in_offload.h> 138 139 #ifdef INET6 140 #ifndef INET 141 #include <netinet/in.h> 142 #endif 143 #include <netinet/ip6.h> 144 #include <netinet6/in6_pcb.h> 145 #include <netinet6/ip6_var.h> 146 #endif 147 148 #include <netinet/tcp.h> 149 #include <netinet/tcp_fsm.h> 150 #include <netinet/tcp_seq.h> 151 #include <netinet/tcp_timer.h> 152 #include <netinet/tcp_var.h> 153 #include <netinet/tcp_congctl.h> 154 #include <netinet/tcpip.h> 155 #include <netinet/tcp_debug.h> 156 157 #include "opt_tcp_space.h" 158 159 #ifdef IPSEC 160 #include <netinet6/ipsec.h> 161 #endif /*IPSEC*/ 162 163 /* 164 * TCP protocol interface to socket abstraction. 165 */ 166 167 /* 168 * Process a TCP user request for TCP tb. If this is a send request 169 * then m is the mbuf chain of send data. If this is a timer expiration 170 * (called from the software clock routine), then timertype tells which timer. 171 */ 172 /*ARGSUSED*/ 173 int 174 tcp_usrreq(struct socket *so, int req, 175 struct mbuf *m, struct mbuf *nam, struct mbuf *control, struct lwp *l) 176 { 177 struct inpcb *inp; 178 #ifdef INET6 179 struct in6pcb *in6p; 180 #endif 181 struct tcpcb *tp = NULL; 182 int s; 183 int error = 0; 184 #ifdef TCP_DEBUG 185 int ostate = 0; 186 #endif 187 int family; /* family of the socket */ 188 189 family = so->so_proto->pr_domain->dom_family; 190 191 if (req == PRU_CONTROL) { 192 switch (family) { 193 #ifdef INET 194 case PF_INET: 195 return (in_control(so, (long)m, (caddr_t)nam, 196 (struct ifnet *)control, l)); 197 #endif 198 #ifdef INET6 199 case PF_INET6: 200 return (in6_control(so, (long)m, (caddr_t)nam, 201 (struct ifnet *)control, l)); 202 #endif 203 default: 204 return EAFNOSUPPORT; 205 } 206 } 207 208 s = splsoftnet(); 209 210 if (req == PRU_PURGEIF) { 211 switch (family) { 212 #ifdef INET 213 case PF_INET: 214 in_pcbpurgeif0(&tcbtable, (struct ifnet *)control); 215 in_purgeif((struct ifnet *)control); 216 in_pcbpurgeif(&tcbtable, (struct ifnet *)control); 217 break; 218 #endif 219 #ifdef INET6 220 case PF_INET6: 221 in6_pcbpurgeif0(&tcbtable, (struct ifnet *)control); 222 in6_purgeif((struct ifnet *)control); 223 in6_pcbpurgeif(&tcbtable, (struct ifnet *)control); 224 break; 225 #endif 226 default: 227 splx(s); 228 return (EAFNOSUPPORT); 229 } 230 splx(s); 231 return (0); 232 } 233 234 switch (family) { 235 #ifdef INET 236 case PF_INET: 237 inp = sotoinpcb(so); 238 #ifdef INET6 239 in6p = NULL; 240 #endif 241 break; 242 #endif 243 #ifdef INET6 244 case PF_INET6: 245 inp = NULL; 246 in6p = sotoin6pcb(so); 247 break; 248 #endif 249 default: 250 splx(s); 251 return EAFNOSUPPORT; 252 } 253 254 #ifdef DIAGNOSTIC 255 #ifdef INET6 256 if (inp && in6p) 257 panic("tcp_usrreq: both inp and in6p set to non-NULL"); 258 #endif 259 if (req != PRU_SEND && req != PRU_SENDOOB && control) 260 panic("tcp_usrreq: unexpected control mbuf"); 261 #endif 262 /* 263 * When a TCP is attached to a socket, then there will be 264 * a (struct inpcb) pointed at by the socket, and this 265 * structure will point at a subsidary (struct tcpcb). 266 */ 267 #ifndef INET6 268 if (inp == 0 && req != PRU_ATTACH) 269 #else 270 if ((inp == 0 && in6p == 0) && req != PRU_ATTACH) 271 #endif 272 { 273 error = EINVAL; 274 goto release; 275 } 276 #ifdef INET 277 if (inp) { 278 tp = intotcpcb(inp); 279 /* WHAT IF TP IS 0? */ 280 #ifdef KPROF 281 tcp_acounts[tp->t_state][req]++; 282 #endif 283 #ifdef TCP_DEBUG 284 ostate = tp->t_state; 285 #endif 286 } 287 #endif 288 #ifdef INET6 289 if (in6p) { 290 tp = in6totcpcb(in6p); 291 /* WHAT IF TP IS 0? */ 292 #ifdef KPROF 293 tcp_acounts[tp->t_state][req]++; 294 #endif 295 #ifdef TCP_DEBUG 296 ostate = tp->t_state; 297 #endif 298 } 299 #endif 300 301 switch (req) { 302 303 /* 304 * TCP attaches to socket via PRU_ATTACH, reserving space, 305 * and an internet control block. 306 */ 307 case PRU_ATTACH: 308 #ifndef INET6 309 if (inp != 0) 310 #else 311 if (inp != 0 || in6p != 0) 312 #endif 313 { 314 error = EISCONN; 315 break; 316 } 317 error = tcp_attach(so); 318 if (error) 319 break; 320 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 321 so->so_linger = TCP_LINGERTIME; 322 tp = sototcpcb(so); 323 break; 324 325 /* 326 * PRU_DETACH detaches the TCP protocol from the socket. 327 */ 328 case PRU_DETACH: 329 tp = tcp_disconnect(tp); 330 break; 331 332 /* 333 * Give the socket an address. 334 */ 335 case PRU_BIND: 336 switch (family) { 337 #ifdef INET 338 case PF_INET: 339 error = in_pcbbind(inp, nam, l); 340 break; 341 #endif 342 #ifdef INET6 343 case PF_INET6: 344 error = in6_pcbbind(in6p, nam, l); 345 if (!error) { 346 /* mapped addr case */ 347 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_laddr)) 348 tp->t_family = AF_INET; 349 else 350 tp->t_family = AF_INET6; 351 } 352 break; 353 #endif 354 } 355 break; 356 357 /* 358 * Prepare to accept connections. 359 */ 360 case PRU_LISTEN: 361 #ifdef INET 362 if (inp && inp->inp_lport == 0) { 363 error = in_pcbbind(inp, (struct mbuf *)0, 364 (struct lwp *)0); 365 if (error) 366 break; 367 } 368 #endif 369 #ifdef INET6 370 if (in6p && in6p->in6p_lport == 0) { 371 error = in6_pcbbind(in6p, (struct mbuf *)0, 372 (struct lwp *)0); 373 if (error) 374 break; 375 } 376 #endif 377 tp->t_state = TCPS_LISTEN; 378 break; 379 380 /* 381 * Initiate connection to peer. 382 * Create a template for use in transmissions on this connection. 383 * Enter SYN_SENT state, and mark socket as connecting. 384 * Start keep-alive timer, and seed output sequence space. 385 * Send initial segment on connection. 386 */ 387 case PRU_CONNECT: 388 #ifdef INET 389 if (inp) { 390 if (inp->inp_lport == 0) { 391 error = in_pcbbind(inp, (struct mbuf *)0, 392 (struct lwp *)0); 393 if (error) 394 break; 395 } 396 error = in_pcbconnect(inp, nam, l); 397 } 398 #endif 399 #ifdef INET6 400 if (in6p) { 401 if (in6p->in6p_lport == 0) { 402 error = in6_pcbbind(in6p, (struct mbuf *)0, 403 (struct lwp *)0); 404 if (error) 405 break; 406 } 407 error = in6_pcbconnect(in6p, nam, l); 408 if (!error) { 409 /* mapped addr case */ 410 if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) 411 tp->t_family = AF_INET; 412 else 413 tp->t_family = AF_INET6; 414 } 415 } 416 #endif 417 if (error) 418 break; 419 tp->t_template = tcp_template(tp); 420 if (tp->t_template == 0) { 421 #ifdef INET 422 if (inp) 423 in_pcbdisconnect(inp); 424 #endif 425 #ifdef INET6 426 if (in6p) 427 in6_pcbdisconnect(in6p); 428 #endif 429 error = ENOBUFS; 430 break; 431 } 432 /* Compute window scaling to request. */ 433 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 434 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 435 tp->request_r_scale++; 436 soisconnecting(so); 437 tcpstat.tcps_connattempt++; 438 tp->t_state = TCPS_SYN_SENT; 439 TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT); 440 tp->iss = tcp_new_iss(tp, 0); 441 tcp_sendseqinit(tp); 442 error = tcp_output(tp); 443 break; 444 445 /* 446 * Create a TCP connection between two sockets. 447 */ 448 case PRU_CONNECT2: 449 error = EOPNOTSUPP; 450 break; 451 452 /* 453 * Initiate disconnect from peer. 454 * If connection never passed embryonic stage, just drop; 455 * else if don't need to let data drain, then can just drop anyways, 456 * else have to begin TCP shutdown process: mark socket disconnecting, 457 * drain unread data, state switch to reflect user close, and 458 * send segment (e.g. FIN) to peer. Socket will be really disconnected 459 * when peer sends FIN and acks ours. 460 * 461 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 462 */ 463 case PRU_DISCONNECT: 464 tp = tcp_disconnect(tp); 465 break; 466 467 /* 468 * Accept a connection. Essentially all the work is 469 * done at higher levels; just return the address 470 * of the peer, storing through addr. 471 */ 472 case PRU_ACCEPT: 473 #ifdef INET 474 if (inp) 475 in_setpeeraddr(inp, nam); 476 #endif 477 #ifdef INET6 478 if (in6p) 479 in6_setpeeraddr(in6p, nam); 480 #endif 481 break; 482 483 /* 484 * Mark the connection as being incapable of further output. 485 */ 486 case PRU_SHUTDOWN: 487 socantsendmore(so); 488 tp = tcp_usrclosed(tp); 489 if (tp) 490 error = tcp_output(tp); 491 break; 492 493 /* 494 * After a receive, possibly send window update to peer. 495 */ 496 case PRU_RCVD: 497 /* 498 * soreceive() calls this function when a user receives 499 * ancillary data on a listening socket. We don't call 500 * tcp_output in such a case, since there is no header 501 * template for a listening socket and hence the kernel 502 * will panic. 503 */ 504 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 505 (void) tcp_output(tp); 506 break; 507 508 /* 509 * Do a send by putting data in output queue and updating urgent 510 * marker if URG set. Possibly send more data. 511 */ 512 case PRU_SEND: 513 if (control && control->m_len) { 514 m_freem(control); 515 m_freem(m); 516 error = EINVAL; 517 break; 518 } 519 sbappendstream(&so->so_snd, m); 520 error = tcp_output(tp); 521 break; 522 523 /* 524 * Abort the TCP. 525 */ 526 case PRU_ABORT: 527 tp = tcp_drop(tp, ECONNABORTED); 528 break; 529 530 case PRU_SENSE: 531 /* 532 * stat: don't bother with a blocksize. 533 */ 534 splx(s); 535 return (0); 536 537 case PRU_RCVOOB: 538 if (control && control->m_len) { 539 m_freem(control); 540 m_freem(m); 541 error = EINVAL; 542 break; 543 } 544 if ((so->so_oobmark == 0 && 545 (so->so_state & SS_RCVATMARK) == 0) || 546 so->so_options & SO_OOBINLINE || 547 tp->t_oobflags & TCPOOB_HADDATA) { 548 error = EINVAL; 549 break; 550 } 551 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 552 error = EWOULDBLOCK; 553 break; 554 } 555 m->m_len = 1; 556 *mtod(m, caddr_t) = tp->t_iobc; 557 if (((long)nam & MSG_PEEK) == 0) 558 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 559 break; 560 561 case PRU_SENDOOB: 562 if (sbspace(&so->so_snd) < -512) { 563 m_freem(m); 564 error = ENOBUFS; 565 break; 566 } 567 /* 568 * According to RFC961 (Assigned Protocols), 569 * the urgent pointer points to the last octet 570 * of urgent data. We continue, however, 571 * to consider it to indicate the first octet 572 * of data past the urgent section. 573 * Otherwise, snd_up should be one lower. 574 */ 575 sbappendstream(&so->so_snd, m); 576 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 577 tp->t_force = 1; 578 error = tcp_output(tp); 579 tp->t_force = 0; 580 break; 581 582 case PRU_SOCKADDR: 583 #ifdef INET 584 if (inp) 585 in_setsockaddr(inp, nam); 586 #endif 587 #ifdef INET6 588 if (in6p) 589 in6_setsockaddr(in6p, nam); 590 #endif 591 break; 592 593 case PRU_PEERADDR: 594 #ifdef INET 595 if (inp) 596 in_setpeeraddr(inp, nam); 597 #endif 598 #ifdef INET6 599 if (in6p) 600 in6_setpeeraddr(in6p, nam); 601 #endif 602 break; 603 604 default: 605 panic("tcp_usrreq"); 606 } 607 #ifdef TCP_DEBUG 608 if (tp && (so->so_options & SO_DEBUG)) 609 tcp_trace(TA_USER, ostate, tp, NULL, req); 610 #endif 611 612 release: 613 splx(s); 614 return (error); 615 } 616 617 int 618 tcp_ctloutput(int op, struct socket *so, int level, int optname, 619 struct mbuf **mp) 620 { 621 int error = 0, s; 622 struct inpcb *inp; 623 #ifdef INET6 624 struct in6pcb *in6p; 625 #endif 626 struct tcpcb *tp; 627 struct mbuf *m; 628 int i; 629 int family; /* family of the socket */ 630 631 family = so->so_proto->pr_domain->dom_family; 632 633 s = splsoftnet(); 634 switch (family) { 635 #ifdef INET 636 case PF_INET: 637 inp = sotoinpcb(so); 638 #ifdef INET6 639 in6p = NULL; 640 #endif 641 break; 642 #endif 643 #ifdef INET6 644 case PF_INET6: 645 inp = NULL; 646 in6p = sotoin6pcb(so); 647 break; 648 #endif 649 default: 650 splx(s); 651 panic("%s: af %d", __func__, family); 652 } 653 #ifndef INET6 654 if (inp == NULL) 655 #else 656 if (inp == NULL && in6p == NULL) 657 #endif 658 { 659 splx(s); 660 if (op == PRCO_SETOPT && *mp) 661 (void) m_free(*mp); 662 return (ECONNRESET); 663 } 664 if (level != IPPROTO_TCP) { 665 switch (family) { 666 #ifdef INET 667 case PF_INET: 668 error = ip_ctloutput(op, so, level, optname, mp); 669 break; 670 #endif 671 #ifdef INET6 672 case PF_INET6: 673 error = ip6_ctloutput(op, so, level, optname, mp); 674 break; 675 #endif 676 } 677 splx(s); 678 return (error); 679 } 680 if (inp) 681 tp = intotcpcb(inp); 682 #ifdef INET6 683 else if (in6p) 684 tp = in6totcpcb(in6p); 685 #endif 686 else 687 tp = NULL; 688 689 switch (op) { 690 691 case PRCO_SETOPT: 692 m = *mp; 693 switch (optname) { 694 695 #ifdef TCP_SIGNATURE 696 case TCP_MD5SIG: 697 if (m == NULL || m->m_len < sizeof (int)) 698 error = EINVAL; 699 if (error) 700 break; 701 if (*mtod(m, int *) > 0) 702 tp->t_flags |= TF_SIGNATURE; 703 else 704 tp->t_flags &= ~TF_SIGNATURE; 705 break; 706 #endif /* TCP_SIGNATURE */ 707 708 case TCP_NODELAY: 709 if (m == NULL || m->m_len < sizeof (int)) 710 error = EINVAL; 711 else if (*mtod(m, int *)) 712 tp->t_flags |= TF_NODELAY; 713 else 714 tp->t_flags &= ~TF_NODELAY; 715 break; 716 717 case TCP_MAXSEG: 718 if (m && (i = *mtod(m, int *)) > 0 && 719 i <= tp->t_peermss) 720 tp->t_peermss = i; /* limit on send size */ 721 else 722 error = EINVAL; 723 break; 724 #ifdef notyet 725 case TCP_CONGCTL: 726 if (m == NULL) 727 error = EINVAL; 728 error = tcp_congctl_select(tp, mtod(m, char *)); 729 #endif 730 break; 731 732 default: 733 error = ENOPROTOOPT; 734 break; 735 } 736 if (m) 737 (void) m_free(m); 738 break; 739 740 case PRCO_GETOPT: 741 *mp = m = m_get(M_WAIT, MT_SOOPTS); 742 m->m_len = sizeof(int); 743 MCLAIM(m, so->so_mowner); 744 745 switch (optname) { 746 #ifdef TCP_SIGNATURE 747 case TCP_MD5SIG: 748 *mtod(m, int *) = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 749 break; 750 #endif 751 case TCP_NODELAY: 752 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 753 break; 754 case TCP_MAXSEG: 755 *mtod(m, int *) = tp->t_peermss; 756 break; 757 #ifdef notyet 758 case TCP_CONGCTL: 759 break; 760 #endif 761 default: 762 error = ENOPROTOOPT; 763 break; 764 } 765 break; 766 } 767 splx(s); 768 return (error); 769 } 770 771 #ifndef TCP_SENDSPACE 772 #define TCP_SENDSPACE 1024*32 773 #endif 774 int tcp_sendspace = TCP_SENDSPACE; 775 #ifndef TCP_RECVSPACE 776 #define TCP_RECVSPACE 1024*32 777 #endif 778 int tcp_recvspace = TCP_RECVSPACE; 779 780 /* 781 * Attach TCP protocol to socket, allocating 782 * internet protocol control block, tcp control block, 783 * bufer space, and entering LISTEN state if to accept connections. 784 */ 785 int 786 tcp_attach(struct socket *so) 787 { 788 struct tcpcb *tp; 789 struct inpcb *inp; 790 #ifdef INET6 791 struct in6pcb *in6p; 792 #endif 793 int error; 794 int family; /* family of the socket */ 795 796 family = so->so_proto->pr_domain->dom_family; 797 798 #ifdef MBUFTRACE 799 so->so_mowner = &tcp_mowner; 800 so->so_rcv.sb_mowner = &tcp_rx_mowner; 801 so->so_snd.sb_mowner = &tcp_tx_mowner; 802 #endif 803 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 804 error = soreserve(so, tcp_sendspace, tcp_recvspace); 805 if (error) 806 return (error); 807 } 808 switch (family) { 809 #ifdef INET 810 case PF_INET: 811 error = in_pcballoc(so, &tcbtable); 812 if (error) 813 return (error); 814 inp = sotoinpcb(so); 815 #ifdef INET6 816 in6p = NULL; 817 #endif 818 break; 819 #endif 820 #ifdef INET6 821 case PF_INET6: 822 error = in6_pcballoc(so, &tcbtable); 823 if (error) 824 return (error); 825 inp = NULL; 826 in6p = sotoin6pcb(so); 827 break; 828 #endif 829 default: 830 return EAFNOSUPPORT; 831 } 832 if (inp) 833 tp = tcp_newtcpcb(family, (void *)inp); 834 #ifdef INET6 835 else if (in6p) 836 tp = tcp_newtcpcb(family, (void *)in6p); 837 #endif 838 else 839 tp = NULL; 840 841 if (tp == 0) { 842 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 843 844 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 845 #ifdef INET 846 if (inp) 847 in_pcbdetach(inp); 848 #endif 849 #ifdef INET6 850 if (in6p) 851 in6_pcbdetach(in6p); 852 #endif 853 so->so_state |= nofd; 854 return (ENOBUFS); 855 } 856 tp->t_state = TCPS_CLOSED; 857 return (0); 858 } 859 860 /* 861 * Initiate (or continue) disconnect. 862 * If embryonic state, just send reset (once). 863 * If in ``let data drain'' option and linger null, just drop. 864 * Otherwise (hard), mark socket disconnecting and drop 865 * current input data; switch states based on user close, and 866 * send segment to peer (with FIN). 867 */ 868 struct tcpcb * 869 tcp_disconnect(struct tcpcb *tp) 870 { 871 struct socket *so; 872 873 if (tp->t_inpcb) 874 so = tp->t_inpcb->inp_socket; 875 #ifdef INET6 876 else if (tp->t_in6pcb) 877 so = tp->t_in6pcb->in6p_socket; 878 #endif 879 else 880 so = NULL; 881 882 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 883 tp = tcp_close(tp); 884 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 885 tp = tcp_drop(tp, 0); 886 else { 887 soisdisconnecting(so); 888 sbflush(&so->so_rcv); 889 tp = tcp_usrclosed(tp); 890 if (tp) 891 (void) tcp_output(tp); 892 } 893 return (tp); 894 } 895 896 /* 897 * User issued close, and wish to trail through shutdown states: 898 * if never received SYN, just forget it. If got a SYN from peer, 899 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 900 * If already got a FIN from peer, then almost done; go to LAST_ACK 901 * state. In all other cases, have already sent FIN to peer (e.g. 902 * after PRU_SHUTDOWN), and just have to play tedious game waiting 903 * for peer to send FIN or not respond to keep-alives, etc. 904 * We can let the user exit from the close as soon as the FIN is acked. 905 */ 906 struct tcpcb * 907 tcp_usrclosed(struct tcpcb *tp) 908 { 909 910 switch (tp->t_state) { 911 912 case TCPS_CLOSED: 913 case TCPS_LISTEN: 914 case TCPS_SYN_SENT: 915 tp->t_state = TCPS_CLOSED; 916 tp = tcp_close(tp); 917 break; 918 919 case TCPS_SYN_RECEIVED: 920 case TCPS_ESTABLISHED: 921 tp->t_state = TCPS_FIN_WAIT_1; 922 break; 923 924 case TCPS_CLOSE_WAIT: 925 tp->t_state = TCPS_LAST_ACK; 926 break; 927 } 928 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 929 struct socket *so; 930 if (tp->t_inpcb) 931 so = tp->t_inpcb->inp_socket; 932 #ifdef INET6 933 else if (tp->t_in6pcb) 934 so = tp->t_in6pcb->in6p_socket; 935 #endif 936 else 937 so = NULL; 938 if (so) 939 soisdisconnected(so); 940 /* 941 * If we are in FIN_WAIT_2, we arrived here because the 942 * application did a shutdown of the send side. Like the 943 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 944 * a full close, we start a timer to make sure sockets are 945 * not left in FIN_WAIT_2 forever. 946 */ 947 if ((tp->t_state == TCPS_FIN_WAIT_2) && (tcp_maxidle > 0)) 948 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 949 } 950 return (tp); 951 } 952 953 /* 954 * sysctl helper routine for net.inet.ip.mssdflt. it can't be less 955 * than 32. 956 */ 957 static int 958 sysctl_net_inet_tcp_mssdflt(SYSCTLFN_ARGS) 959 { 960 int error, mssdflt; 961 struct sysctlnode node; 962 963 mssdflt = tcp_mssdflt; 964 node = *rnode; 965 node.sysctl_data = &mssdflt; 966 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 967 if (error || newp == NULL) 968 return (error); 969 970 if (mssdflt < 32) 971 return (EINVAL); 972 tcp_mssdflt = mssdflt; 973 974 return (0); 975 } 976 977 /* 978 * sysctl helper routine for setting port related values under 979 * net.inet.ip and net.inet6.ip6. does basic range checking and does 980 * additional checks for each type. this code has placed in 981 * tcp_input.c since INET and INET6 both use the same tcp code. 982 * 983 * this helper is not static so that both inet and inet6 can use it. 984 */ 985 int 986 sysctl_net_inet_ip_ports(SYSCTLFN_ARGS) 987 { 988 int error, tmp; 989 int apmin, apmax; 990 #ifndef IPNOPRIVPORTS 991 int lpmin, lpmax; 992 #endif /* IPNOPRIVPORTS */ 993 struct sysctlnode node; 994 995 if (namelen != 0) 996 return (EINVAL); 997 998 switch (name[-3]) { 999 #ifdef INET 1000 case PF_INET: 1001 apmin = anonportmin; 1002 apmax = anonportmax; 1003 #ifndef IPNOPRIVPORTS 1004 lpmin = lowportmin; 1005 lpmax = lowportmax; 1006 #endif /* IPNOPRIVPORTS */ 1007 break; 1008 #endif /* INET */ 1009 #ifdef INET6 1010 case PF_INET6: 1011 apmin = ip6_anonportmin; 1012 apmax = ip6_anonportmax; 1013 #ifndef IPNOPRIVPORTS 1014 lpmin = ip6_lowportmin; 1015 lpmax = ip6_lowportmax; 1016 #endif /* IPNOPRIVPORTS */ 1017 break; 1018 #endif /* INET6 */ 1019 default: 1020 return (EINVAL); 1021 } 1022 1023 /* 1024 * insert temporary copy into node, perform lookup on 1025 * temporary, then restore pointer 1026 */ 1027 node = *rnode; 1028 tmp = *(int*)rnode->sysctl_data; 1029 node.sysctl_data = &tmp; 1030 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1031 if (error || newp == NULL) 1032 return (error); 1033 1034 /* 1035 * simple port range check 1036 */ 1037 if (tmp < 0 || tmp > 65535) 1038 return (EINVAL); 1039 1040 /* 1041 * per-node range checks 1042 */ 1043 switch (rnode->sysctl_num) { 1044 case IPCTL_ANONPORTMIN: 1045 if (tmp >= apmax) 1046 return (EINVAL); 1047 #ifndef IPNOPRIVPORTS 1048 if (tmp < IPPORT_RESERVED) 1049 return (EINVAL); 1050 #endif /* IPNOPRIVPORTS */ 1051 break; 1052 1053 case IPCTL_ANONPORTMAX: 1054 if (apmin >= tmp) 1055 return (EINVAL); 1056 #ifndef IPNOPRIVPORTS 1057 if (tmp < IPPORT_RESERVED) 1058 return (EINVAL); 1059 #endif /* IPNOPRIVPORTS */ 1060 break; 1061 1062 #ifndef IPNOPRIVPORTS 1063 case IPCTL_LOWPORTMIN: 1064 if (tmp >= lpmax || 1065 tmp > IPPORT_RESERVEDMAX || 1066 tmp < IPPORT_RESERVEDMIN) 1067 return (EINVAL); 1068 break; 1069 1070 case IPCTL_LOWPORTMAX: 1071 if (lpmin >= tmp || 1072 tmp > IPPORT_RESERVEDMAX || 1073 tmp < IPPORT_RESERVEDMIN) 1074 return (EINVAL); 1075 break; 1076 #endif /* IPNOPRIVPORTS */ 1077 1078 default: 1079 return (EINVAL); 1080 } 1081 1082 *(int*)rnode->sysctl_data = tmp; 1083 1084 return (0); 1085 } 1086 1087 /* 1088 * sysctl helper routine for the net.inet.tcp.ident and 1089 * net.inet6.tcp6.ident nodes. contains backwards compat code for the 1090 * old way of looking up the ident information for ipv4 which involves 1091 * stuffing the port/addr pairs into the mib lookup. 1092 */ 1093 static int 1094 sysctl_net_inet_tcp_ident(SYSCTLFN_ARGS) 1095 { 1096 #ifdef INET 1097 struct inpcb *inb; 1098 struct sockaddr_in *si4[2]; 1099 #endif /* INET */ 1100 #ifdef INET6 1101 struct in6pcb *in6b; 1102 struct sockaddr_in6 *si6[2]; 1103 #endif /* INET6 */ 1104 struct sockaddr_storage sa[2]; 1105 struct socket *sockp; 1106 size_t sz; 1107 uid_t uid; 1108 int error, pf; 1109 1110 if (namelen != 4 && namelen != 0) 1111 return (EINVAL); 1112 if (name[-2] != IPPROTO_TCP) 1113 return (EINVAL); 1114 pf = name[-3]; 1115 1116 /* old style lookup, ipv4 only */ 1117 if (namelen == 4) { 1118 #ifdef INET 1119 struct in_addr laddr, raddr; 1120 u_int lport, rport; 1121 1122 if (pf != PF_INET) 1123 return (EPROTONOSUPPORT); 1124 raddr.s_addr = (uint32_t)name[0]; 1125 rport = (u_int)name[1]; 1126 laddr.s_addr = (uint32_t)name[2]; 1127 lport = (u_int)name[3]; 1128 inb = in_pcblookup_connect(&tcbtable, raddr, rport, 1129 laddr, lport); 1130 if (inb == NULL || (sockp = inb->inp_socket) == NULL) 1131 return (ESRCH); 1132 uid = sockp->so_uidinfo->ui_uid; 1133 if (oldp) { 1134 sz = MIN(sizeof(uid), *oldlenp); 1135 error = copyout(&uid, oldp, sz); 1136 if (error) 1137 return (error); 1138 } 1139 *oldlenp = sizeof(uid); 1140 return (0); 1141 #else /* INET */ 1142 return (EINVAL); 1143 #endif /* INET */ 1144 } 1145 1146 if (newp == NULL || newlen != sizeof(sa)) 1147 return (EINVAL); 1148 error = copyin(newp, &sa, newlen); 1149 if (error) 1150 return (error); 1151 1152 /* 1153 * requested families must match 1154 */ 1155 if (pf != sa[0].ss_family || sa[0].ss_family != sa[1].ss_family) 1156 return (EINVAL); 1157 1158 switch (pf) { 1159 #ifdef INET 1160 case PF_INET: 1161 si4[0] = (struct sockaddr_in*)&sa[0]; 1162 si4[1] = (struct sockaddr_in*)&sa[1]; 1163 if (si4[0]->sin_len != sizeof(*si4[0]) || 1164 si4[0]->sin_len != si4[1]->sin_len) 1165 return (EINVAL); 1166 inb = in_pcblookup_connect(&tcbtable, 1167 si4[0]->sin_addr, si4[0]->sin_port, 1168 si4[1]->sin_addr, si4[1]->sin_port); 1169 if (inb == NULL || (sockp = inb->inp_socket) == NULL) 1170 return (ESRCH); 1171 break; 1172 #endif /* INET */ 1173 #ifdef INET6 1174 case PF_INET6: 1175 si6[0] = (struct sockaddr_in6*)&sa[0]; 1176 si6[1] = (struct sockaddr_in6*)&sa[1]; 1177 if (si6[0]->sin6_len != sizeof(*si6[0]) || 1178 si6[0]->sin6_len != si6[1]->sin6_len) 1179 return (EINVAL); 1180 in6b = in6_pcblookup_connect(&tcbtable, 1181 &si6[0]->sin6_addr, si6[0]->sin6_port, 1182 &si6[1]->sin6_addr, si6[1]->sin6_port, 0); 1183 if (in6b == NULL || (sockp = in6b->in6p_socket) == NULL) 1184 return (ESRCH); 1185 break; 1186 #endif /* INET6 */ 1187 default: 1188 return (EPROTONOSUPPORT); 1189 } 1190 *oldlenp = sizeof(uid); 1191 1192 uid = sockp->so_uidinfo->ui_uid; 1193 if (oldp) { 1194 sz = MIN(sizeof(uid), *oldlenp); 1195 error = copyout(&uid, oldp, sz); 1196 if (error) 1197 return (error); 1198 } 1199 *oldlenp = sizeof(uid); 1200 1201 return (0); 1202 } 1203 1204 /* 1205 * sysctl helper for the inet and inet6 pcblists. handles tcp/udp and 1206 * inet/inet6, as well as raw pcbs for each. specifically not 1207 * declared static so that raw sockets and udp/udp6 can use it as 1208 * well. 1209 */ 1210 int 1211 sysctl_inpcblist(SYSCTLFN_ARGS) 1212 { 1213 #ifdef INET 1214 struct sockaddr_in *in; 1215 const struct inpcb *inp; 1216 #endif 1217 #ifdef INET6 1218 struct sockaddr_in6 *in6; 1219 const struct in6pcb *in6p; 1220 #endif 1221 /* 1222 * sysctl_data is const, but CIRCLEQ_FOREACH can't use a const 1223 * struct inpcbtable pointer, so we have to discard const. :-/ 1224 */ 1225 struct inpcbtable *pcbtbl = __UNCONST(rnode->sysctl_data); 1226 const struct inpcb_hdr *inph; 1227 struct tcpcb *tp; 1228 struct kinfo_pcb pcb; 1229 char *dp; 1230 u_int op, arg; 1231 size_t len, needed, elem_size, out_size; 1232 int error, elem_count, pf, proto, pf2; 1233 1234 if (namelen != 4) 1235 return (EINVAL); 1236 1237 if (oldp != NULL) { 1238 len = *oldlenp; 1239 elem_size = name[2]; 1240 elem_count = name[3]; 1241 if (elem_size != sizeof(pcb)) 1242 return EINVAL; 1243 } else { 1244 len = 0; 1245 elem_count = INT_MAX; 1246 elem_size = sizeof(pcb); 1247 } 1248 error = 0; 1249 dp = oldp; 1250 op = name[0]; 1251 arg = name[1]; 1252 out_size = elem_size; 1253 needed = 0; 1254 1255 if (namelen == 1 && name[0] == CTL_QUERY) 1256 return (sysctl_query(SYSCTLFN_CALL(rnode))); 1257 1258 if (name - oname != 4) 1259 return (EINVAL); 1260 1261 pf = oname[1]; 1262 proto = oname[2]; 1263 pf2 = (oldp != NULL) ? pf : 0; 1264 1265 CIRCLEQ_FOREACH(inph, &pcbtbl->inpt_queue, inph_queue) { 1266 #ifdef INET 1267 inp = (const struct inpcb *)inph; 1268 #endif 1269 #ifdef INET6 1270 in6p = (const struct in6pcb *)inph; 1271 #endif 1272 1273 if (inph->inph_af != pf) 1274 continue; 1275 1276 if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET, 1277 KAUTH_REQ_NETWORK_SOCKET_CANSEE, inph->inph_socket, NULL, 1278 NULL) != 0) 1279 continue; 1280 1281 memset(&pcb, 0, sizeof(pcb)); 1282 1283 pcb.ki_family = pf; 1284 pcb.ki_type = proto; 1285 1286 switch (pf2) { 1287 case 0: 1288 /* just probing for size */ 1289 break; 1290 #ifdef INET 1291 case PF_INET: 1292 pcb.ki_family = inp->inp_socket->so_proto-> 1293 pr_domain->dom_family; 1294 pcb.ki_type = inp->inp_socket->so_proto-> 1295 pr_type; 1296 pcb.ki_protocol = inp->inp_socket->so_proto-> 1297 pr_protocol; 1298 pcb.ki_pflags = inp->inp_flags; 1299 1300 pcb.ki_sostate = inp->inp_socket->so_state; 1301 pcb.ki_prstate = inp->inp_state; 1302 if (proto == IPPROTO_TCP) { 1303 tp = intotcpcb(inp); 1304 pcb.ki_tstate = tp->t_state; 1305 pcb.ki_tflags = tp->t_flags; 1306 } 1307 1308 pcb.ki_pcbaddr = PTRTOUINT64(inp); 1309 pcb.ki_ppcbaddr = PTRTOUINT64(inp->inp_ppcb); 1310 pcb.ki_sockaddr = PTRTOUINT64(inp->inp_socket); 1311 1312 pcb.ki_rcvq = inp->inp_socket->so_rcv.sb_cc; 1313 pcb.ki_sndq = inp->inp_socket->so_snd.sb_cc; 1314 1315 in = satosin(&pcb.ki_src); 1316 in->sin_len = sizeof(*in); 1317 in->sin_family = pf; 1318 in->sin_port = inp->inp_lport; 1319 in->sin_addr = inp->inp_laddr; 1320 if (pcb.ki_prstate >= INP_CONNECTED) { 1321 in = satosin(&pcb.ki_dst); 1322 in->sin_len = sizeof(*in); 1323 in->sin_family = pf; 1324 in->sin_port = inp->inp_fport; 1325 in->sin_addr = inp->inp_faddr; 1326 } 1327 break; 1328 #endif 1329 #ifdef INET6 1330 case PF_INET6: 1331 pcb.ki_family = in6p->in6p_socket->so_proto-> 1332 pr_domain->dom_family; 1333 pcb.ki_type = in6p->in6p_socket->so_proto->pr_type; 1334 pcb.ki_protocol = in6p->in6p_socket->so_proto-> 1335 pr_protocol; 1336 pcb.ki_pflags = in6p->in6p_flags; 1337 1338 pcb.ki_sostate = in6p->in6p_socket->so_state; 1339 pcb.ki_prstate = in6p->in6p_state; 1340 if (proto == IPPROTO_TCP) { 1341 tp = in6totcpcb(in6p); 1342 pcb.ki_tstate = tp->t_state; 1343 pcb.ki_tflags = tp->t_flags; 1344 } 1345 1346 pcb.ki_pcbaddr = PTRTOUINT64(in6p); 1347 pcb.ki_ppcbaddr = PTRTOUINT64(in6p->in6p_ppcb); 1348 pcb.ki_sockaddr = PTRTOUINT64(in6p->in6p_socket); 1349 1350 pcb.ki_rcvq = in6p->in6p_socket->so_rcv.sb_cc; 1351 pcb.ki_sndq = in6p->in6p_socket->so_snd.sb_cc; 1352 1353 in6 = satosin6(&pcb.ki_src); 1354 in6->sin6_len = sizeof(*in6); 1355 in6->sin6_family = pf; 1356 in6->sin6_port = in6p->in6p_lport; 1357 in6->sin6_flowinfo = in6p->in6p_flowinfo; 1358 in6->sin6_addr = in6p->in6p_laddr; 1359 in6->sin6_scope_id = 0; /* XXX? */ 1360 1361 if (pcb.ki_prstate >= IN6P_CONNECTED) { 1362 in6 = satosin6(&pcb.ki_dst); 1363 in6->sin6_len = sizeof(*in6); 1364 in6->sin6_family = pf; 1365 in6->sin6_port = in6p->in6p_fport; 1366 in6->sin6_flowinfo = in6p->in6p_flowinfo; 1367 in6->sin6_addr = in6p->in6p_faddr; 1368 in6->sin6_scope_id = 0; /* XXX? */ 1369 } 1370 break; 1371 #endif 1372 } 1373 1374 if (len >= elem_size && elem_count > 0) { 1375 error = copyout(&pcb, dp, out_size); 1376 if (error) 1377 return (error); 1378 dp += elem_size; 1379 len -= elem_size; 1380 } 1381 if (elem_count > 0) { 1382 needed += elem_size; 1383 if (elem_count != INT_MAX) 1384 elem_count--; 1385 } 1386 } 1387 1388 *oldlenp = needed; 1389 if (oldp == NULL) 1390 *oldlenp += PCB_SLOP * sizeof(struct kinfo_pcb); 1391 1392 return (error); 1393 } 1394 1395 static int 1396 sysctl_tcp_congctl(SYSCTLFN_ARGS) 1397 { 1398 struct sysctlnode node; 1399 int error, r; 1400 char newname[TCPCC_MAXLEN]; 1401 1402 strlcpy(newname, tcp_congctl_global_name, sizeof(newname) - 1); 1403 1404 node = *rnode; 1405 node.sysctl_data = newname; 1406 node.sysctl_size = sizeof(newname); 1407 1408 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1409 1410 if (error || 1411 newp == NULL || 1412 strncmp(newname, tcp_congctl_global_name, sizeof(newname)) == 0) 1413 return error; 1414 1415 if ((r = tcp_congctl_select(NULL, newname))) 1416 return r; 1417 1418 return error; 1419 } 1420 1421 /* 1422 * this (second stage) setup routine is a replacement for tcp_sysctl() 1423 * (which is currently used for ipv4 and ipv6) 1424 */ 1425 static void 1426 sysctl_net_inet_tcp_setup2(struct sysctllog **clog, int pf, const char *pfname, 1427 const char *tcpname) 1428 { 1429 const struct sysctlnode *sack_node; 1430 const struct sysctlnode *abc_node; 1431 const struct sysctlnode *ecn_node; 1432 const struct sysctlnode *congctl_node; 1433 #ifdef TCP_DEBUG 1434 extern struct tcp_debug tcp_debug[TCP_NDEBUG]; 1435 extern int tcp_debx; 1436 #endif 1437 1438 sysctl_createv(clog, 0, NULL, NULL, 1439 CTLFLAG_PERMANENT, 1440 CTLTYPE_NODE, "net", NULL, 1441 NULL, 0, NULL, 0, 1442 CTL_NET, CTL_EOL); 1443 sysctl_createv(clog, 0, NULL, NULL, 1444 CTLFLAG_PERMANENT, 1445 CTLTYPE_NODE, pfname, NULL, 1446 NULL, 0, NULL, 0, 1447 CTL_NET, pf, CTL_EOL); 1448 sysctl_createv(clog, 0, NULL, NULL, 1449 CTLFLAG_PERMANENT, 1450 CTLTYPE_NODE, tcpname, 1451 SYSCTL_DESCR("TCP related settings"), 1452 NULL, 0, NULL, 0, 1453 CTL_NET, pf, IPPROTO_TCP, CTL_EOL); 1454 1455 sysctl_createv(clog, 0, NULL, NULL, 1456 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1457 CTLTYPE_INT, "rfc1323", 1458 SYSCTL_DESCR("Enable RFC1323 TCP extensions"), 1459 NULL, 0, &tcp_do_rfc1323, 0, 1460 CTL_NET, pf, IPPROTO_TCP, TCPCTL_RFC1323, CTL_EOL); 1461 sysctl_createv(clog, 0, NULL, NULL, 1462 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1463 CTLTYPE_INT, "sendspace", 1464 SYSCTL_DESCR("Default TCP send buffer size"), 1465 NULL, 0, &tcp_sendspace, 0, 1466 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SENDSPACE, CTL_EOL); 1467 sysctl_createv(clog, 0, NULL, NULL, 1468 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1469 CTLTYPE_INT, "recvspace", 1470 SYSCTL_DESCR("Default TCP receive buffer size"), 1471 NULL, 0, &tcp_recvspace, 0, 1472 CTL_NET, pf, IPPROTO_TCP, TCPCTL_RECVSPACE, CTL_EOL); 1473 sysctl_createv(clog, 0, NULL, NULL, 1474 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1475 CTLTYPE_INT, "mssdflt", 1476 SYSCTL_DESCR("Default maximum segment size"), 1477 sysctl_net_inet_tcp_mssdflt, 0, &tcp_mssdflt, 0, 1478 CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSSDFLT, CTL_EOL); 1479 sysctl_createv(clog, 0, NULL, NULL, 1480 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1481 CTLTYPE_INT, "syn_cache_limit", 1482 SYSCTL_DESCR("Maximum number of entries in the TCP " 1483 "compressed state engine"), 1484 NULL, 0, &tcp_syn_cache_limit, 0, 1485 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_CACHE_LIMIT, 1486 CTL_EOL); 1487 sysctl_createv(clog, 0, NULL, NULL, 1488 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1489 CTLTYPE_INT, "syn_bucket_limit", 1490 SYSCTL_DESCR("Maximum number of entries per hash " 1491 "bucket in the TCP compressed state " 1492 "engine"), 1493 NULL, 0, &tcp_syn_bucket_limit, 0, 1494 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_BUCKET_LIMIT, 1495 CTL_EOL); 1496 #if 0 /* obsoleted */ 1497 sysctl_createv(clog, 0, NULL, NULL, 1498 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1499 CTLTYPE_INT, "syn_cache_interval", 1500 SYSCTL_DESCR("TCP compressed state engine's timer interval"), 1501 NULL, 0, &tcp_syn_cache_interval, 0, 1502 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SYN_CACHE_INTER, 1503 CTL_EOL); 1504 #endif 1505 sysctl_createv(clog, 0, NULL, NULL, 1506 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1507 CTLTYPE_INT, "init_win", 1508 SYSCTL_DESCR("Initial TCP congestion window"), 1509 NULL, 0, &tcp_init_win, 0, 1510 CTL_NET, pf, IPPROTO_TCP, TCPCTL_INIT_WIN, CTL_EOL); 1511 sysctl_createv(clog, 0, NULL, NULL, 1512 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1513 CTLTYPE_INT, "mss_ifmtu", 1514 SYSCTL_DESCR("Use interface MTU for calculating MSS"), 1515 NULL, 0, &tcp_mss_ifmtu, 0, 1516 CTL_NET, pf, IPPROTO_TCP, TCPCTL_MSS_IFMTU, CTL_EOL); 1517 sysctl_createv(clog, 0, NULL, &sack_node, 1518 CTLFLAG_PERMANENT, 1519 CTLTYPE_NODE, "sack", 1520 SYSCTL_DESCR("RFC2018 Selective ACKnowledgement tunables"), 1521 NULL, 0, NULL, 0, 1522 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_EOL); 1523 1524 /* Congctl subtree */ 1525 sysctl_createv(clog, 0, NULL, &congctl_node, 1526 CTLFLAG_PERMANENT, 1527 CTLTYPE_NODE, "congctl", 1528 SYSCTL_DESCR("TCP Congestion Control"), 1529 NULL, 0, NULL, 0, 1530 CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); 1531 sysctl_createv(clog, 0, &congctl_node, NULL, 1532 CTLFLAG_PERMANENT, 1533 CTLTYPE_STRING, "available", 1534 SYSCTL_DESCR("Available Congestion Control Mechanisms"), 1535 NULL, 0, &tcp_congctl_avail, 0, CTL_CREATE, CTL_EOL); 1536 sysctl_createv(clog, 0, &congctl_node, NULL, 1537 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1538 CTLTYPE_STRING, "selected", 1539 SYSCTL_DESCR("Selected Congestion Control Mechanism"), 1540 sysctl_tcp_congctl, 0, NULL, TCPCC_MAXLEN, 1541 CTL_CREATE, CTL_EOL); 1542 1543 sysctl_createv(clog, 0, NULL, NULL, 1544 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1545 CTLTYPE_INT, "win_scale", 1546 SYSCTL_DESCR("Use RFC1323 window scale options"), 1547 NULL, 0, &tcp_do_win_scale, 0, 1548 CTL_NET, pf, IPPROTO_TCP, TCPCTL_WSCALE, CTL_EOL); 1549 sysctl_createv(clog, 0, NULL, NULL, 1550 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1551 CTLTYPE_INT, "timestamps", 1552 SYSCTL_DESCR("Use RFC1323 time stamp options"), 1553 NULL, 0, &tcp_do_timestamps, 0, 1554 CTL_NET, pf, IPPROTO_TCP, TCPCTL_TSTAMP, CTL_EOL); 1555 sysctl_createv(clog, 0, NULL, NULL, 1556 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1557 CTLTYPE_INT, "compat_42", 1558 SYSCTL_DESCR("Enable workarounds for 4.2BSD TCP bugs"), 1559 NULL, 0, &tcp_compat_42, 0, 1560 CTL_NET, pf, IPPROTO_TCP, TCPCTL_COMPAT_42, CTL_EOL); 1561 sysctl_createv(clog, 0, NULL, NULL, 1562 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1563 CTLTYPE_INT, "cwm", 1564 SYSCTL_DESCR("Hughes/Touch/Heidemann Congestion Window " 1565 "Monitoring"), 1566 NULL, 0, &tcp_cwm, 0, 1567 CTL_NET, pf, IPPROTO_TCP, TCPCTL_CWM, CTL_EOL); 1568 sysctl_createv(clog, 0, NULL, NULL, 1569 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1570 CTLTYPE_INT, "cwm_burstsize", 1571 SYSCTL_DESCR("Congestion Window Monitoring allowed " 1572 "burst count in packets"), 1573 NULL, 0, &tcp_cwm_burstsize, 0, 1574 CTL_NET, pf, IPPROTO_TCP, TCPCTL_CWM_BURSTSIZE, 1575 CTL_EOL); 1576 sysctl_createv(clog, 0, NULL, NULL, 1577 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1578 CTLTYPE_INT, "ack_on_push", 1579 SYSCTL_DESCR("Immediately return ACK when PSH is " 1580 "received"), 1581 NULL, 0, &tcp_ack_on_push, 0, 1582 CTL_NET, pf, IPPROTO_TCP, TCPCTL_ACK_ON_PUSH, CTL_EOL); 1583 sysctl_createv(clog, 0, NULL, NULL, 1584 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1585 CTLTYPE_INT, "keepidle", 1586 SYSCTL_DESCR("Allowed connection idle ticks before a " 1587 "keepalive probe is sent"), 1588 NULL, 0, &tcp_keepidle, 0, 1589 CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPIDLE, CTL_EOL); 1590 sysctl_createv(clog, 0, NULL, NULL, 1591 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1592 CTLTYPE_INT, "keepintvl", 1593 SYSCTL_DESCR("Ticks before next keepalive probe is sent"), 1594 NULL, 0, &tcp_keepintvl, 0, 1595 CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPINTVL, CTL_EOL); 1596 sysctl_createv(clog, 0, NULL, NULL, 1597 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1598 CTLTYPE_INT, "keepcnt", 1599 SYSCTL_DESCR("Number of keepalive probes to send"), 1600 NULL, 0, &tcp_keepcnt, 0, 1601 CTL_NET, pf, IPPROTO_TCP, TCPCTL_KEEPCNT, CTL_EOL); 1602 sysctl_createv(clog, 0, NULL, NULL, 1603 CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE, 1604 CTLTYPE_INT, "slowhz", 1605 SYSCTL_DESCR("Keepalive ticks per second"), 1606 NULL, PR_SLOWHZ, NULL, 0, 1607 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SLOWHZ, CTL_EOL); 1608 sysctl_createv(clog, 0, NULL, NULL, 1609 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1610 CTLTYPE_INT, "log_refused", 1611 SYSCTL_DESCR("Log refused TCP connections"), 1612 NULL, 0, &tcp_log_refused, 0, 1613 CTL_NET, pf, IPPROTO_TCP, TCPCTL_LOG_REFUSED, CTL_EOL); 1614 #if 0 /* obsoleted */ 1615 sysctl_createv(clog, 0, NULL, NULL, 1616 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1617 CTLTYPE_INT, "rstratelimit", NULL, 1618 NULL, 0, &tcp_rst_ratelim, 0, 1619 CTL_NET, pf, IPPROTO_TCP, TCPCTL_RSTRATELIMIT, CTL_EOL); 1620 #endif 1621 sysctl_createv(clog, 0, NULL, NULL, 1622 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1623 CTLTYPE_INT, "rstppslimit", 1624 SYSCTL_DESCR("Maximum number of RST packets to send " 1625 "per second"), 1626 NULL, 0, &tcp_rst_ppslim, 0, 1627 CTL_NET, pf, IPPROTO_TCP, TCPCTL_RSTPPSLIMIT, CTL_EOL); 1628 sysctl_createv(clog, 0, NULL, NULL, 1629 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1630 CTLTYPE_INT, "delack_ticks", 1631 SYSCTL_DESCR("Number of ticks to delay sending an ACK"), 1632 NULL, 0, &tcp_delack_ticks, 0, 1633 CTL_NET, pf, IPPROTO_TCP, TCPCTL_DELACK_TICKS, CTL_EOL); 1634 sysctl_createv(clog, 0, NULL, NULL, 1635 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1636 CTLTYPE_INT, "init_win_local", 1637 SYSCTL_DESCR("Initial TCP window size (in segments)"), 1638 NULL, 0, &tcp_init_win_local, 0, 1639 CTL_NET, pf, IPPROTO_TCP, TCPCTL_INIT_WIN_LOCAL, 1640 CTL_EOL); 1641 sysctl_createv(clog, 0, NULL, NULL, 1642 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1643 CTLTYPE_STRUCT, "ident", 1644 SYSCTL_DESCR("RFC1413 Identification Protocol lookups"), 1645 sysctl_net_inet_tcp_ident, 0, NULL, sizeof(uid_t), 1646 CTL_NET, pf, IPPROTO_TCP, TCPCTL_IDENT, CTL_EOL); 1647 sysctl_createv(clog, 0, NULL, NULL, 1648 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1649 CTLTYPE_INT, "do_loopback_cksum", 1650 SYSCTL_DESCR("Perform TCP checksum on loopback"), 1651 NULL, 0, &tcp_do_loopback_cksum, 0, 1652 CTL_NET, pf, IPPROTO_TCP, TCPCTL_LOOPBACKCKSUM, 1653 CTL_EOL); 1654 sysctl_createv(clog, 0, NULL, NULL, 1655 CTLFLAG_PERMANENT, 1656 CTLTYPE_STRUCT, "pcblist", 1657 SYSCTL_DESCR("TCP protocol control block list"), 1658 sysctl_inpcblist, 0, &tcbtable, 0, 1659 CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, 1660 CTL_EOL); 1661 1662 /* ECN subtree */ 1663 sysctl_createv(clog, 0, NULL, &ecn_node, 1664 CTLFLAG_PERMANENT, 1665 CTLTYPE_NODE, "ecn", 1666 SYSCTL_DESCR("RFC3168 Explicit Congestion Notification"), 1667 NULL, 0, NULL, 0, 1668 CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); 1669 sysctl_createv(clog, 0, &ecn_node, NULL, 1670 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1671 CTLTYPE_INT, "enable", 1672 SYSCTL_DESCR("Enable TCP Explicit Congestion " 1673 "Notification"), 1674 NULL, 0, &tcp_do_ecn, 0, CTL_CREATE, CTL_EOL); 1675 sysctl_createv(clog, 0, &ecn_node, NULL, 1676 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1677 CTLTYPE_INT, "maxretries", 1678 SYSCTL_DESCR("Number of times to retry ECN setup " 1679 "before disabling ECN on the connection"), 1680 NULL, 0, &tcp_ecn_maxretries, 0, CTL_CREATE, CTL_EOL); 1681 1682 /* SACK gets it's own little subtree. */ 1683 sysctl_createv(clog, 0, NULL, &sack_node, 1684 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1685 CTLTYPE_INT, "enable", 1686 SYSCTL_DESCR("Enable RFC2018 Selective ACKnowledgement"), 1687 NULL, 0, &tcp_do_sack, 0, 1688 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); 1689 sysctl_createv(clog, 0, NULL, &sack_node, 1690 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1691 CTLTYPE_INT, "maxholes", 1692 SYSCTL_DESCR("Maximum number of TCP SACK holes allowed per connection"), 1693 NULL, 0, &tcp_sack_tp_maxholes, 0, 1694 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); 1695 sysctl_createv(clog, 0, NULL, &sack_node, 1696 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1697 CTLTYPE_INT, "globalmaxholes", 1698 SYSCTL_DESCR("Global maximum number of TCP SACK holes"), 1699 NULL, 0, &tcp_sack_globalmaxholes, 0, 1700 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); 1701 sysctl_createv(clog, 0, NULL, &sack_node, 1702 CTLFLAG_PERMANENT, 1703 CTLTYPE_INT, "globalholes", 1704 SYSCTL_DESCR("Global number of TCP SACK holes"), 1705 NULL, 0, &tcp_sack_globalholes, 0, 1706 CTL_NET, pf, IPPROTO_TCP, TCPCTL_SACK, CTL_CREATE, CTL_EOL); 1707 1708 sysctl_createv(clog, 0, NULL, NULL, 1709 CTLFLAG_PERMANENT, 1710 CTLTYPE_STRUCT, "stats", 1711 SYSCTL_DESCR("TCP statistics"), 1712 NULL, 0, &tcpstat, sizeof(tcpstat), 1713 CTL_NET, pf, IPPROTO_TCP, TCPCTL_STATS, 1714 CTL_EOL); 1715 #ifdef TCP_DEBUG 1716 sysctl_createv(clog, 0, NULL, NULL, 1717 CTLFLAG_PERMANENT, 1718 CTLTYPE_STRUCT, "debug", 1719 SYSCTL_DESCR("TCP sockets debug information"), 1720 NULL, 0, &tcp_debug, sizeof(tcp_debug), 1721 CTL_NET, pf, IPPROTO_TCP, TCPCTL_DEBUG, 1722 CTL_EOL); 1723 sysctl_createv(clog, 0, NULL, NULL, 1724 CTLFLAG_PERMANENT, 1725 CTLTYPE_INT, "debx", 1726 SYSCTL_DESCR("Number of TCP debug sockets messages"), 1727 NULL, 0, &tcp_debx, sizeof(tcp_debx), 1728 CTL_NET, pf, IPPROTO_TCP, TCPCTL_DEBX, 1729 CTL_EOL); 1730 #endif 1731 #if NRND > 0 1732 sysctl_createv(clog, 0, NULL, NULL, 1733 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1734 CTLTYPE_INT, "iss_hash", 1735 SYSCTL_DESCR("Enable RFC 1948 ISS by cryptographic " 1736 "hash computation"), 1737 NULL, 0, &tcp_do_rfc1948, sizeof(tcp_do_rfc1948), 1738 CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, 1739 CTL_EOL); 1740 #endif 1741 1742 /* ABC subtree */ 1743 1744 sysctl_createv(clog, 0, NULL, &abc_node, 1745 CTLFLAG_PERMANENT, CTLTYPE_NODE, "abc", 1746 SYSCTL_DESCR("RFC3465 Appropriate Byte Counting (ABC)"), 1747 NULL, 0, NULL, 0, 1748 CTL_NET, pf, IPPROTO_TCP, CTL_CREATE, CTL_EOL); 1749 sysctl_createv(clog, 0, &abc_node, NULL, 1750 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1751 CTLTYPE_INT, "enable", 1752 SYSCTL_DESCR("Enable RFC3465 Appropriate Byte Counting"), 1753 NULL, 0, &tcp_do_abc, 0, CTL_CREATE, CTL_EOL); 1754 sysctl_createv(clog, 0, &abc_node, NULL, 1755 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1756 CTLTYPE_INT, "aggressive", 1757 SYSCTL_DESCR("1: L=2*SMSS 0: L=1*SMSS"), 1758 NULL, 0, &tcp_abc_aggressive, 0, CTL_CREATE, CTL_EOL); 1759 } 1760 1761 /* 1762 * Sysctl for tcp variables. 1763 */ 1764 #ifdef INET 1765 SYSCTL_SETUP(sysctl_net_inet_tcp_setup, "sysctl net.inet.tcp subtree setup") 1766 { 1767 1768 sysctl_net_inet_tcp_setup2(clog, PF_INET, "inet", "tcp"); 1769 } 1770 #endif /* INET */ 1771 1772 #ifdef INET6 1773 SYSCTL_SETUP(sysctl_net_inet6_tcp6_setup, "sysctl net.inet6.tcp6 subtree setup") 1774 { 1775 1776 sysctl_net_inet_tcp_setup2(clog, PF_INET6, "inet6", "tcp6"); 1777 } 1778 #endif /* INET6 */ 1779