/*	$OpenBSD: tcp_usrreq.c,v 1.230 2024/02/11 01:27:45 bluhm Exp $	*/
/*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/proc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;

const struct pr_usrreqs tcp_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};

#ifdef INET6
const struct pr_usrreqs tcp6_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in6_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
#endif

const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
	{ TCPCTL_TSO, &tcp_do_tso, 0, 1 },
};

struct	inpcbtable tcbtable;

int	tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int	tcp_ident(void *, size_t *, void *, size_t, int);

static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
    struct tcpcb **);

static inline int
tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
{
	struct inpcb *inp;
	struct tcpcb *tp;

	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
		if (so->so_error)
			return so->so_error;
		return EINVAL;
	}

	*rinp = inp;
	*rtp = tp;

	return 0;
}

/*
 * Export internal TCP state information via a struct tcp_info without
 * leaking any sensitive information. Sequence numbers are reported
 * relative to the initial sequence number.
 */
int
tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
{
	struct proc *p = curproc;
	struct tcp_info *ti;
	u_int t = 1000;		/* msec => usec */
	uint64_t now;

	if (sizeof(*ti) > MLEN) {
		MCLGETL(m, M_WAITOK, sizeof(*ti));
		if (!ISSET(m->m_flags, M_EXT))
			return ENOMEM;
	}
	ti = mtod(m, struct tcp_info *);
	m->m_len = sizeof(*ti);
	memset(ti, 0, sizeof(*ti));
	now = tcp_now();

	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
#ifdef TCP_ECN
	if (tp->t_flags & TF_ECN_PERMIT)
		ti->tcpi_options |= TCPI_OPT_ECN;
#endif

	ti->tcpi_rto = tp->t_rxtcur * t;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_peermss;

	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;

	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	ti->tcpi_rcv_space = tp->rcv_wnd;

	/*
	 * Provide only minimal information for unprivileged processes.
	 */
	if (suser(p) != 0)
		return 0;

	/* FreeBSD-specific extension fields for tcp_info. */
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
	/* missing tcpi_toe_tid */
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;

	/* OpenBSD extensions */
	ti->tcpi_rttmin = tp->t_rttmin * t;
	ti->tcpi_max_sndwnd = tp->max_sndwnd;
	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
	ti->tcpi_snd_una = tp->snd_una - tp->iss;
	ti->tcpi_snd_up = tp->snd_up - tp->iss;
	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
	ti->tcpi_snd_max = tp->snd_max - tp->iss;

	ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;

	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;

	return 0;
}

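/*
 * Handle get and set requests for TCP-level socket options; requests
 * at any other protocol level are passed down to ip_ctloutput() or
 * ip6_ctloutput().
 */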
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
#ifdef INET6
		if (ISSET(inp->inp_flags, INP_IPV6))
			error = ip6_ctloutput(op, so, level, optname, m);
		else
#endif /* INET6 */
			error = ip_ctloutput(op, so, level, optname, m);
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case TCP_NODELAY:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->sack_enable;
			break;
		case TCP_INFO:
			error = tcp_fill_info(tp, so, m);
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto, int wait)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable, wait);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp, wait);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}

int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (0);
}

/*
 * Give the socket an address.
 */
int
tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	error = in_pcbbind(inp, nam, p);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
	return (error);
}

/*
 * Prepare to accept connections.
 */
int
tcp_listen(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	if (inp->inp_lport == 0)
		if ((error = in_pcbbind(inp, NULL, curproc)))
			goto out;

	/*
	 * If the in_pcbbind() above is called, the tp->pf
	 * should still be whatever it was before.
	 */
	tp->t_state = TCPS_LISTEN;

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
	return (error);
}

/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
 */
int
tcp_connect(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

#ifdef INET6
	if (inp->inp_flags & INP_IPV6) {
		struct sockaddr_in6 *sin6;

		if ((error = in6_nam2sin6(nam, &sin6)))
			goto out;
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
			error = EINVAL;
			goto out;
		}
	} else
#endif /* INET6 */
	{
		struct sockaddr_in *sin;

		if ((error = in_nam2sin(nam, &sin)))
			goto out;
		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
		    IN_MULTICAST(sin->sin_addr.s_addr) ||
		    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
			error = EINVAL;
			goto out;
		}
	}
	error = in_pcbconnect(inp, nam);
	if (error)
		goto out;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		in_pcbunset_faddr(inp);
		in_pcbdisconnect(inp);
		error = ENOBUFS;
		goto out;
	}

	so->so_state |= SS_CONNECTOUT;

	/* Compute window scaling to request.  */
	tcp_rscale(tp, sb_max);

	soisconnecting(so);
	tcpstat_inc(tcps_connattempt);
	tp->t_state = TCPS_SYN_SENT;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcp_set_iss_tsm(tp);
	tcp_sendseqinit(tp);
	tp->snd_last = tp->snd_una;
	error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher
 * levels; just return the address of the peer, storing through addr.
 */
int
tcp_accept(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_ACCEPT, 0);
	return (0);
}

/*
 * Initiate disconnect from peer.
 * If connection never passed embryonic stage, just drop;
 * else if don't need to let data drain, then can just drop anyways,
 * else have to begin TCP shutdown process: mark socket disconnecting,
 * drain unread data, state switch to reflect user close, and
 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
 * when peer sends FIN and acks ours.
 *
 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
 */
int
tcp_disconnect(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
	return (0);
}

/*
 * Mark the connection as being incapable of further output.
 */
int
tcp_shutdown(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	if (so->so_snd.sb_state & SS_CANTSENDMORE)
		goto out;

	socantsendmore(so);
	tp = tcp_usrclosed(tp);
	if (tp)
		error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
	return (error);
}

/*
 * After a receive, possibly send window update to peer.
 */
void
tcp_rcvd(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/*
	 * soreceive() calls this function when a user receives
	 * ancillary data on a listening socket. We don't call
	 * tcp_output in such a case, since there is no header
	 * template for a listening socket and hence the kernel
	 * will panic.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
		(void) tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.
 */
int
tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if (control && control->m_len) {
		error = EINVAL;
		goto out;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto out;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	sbappendstream(so, &so->so_snd, m);
	m = NULL;

	error = tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}

/*
 * Abort the TCP.
 */
void
tcp_abort(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_drop(tp, ECONNABORTED);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
}

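/*
 * Fill in stat(2) information for the socket; report the send buffer
 * high-water mark as the preferred I/O block size.
 */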
int
tcp_sense(struct socket *so, struct stat *ub)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	ub->st_blksize = so->so_snd.sb_hiwat;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
	return (0);
}

int
tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if ((so->so_oobmark == 0 &&
	    (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    tp->t_oobflags & TCPOOB_HADDATA) {
		error = EINVAL;
		goto out;
	}
	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = tp->t_iobc;
	if ((flags & MSG_PEEK) == 0)
		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
	return (error);
}

int
tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if (control && control->m_len) {
		error = EINVAL;
		goto release;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto release;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	if (sbspace(so, &so->so_snd) < -512) {
		error = ENOBUFS;
		goto out;
	}

	/*
	 * According to RFC961 (Assigned Protocols),
	 * the urgent pointer points to the last octet
	 * of urgent data.  We continue, however,
	 * to consider it to indicate the first octet
	 * of data past the urgent section.
	 * Otherwise, snd_up should be one lower.
	 */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;
	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
	tp->t_force = 1;
	error = tcp_output(tp);
	tp->t_force = 0;

out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);

release:
	m_freem(control);
	m_freem(m);

	return (error);
}

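/*
 * Return the local address and port that the socket is bound to.
 */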
int
tcp_sockaddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	in_setsockaddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
		    PRU_SOCKADDR, 0);
	return (0);
}

int
tcp_peeraddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_PEERADDR, 0);
	return (0);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_dodisconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcblookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		in_pcbunref(inp);
		return (error);
	}

	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	in_pcbunref(inp);
	return (error);
}

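/*
 * Export the TCP statistics counters, together with a snapshot of the
 * active SYN cache, as a struct tcpstat for the TCPCTL_STATS sysctl.
 */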
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters), NULL);
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);
	ASSIGN(tcps_outswtso);
	ASSIGN(tcps_outhwtso);
	ASSIGN(tcps_outpkttso);
	ASSIGN(tcps_outbadtso);
	ASSIGN(tcps_inswlro);
	ASSIGN(tcps_inhwlro);
	ASSIGN(tcps_inpktlro);
	ASSIGN(tcps_inbadlro);

#undef ASSIGN

	mtx_enter(&syn_cache_mtx);
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;
	mtx_leave(&syn_cache_mtx);

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_KEEPINITTIME:
		NET_LOCK();
		nval = tcptv_keep_init / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
		if (!error)
			tcptv_keep_init = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPIDLE:
		NET_LOCK();
		nval = tcp_keepidle / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
		if (!error)
			tcp_keepidle = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPINTVL:
		NET_LOCK();
		nval = tcp_keepintvl / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
		if (!error)
			tcp_keepintvl = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_REASS_LIMIT:
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit, 0, INT_MAX);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache. Also update the value in active cache.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 1, 100000);
		if (!error && nval != tcp_syn_hash_size) {
			/*
			 * If global hash size has been changed,
			 * switch sets as soon as possible.  Then
			 * the actual hash array will be reallocated.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		return (error);

	default:
		NET_LOCK();
		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
		NET_UNLOCK();
		return (error);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window; if the
 * receiver stops acking data, the window will shrink and therefore
 * the buffer size will shrink as well.
 * In a low memory situation try to shrink the buffer to the initial size,
 * disabling the send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		/* keep in sync with sbreserve() calculation */
		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(so, &so->so_snd, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If a large part of the recv buffer was
 * transferred during that time, increase the buffer by a constant.
 * In a low memory situation try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(so, &so->so_rcv, nmax);
}