/*	$OpenBSD: tcp_usrreq.c,v 1.239 2025/01/09 16:47:24 bluhm Exp $	*/
/*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/proc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

/*
 * Locks used to protect global variables in this file:
 *	I	immutable after creation
 */

#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;		/* [I] */
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;		/* [I] */
u_int	tcp_autorcvbuf_inc = 16 * 1024;		/* [I] */

const struct pr_usrreqs tcp_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};

#ifdef INET6
const struct pr_usrreqs tcp6_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in6_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
#endif

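/*
 * Illustrative sketch (not part of the kernel build): the pru_* handlers
 * above are reached through the generic socket layer, so an ordinary
 * userland client maps onto them roughly as follows.  The calls below are
 * the standard sockets API; the mapping is approximate.
 *
 *	int s = socket(AF_INET, SOCK_STREAM, 0);	 // tcp_attach()
 *	connect(s, (struct sockaddr *)&sin, sizeof(sin)); // tcp_connect()
 *	write(s, buf, len);				 // tcp_send()
 *	shutdown(s, SHUT_WR);				 // tcp_shutdown()
 *	close(s);				 // tcp_disconnect()/tcp_detach()
 */
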
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
	{ TCPCTL_TSO, &tcp_do_tso, 0, 1 },
};

struct inpcbtable tcbtable;
#ifdef INET6
struct inpcbtable tcb6table;
#endif

int	tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int	tcp_ident(void *, size_t *, void *, size_t, int);

static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
    struct tcpcb **);

static inline int
tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
{
	struct inpcb *inp;
	struct tcpcb *tp;

	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
		int error;

		if ((error = READ_ONCE(so->so_error)))
			return error;
		return EINVAL;
	}

	*rinp = inp;
	*rtp = tp;

	return 0;
}

/*
 * Export internal TCP state information via a struct tcp_info without
 * leaking any sensitive information. Sequence numbers are reported
 * relative to the initial sequence number.
 */
int
tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
{
	struct proc *p = curproc;
	struct tcp_info *ti;
	u_int t = 1000;		/* msec => usec */
	uint64_t now;

	if (sizeof(*ti) > MLEN) {
		MCLGETL(m, M_WAITOK, sizeof(*ti));
		if (!ISSET(m->m_flags, M_EXT))
			return ENOMEM;
	}
	ti = mtod(m, struct tcp_info *);
	m->m_len = sizeof(*ti);
	memset(ti, 0, sizeof(*ti));
	now = tcp_now();

	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
#ifdef TCP_ECN
	if (tp->t_flags & TF_ECN_PERMIT)
		ti->tcpi_options |= TCPI_OPT_ECN;
#endif

	ti->tcpi_rto = tp->t_rxtcur * t;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_peermss;

	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;

	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	ti->tcpi_rcv_space = tp->rcv_wnd;

	/*
	 * Provide only minimal information for unprivileged processes.
	 */
	if (suser(p) != 0)
		return 0;

	/* FreeBSD-specific extension fields for tcp_info. */
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
	/* missing tcpi_toe_tid */
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;

	/* OpenBSD extensions */
	ti->tcpi_rttmin = tp->t_rttmin * t;
	ti->tcpi_max_sndwnd = tp->max_sndwnd;
	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
	ti->tcpi_snd_una = tp->snd_una - tp->iss;
	ti->tcpi_snd_up = tp->snd_up - tp->iss;
	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
	ti->tcpi_snd_max = tp->snd_max - tp->iss;

	ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;

	mtx_enter(&so->so_rcv.sb_mtx);
	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
	mtx_leave(&so->so_rcv.sb_mtx);
	mtx_enter(&so->so_snd.sb_mtx);
	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
	mtx_leave(&so->so_snd.sb_mtx);

	return 0;
}

int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
#ifdef INET6
		if (ISSET(inp->inp_flags, INP_IPV6))
			error = ip6_ctloutput(op, so, level, optname, m);
		else
#endif
			error = ip_ctloutput(op, so, level, optname, m);
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case TCP_NODELAY:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->sack_enable;
			break;
		case TCP_INFO:
			error = tcp_fill_info(tp, so, m);
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

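/*
 * Illustrative userland sketch (an assumption about typical usage, not
 * kernel code) of the option paths handled by tcp_ctloutput() and
 * tcp_fill_info() above:
 *
 *	int one = 1;
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	// PRCO_SETOPT, TCP_NODELAY: disable Nagle on socket s
 *	setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 *	// PRCO_GETOPT, TCP_INFO: fetch the exported connection state
 *	getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len);
 *
 * Unprivileged callers only get the fields filled in before the suser()
 * check in tcp_fill_info().
 */
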
/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto, int wait)
{
	struct inpcbtable *table;
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
#ifdef INET6
	if (so->so_proto->pr_domain->dom_family == PF_INET6)
		table = &tcb6table;
	else
#endif
		table = &tcbtable;
	error = in_pcballoc(so, table, wait);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp, wait);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	if (ISSET(inp->inp_flags, INP_IPV6))
		tp->pf = PF_INET6;
	else
#endif
		tp->pf = PF_INET;
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}

int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (0);
}

553 */ 554 int 555 tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p) 556 { 557 struct inpcb *inp; 558 struct tcpcb *tp; 559 int error; 560 short ostate; 561 562 soassertlocked(so); 563 564 if ((error = tcp_sogetpcb(so, &inp, &tp))) 565 return (error); 566 567 if (so->so_options & SO_DEBUG) 568 ostate = tp->t_state; 569 570 error = in_pcbbind(inp, nam, p); 571 572 if (so->so_options & SO_DEBUG) 573 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0); 574 return (error); 575 } 576 577 /* 578 * Prepare to accept connections. 579 */ 580 int 581 tcp_listen(struct socket *so) 582 { 583 struct inpcb *inp; 584 struct tcpcb *tp, *otp = NULL; 585 int error; 586 short ostate; 587 588 soassertlocked(so); 589 590 if ((error = tcp_sogetpcb(so, &inp, &tp))) 591 return (error); 592 593 if (so->so_options & SO_DEBUG) { 594 otp = tp; 595 ostate = tp->t_state; 596 } 597 598 if (inp->inp_lport == 0) 599 if ((error = in_pcbbind(inp, NULL, curproc))) 600 goto out; 601 602 /* 603 * If the in_pcbbind() above is called, the tp->pf 604 * should still be whatever it was before. 605 */ 606 tp->t_state = TCPS_LISTEN; 607 608 out: 609 if (otp) 610 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0); 611 return (error); 612 } 613 614 /* 615 * Initiate connection to peer. 616 * Create a template for use in transmissions on this connection. 617 * Enter SYN_SENT state, and mark socket as connecting. 618 * Start keep-alive timer, and seed output sequence space. 619 * Send initial segment on connection. 620 */ 621 int 622 tcp_connect(struct socket *so, struct mbuf *nam) 623 { 624 struct inpcb *inp; 625 struct tcpcb *tp, *otp = NULL; 626 int error; 627 short ostate; 628 629 soassertlocked(so); 630 631 if ((error = tcp_sogetpcb(so, &inp, &tp))) 632 return (error); 633 634 if (so->so_options & SO_DEBUG) { 635 otp = tp; 636 ostate = tp->t_state; 637 } 638 639 #ifdef INET6 640 if (ISSET(inp->inp_flags, INP_IPV6)) { 641 struct sockaddr_in6 *sin6; 642 643 if ((error = in6_nam2sin6(nam, &sin6))) 644 goto out; 645 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 646 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 647 error = EINVAL; 648 goto out; 649 } 650 } else 651 #endif 652 { 653 struct sockaddr_in *sin; 654 655 if ((error = in_nam2sin(nam, &sin))) 656 goto out; 657 if ((sin->sin_addr.s_addr == INADDR_ANY) || 658 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 659 IN_MULTICAST(sin->sin_addr.s_addr) || 660 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 661 error = EINVAL; 662 goto out; 663 } 664 } 665 error = in_pcbconnect(inp, nam); 666 if (error) 667 goto out; 668 669 tp->t_template = tcp_template(tp); 670 if (tp->t_template == 0) { 671 in_pcbunset_faddr(inp); 672 in_pcbdisconnect(inp); 673 error = ENOBUFS; 674 goto out; 675 } 676 677 so->so_state |= SS_CONNECTOUT; 678 679 /* Compute window scaling to request. */ 680 tcp_rscale(tp, sb_max); 681 682 soisconnecting(so); 683 tcpstat_inc(tcps_connattempt); 684 tp->t_state = TCPS_SYN_SENT; 685 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 686 tcp_set_iss_tsm(tp); 687 tcp_sendseqinit(tp); 688 tp->snd_last = tp->snd_una; 689 error = tcp_output(tp); 690 691 out: 692 if (otp) 693 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0); 694 return (error); 695 } 696 697 /* 698 * Accept a connection. Essentially all the work is done at higher 699 * levels; just return the address of the peer, storing through addr. 
700 */ 701 int 702 tcp_accept(struct socket *so, struct mbuf *nam) 703 { 704 struct inpcb *inp; 705 struct tcpcb *tp; 706 int error; 707 708 soassertlocked(so); 709 710 if ((error = tcp_sogetpcb(so, &inp, &tp))) 711 return (error); 712 713 in_setpeeraddr(inp, nam); 714 715 if (so->so_options & SO_DEBUG) 716 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_ACCEPT, 0); 717 return (0); 718 } 719 720 /* 721 * Initiate disconnect from peer. 722 * If connection never passed embryonic stage, just drop; 723 * else if don't need to let data drain, then can just drop anyways, 724 * else have to begin TCP shutdown process: mark socket disconnecting, 725 * drain unread data, state switch to reflect user close, and 726 * send segment (e.g. FIN) to peer. Socket will be really disconnected 727 * when peer sends FIN and acks ours. 728 * 729 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 730 */ 731 int 732 tcp_disconnect(struct socket *so) 733 { 734 struct inpcb *inp; 735 struct tcpcb *tp, *otp = NULL; 736 int error; 737 short ostate; 738 739 soassertlocked(so); 740 741 if ((error = tcp_sogetpcb(so, &inp, &tp))) 742 return (error); 743 744 if (so->so_options & SO_DEBUG) { 745 otp = tp; 746 ostate = tp->t_state; 747 } 748 749 tp = tcp_dodisconnect(tp); 750 751 if (otp) 752 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0); 753 return (0); 754 } 755 756 /* 757 * Mark the connection as being incapable of further output. 758 */ 759 int 760 tcp_shutdown(struct socket *so) 761 { 762 struct inpcb *inp; 763 struct tcpcb *tp, *otp = NULL; 764 int error; 765 short ostate; 766 767 soassertlocked(so); 768 769 if ((error = tcp_sogetpcb(so, &inp, &tp))) 770 return (error); 771 772 if (so->so_options & SO_DEBUG) { 773 otp = tp; 774 ostate = tp->t_state; 775 } 776 777 if (so->so_snd.sb_state & SS_CANTSENDMORE) 778 goto out; 779 780 socantsendmore(so); 781 tp = tcp_usrclosed(tp); 782 if (tp) 783 error = tcp_output(tp); 784 785 out: 786 if (otp) 787 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0); 788 return (error); 789 } 790 791 /* 792 * After a receive, possibly send window update to peer. 793 */ 794 void 795 tcp_rcvd(struct socket *so) 796 { 797 struct inpcb *inp; 798 struct tcpcb *tp; 799 short ostate; 800 801 soassertlocked(so); 802 803 if (tcp_sogetpcb(so, &inp, &tp)) 804 return; 805 806 if (so->so_options & SO_DEBUG) 807 ostate = tp->t_state; 808 809 /* 810 * soreceive() calls this function when a user receives 811 * ancillary data on a listening socket. We don't call 812 * tcp_output in such a case, since there is no header 813 * template for a listening socket and hence the kernel 814 * will panic. 815 */ 816 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 817 (void) tcp_output(tp); 818 819 if (so->so_options & SO_DEBUG) 820 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0); 821 } 822 823 /* 824 * Do a send by putting data in output queue and updating urgent 825 * marker if URG set. Possibly send more data. 
826 */ 827 int 828 tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam, 829 struct mbuf *control) 830 { 831 struct inpcb *inp; 832 struct tcpcb *tp; 833 int error; 834 short ostate; 835 836 soassertlocked(so); 837 838 if (control && control->m_len) { 839 error = EINVAL; 840 goto out; 841 } 842 843 if ((error = tcp_sogetpcb(so, &inp, &tp))) 844 goto out; 845 846 if (so->so_options & SO_DEBUG) 847 ostate = tp->t_state; 848 849 mtx_enter(&so->so_snd.sb_mtx); 850 sbappendstream(so, &so->so_snd, m); 851 mtx_leave(&so->so_snd.sb_mtx); 852 m = NULL; 853 854 error = tcp_output(tp); 855 856 if (so->so_options & SO_DEBUG) 857 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0); 858 859 out: 860 m_freem(control); 861 m_freem(m); 862 863 return (error); 864 } 865 866 /* 867 * Abort the TCP. 868 */ 869 void 870 tcp_abort(struct socket *so) 871 { 872 struct inpcb *inp; 873 struct tcpcb *tp, *otp = NULL; 874 short ostate; 875 876 soassertlocked(so); 877 878 if (tcp_sogetpcb(so, &inp, &tp)) 879 return; 880 881 if (so->so_options & SO_DEBUG) { 882 otp = tp; 883 ostate = tp->t_state; 884 } 885 886 tp = tcp_drop(tp, ECONNABORTED); 887 888 if (otp) 889 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0); 890 } 891 892 int 893 tcp_sense(struct socket *so, struct stat *ub) 894 { 895 struct inpcb *inp; 896 struct tcpcb *tp; 897 int error; 898 899 soassertlocked(so); 900 901 if ((error = tcp_sogetpcb(so, &inp, &tp))) 902 return (error); 903 904 mtx_enter(&so->so_snd.sb_mtx); 905 ub->st_blksize = so->so_snd.sb_hiwat; 906 mtx_leave(&so->so_snd.sb_mtx); 907 908 if (so->so_options & SO_DEBUG) 909 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0); 910 return (0); 911 } 912 913 int 914 tcp_rcvoob(struct socket *so, struct mbuf *m, int flags) 915 { 916 struct inpcb *inp; 917 struct tcpcb *tp; 918 int error; 919 920 soassertlocked(so); 921 922 if ((error = tcp_sogetpcb(so, &inp, &tp))) 923 return (error); 924 925 if ((so->so_oobmark == 0 && 926 (so->so_rcv.sb_state & SS_RCVATMARK) == 0) || 927 so->so_options & SO_OOBINLINE || 928 tp->t_oobflags & TCPOOB_HADDATA) { 929 error = EINVAL; 930 goto out; 931 } 932 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 933 error = EWOULDBLOCK; 934 goto out; 935 } 936 m->m_len = 1; 937 *mtod(m, caddr_t) = tp->t_iobc; 938 if ((flags & MSG_PEEK) == 0) 939 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 940 out: 941 if (so->so_options & SO_DEBUG) 942 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0); 943 return (error); 944 } 945 946 int 947 tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam, 948 struct mbuf *control) 949 { 950 struct inpcb *inp; 951 struct tcpcb *tp; 952 int error; 953 short ostate; 954 955 soassertlocked(so); 956 957 if (control && control->m_len) { 958 error = EINVAL; 959 goto release; 960 } 961 962 if ((error = tcp_sogetpcb(so, &inp, &tp))) 963 goto release; 964 965 if (so->so_options & SO_DEBUG) 966 ostate = tp->t_state; 967 968 if (sbspace(so, &so->so_snd) < -512) { 969 error = ENOBUFS; 970 goto out; 971 } 972 973 /* 974 * According to RFC961 (Assigned Protocols), 975 * the urgent pointer points to the last octet 976 * of urgent data. We continue, however, 977 * to consider it to indicate the first octet 978 * of data past the urgent section. 979 * Otherwise, snd_up should be one lower. 
980 */ 981 mtx_enter(&so->so_snd.sb_mtx); 982 sbappendstream(so, &so->so_snd, m); 983 mtx_leave(&so->so_snd.sb_mtx); 984 m = NULL; 985 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 986 tp->t_force = 1; 987 error = tcp_output(tp); 988 tp->t_force = 0; 989 990 out: 991 if (so->so_options & SO_DEBUG) 992 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0); 993 994 release: 995 m_freem(control); 996 m_freem(m); 997 998 return (error); 999 } 1000 1001 int 1002 tcp_sockaddr(struct socket *so, struct mbuf *nam) 1003 { 1004 struct inpcb *inp; 1005 struct tcpcb *tp; 1006 int error; 1007 1008 soassertlocked(so); 1009 1010 if ((error = tcp_sogetpcb(so, &inp, &tp))) 1011 return (error); 1012 1013 in_setsockaddr(inp, nam); 1014 1015 if (so->so_options & SO_DEBUG) 1016 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, 1017 PRU_SOCKADDR, 0); 1018 return (0); 1019 } 1020 1021 int 1022 tcp_peeraddr(struct socket *so, struct mbuf *nam) 1023 { 1024 struct inpcb *inp; 1025 struct tcpcb *tp; 1026 int error; 1027 1028 soassertlocked(so); 1029 1030 if ((error = tcp_sogetpcb(so, &inp, &tp))) 1031 return (error); 1032 1033 in_setpeeraddr(inp, nam); 1034 1035 if (so->so_options & SO_DEBUG) 1036 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_PEERADDR, 0); 1037 return (0); 1038 } 1039 1040 /* 1041 * Initiate (or continue) disconnect. 1042 * If embryonic state, just send reset (once). 1043 * If in ``let data drain'' option and linger null, just drop. 1044 * Otherwise (hard), mark socket disconnecting and drop 1045 * current input data; switch states based on user close, and 1046 * send segment to peer (with FIN). 1047 */ 1048 struct tcpcb * 1049 tcp_dodisconnect(struct tcpcb *tp) 1050 { 1051 struct socket *so = tp->t_inpcb->inp_socket; 1052 1053 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 1054 tp = tcp_close(tp); 1055 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 1056 tp = tcp_drop(tp, 0); 1057 else { 1058 soisdisconnecting(so); 1059 mtx_enter(&so->so_rcv.sb_mtx); 1060 sbflush(so, &so->so_rcv); 1061 mtx_leave(&so->so_rcv.sb_mtx); 1062 tp = tcp_usrclosed(tp); 1063 if (tp) 1064 (void) tcp_output(tp); 1065 } 1066 return (tp); 1067 } 1068 1069 /* 1070 * User issued close, and wish to trail through shutdown states: 1071 * if never received SYN, just forget it. If got a SYN from peer, 1072 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1073 * If already got a FIN from peer, then almost done; go to LAST_ACK 1074 * state. In all other cases, have already sent FIN to peer (e.g. 1075 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1076 * for peer to send FIN or not respond to keep-alives, etc. 1077 * We can let the user exit from the close as soon as the FIN is acked. 1078 */ 1079 struct tcpcb * 1080 tcp_usrclosed(struct tcpcb *tp) 1081 { 1082 1083 switch (tp->t_state) { 1084 1085 case TCPS_CLOSED: 1086 case TCPS_LISTEN: 1087 case TCPS_SYN_SENT: 1088 tp->t_state = TCPS_CLOSED; 1089 tp = tcp_close(tp); 1090 break; 1091 1092 case TCPS_SYN_RECEIVED: 1093 case TCPS_ESTABLISHED: 1094 tp->t_state = TCPS_FIN_WAIT_1; 1095 break; 1096 1097 case TCPS_CLOSE_WAIT: 1098 tp->t_state = TCPS_LAST_ACK; 1099 break; 1100 } 1101 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1102 soisdisconnected(tp->t_inpcb->inp_socket); 1103 /* 1104 * If we are in FIN_WAIT_2, we arrived here because the 1105 * application did a shutdown of the send side. 
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

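/*
 * Illustrative sketch (an assumption about userland usage, not kernel
 * code): tcp_ident() below backs the TCPCTL_IDENT and TCPCTL_DROP sysctls
 * used for identd-style owner lookups and tcpdrop(8)-style connection
 * drops.  A caller fills in a struct tcp_ident_mapping with the foreign
 * and local addresses of an established connection, roughly like this:
 *
 *	struct tcp_ident_mapping tir;
 *	int mib[] = { CTL_NET, PF_INET, IPPROTO_TCP, TCPCTL_IDENT };
 *	size_t len = sizeof(tir);
 *
 *	memset(&tir, 0, sizeof(tir));
 *	memcpy(&tir.faddr, &foreign_sin, sizeof(foreign_sin));
 *	memcpy(&tir.laddr, &local_sin, sizeof(local_sin));
 *	if (sysctl(mib, 4, &tir, &len, NULL, 0) == 0)
 *		printf("ruid %d euid %d\n", (int)tir.ruid, (int)tir.euid);
 *
 * Using TCPCTL_DROP instead passes the mapping as the "new" value and
 * aborts the matching connection.
 */
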
/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct socket *so = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}

	NET_LOCK_SHARED();

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		if (tir.laddr.ss_family != AF_INET6) {
			NET_UNLOCK_SHARED();
			return (EAFNOSUPPORT);
		}
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL, NULL);
		if (error) {
			NET_UNLOCK_SHARED();
			return EINVAL;	/*?*/
		}
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL, NULL);
		if (error) {
			NET_UNLOCK_SHARED();
			return EINVAL;	/*?*/
		}
		break;
#endif
	case AF_INET:
		if (tir.laddr.ss_family != AF_INET) {
			NET_UNLOCK_SHARED();
			return (EAFNOSUPPORT);
		}
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		NET_UNLOCK_SHARED();
		return (EAFNOSUPPORT);
	}

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcb6table, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcblookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		struct tcpcb *tp = NULL;

		if (inp != NULL) {
			so = in_pcbsolock_ref(inp);
			if (so != NULL)
				tp = intotcpcb(inp);
		}
		if (tp != NULL && !ISSET(so->so_options, SO_ACCEPTCONN))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;

		in_pcbsounlock_rele(inp, so);
		NET_UNLOCK_SHARED();
		in_pcbunref(inp);
		return (error);
	}

	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL)
		so = in_pcbsolock_ref(inp);

	if (so != NULL && ISSET(so->so_state, SS_CONNECTOUT)) {
		tir.ruid = so->so_ruid;
		tir.euid = so->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	in_pcbsounlock_rele(inp, so);
	NET_UNLOCK_SHARED();
	in_pcbunref(inp);

	*oldlenp = sizeof(tir);
	return copyout(&tir, oldp, sizeof(tir));
}

int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters), NULL);
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);
	ASSIGN(tcps_outswtso);
	ASSIGN(tcps_outhwtso);
	ASSIGN(tcps_outpkttso);
	ASSIGN(tcps_outbadtso);
	ASSIGN(tcps_inswlro);
	ASSIGN(tcps_inhwlro);
	ASSIGN(tcps_inpktlro);
	ASSIGN(tcps_inbadlro);

#undef ASSIGN

	mtx_enter(&syn_cache_mtx);
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = atomic_load_int(&tcp_syn_cache_limit);
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = atomic_load_int(&tcp_syn_bucket_limit);
	tcpstat.tcps_sc_uses_left = set->scs_use;
	mtx_leave(&syn_cache_mtx);

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, oval, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_KEEPINITTIME:
		NET_LOCK();
		nval = tcptv_keep_init / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
		if (!error)
			tcptv_keep_init = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPIDLE:
		NET_LOCK();
		nval = tcp_keepidle / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
		if (!error)
			tcp_keepidle = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPINTVL:
		NET_LOCK();
		nval = tcp_keepintvl / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
		if (!error)
			tcp_keepintvl = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		return tcp_ident(oldp, oldlenp, newp, newlen, 0);

	case TCPCTL_DROP:
		return tcp_ident(oldp, oldlenp, newp, newlen, 1);

	case TCPCTL_REASS_LIMIT:
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		oval = nval = atomic_load_int(&tcp_syn_use_limit);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 0, INT_MAX);
		if (!error && oval != nval) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_use > nval)
				tcp_syn_cache[0].scs_use = nval;
			if (tcp_syn_cache[1].scs_use > nval)
				tcp_syn_cache[1].scs_use = nval;
			tcp_syn_use_limit = nval;
			mtx_leave(&syn_cache_mtx);
		}
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		oval = nval = atomic_load_int(&tcp_syn_hash_size);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 1, 100000);
		if (!error && oval != nval) {
			/*
			 * If global hash size has been changed,
			 * switch sets as soon as possible.  Then
			 * the actual hash array will be reallocated.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
			mtx_leave(&syn_cache_mtx);
		}
		return (error);

	default:
		return sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window: if the
 * receiver stops acking data, the window will shrink and therefore the
 * buffer size will shrink as well.
 * In low memory situations, try to shrink the buffer to the initial size,
 * disabling send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax;

	mtx_enter(&so->so_snd.sb_mtx);

	nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace) {
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	} else {
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);
	}

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		/* keep in sync with sbreserve() calculation */
		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(so, &so->so_snd, nmax);

	mtx_leave(&so->so_snd.sb_mtx);
}

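/*
 * Worked example of the automatic scaling above (assumed numbers, for
 * illustration only): with sb_wat = tcp_sendspace = 16384, snd_max -
 * snd_una = 40000 bytes in flight and sb_max large enough, nmax becomes
 * 16384 + 40000 = 56384.  With t_maxseg = 1460 this is rounded up to
 * roundup(56384, 1460) = 56940, so the high-water mark grows with the
 * amount of unacknowledged data instead of counting it against the
 * application's share of the buffer.
 */
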
/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If more than a large fraction (7/8) of the recv
 * buffer was transferred during that time, increase the buffer by a
 * constant.
 * In low memory situations, try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax;

	mtx_enter(&so->so_rcv.sb_mtx);

	nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace) {
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	} else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	mtx_enter(&so->so_snd.sb_mtx);
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;
	mtx_leave(&so->so_snd.sb_mtx);

	if (nmax != so->so_rcv.sb_hiwat) {
		/* round to MSS boundary */
		nmax = roundup(nmax, tp->t_maxseg);
		sbreserve(so, &so->so_rcv, nmax);
	}

	mtx_leave(&so->so_rcv.sb_mtx);
}
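
/*
 * Worked example of the receive-side growth above (assumed numbers, for
 * illustration only): with sb_hiwat = tcp_recvspace = 16384, the 7/8
 * threshold is 16384 / 8 * 7 = 14336 bytes.  If more than that arrived
 * within the last measured RTT, the buffer grows by tcp_autorcvbuf_inc
 * (16384) to 32768, capped at sb_max and rounded to the MSS.
 */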