1 /* $OpenBSD: tcp_usrreq.c,v 1.214 2022/12/12 08:30:22 tb Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 #include <sys/proc.h> 83 84 #include <net/if.h> 85 #include <net/if_var.h> 86 #include <net/route.h> 87 88 #include <netinet/in.h> 89 #include <netinet/in_var.h> 90 #include <netinet/ip.h> 91 #include <netinet/in_pcb.h> 92 #include <netinet/ip_var.h> 93 #include <netinet/tcp.h> 94 #include <netinet/tcp_fsm.h> 95 #include <netinet/tcp_seq.h> 96 #include <netinet/tcp_timer.h> 97 #include <netinet/tcp_var.h> 98 #include <netinet/tcp_debug.h> 99 100 #ifdef INET6 101 #include <netinet6/in6_var.h> 102 #endif 103 104 #ifndef TCP_SENDSPACE 105 #define TCP_SENDSPACE 1024*16 106 #endif 107 u_int tcp_sendspace = TCP_SENDSPACE; 108 #ifndef TCP_RECVSPACE 109 #define TCP_RECVSPACE 1024*16 110 #endif 111 u_int tcp_recvspace = TCP_RECVSPACE; 112 u_int tcp_autorcvbuf_inc = 16 * 1024; 113 114 const struct pr_usrreqs tcp_usrreqs = { 115 .pru_attach = tcp_attach, 116 .pru_detach = tcp_detach, 117 .pru_bind = tcp_bind, 118 .pru_listen = tcp_listen, 119 .pru_connect = tcp_connect, 120 .pru_accept = tcp_accept, 121 .pru_disconnect = tcp_disconnect, 122 .pru_shutdown = tcp_shutdown, 123 .pru_rcvd = tcp_rcvd, 124 .pru_send = tcp_send, 125 .pru_abort = tcp_abort, 126 .pru_sense = tcp_sense, 127 .pru_rcvoob = tcp_rcvoob, 128 .pru_sendoob = tcp_sendoob, 129 .pru_control = in_control, 130 .pru_sockaddr = tcp_sockaddr, 131 .pru_peeraddr = tcp_peeraddr, 132 }; 133 134 #ifdef INET6 135 const struct pr_usrreqs tcp6_usrreqs = { 136 .pru_attach = tcp_attach, 137 .pru_detach = tcp_detach, 138 .pru_bind = tcp_bind, 139 .pru_listen = tcp_listen, 140 .pru_connect = tcp_connect, 141 .pru_accept = tcp_accept, 142 .pru_disconnect = tcp_disconnect, 143 .pru_shutdown = tcp_shutdown, 144 .pru_rcvd = tcp_rcvd, 145 .pru_send = tcp_send, 146 .pru_abort = tcp_abort, 147 .pru_sense = tcp_sense, 148 .pru_rcvoob = tcp_rcvoob, 149 .pru_sendoob = tcp_sendoob, 150 .pru_control = in6_control, 151 .pru_sockaddr = tcp_sockaddr, 152 .pru_peeraddr = tcp_peeraddr, 153 }; 154 #endif 155 156 const struct sysctl_bounded_args tcpctl_vars[] = { 157 { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 }, 158 { TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT }, 159 { TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE }, 160 { TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL }, 161 { TCPCTL_SACK, &tcp_do_sack, 0, 1 }, 162 { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 }, 163 { TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 }, 164 { TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 }, 165 #ifdef TCP_ECN 166 { TCPCTL_ECN, &tcp_do_ecn, 0, 1 }, 167 #endif 168 { TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 }, 169 { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX }, 170 { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 }, 171 { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 }, 172 }; 173 174 struct inpcbtable tcbtable; 175 176 int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *); 177 int tcp_ident(void *, size_t *, void *, size_t, int); 178 179 static inline int tcp_sogetpcb(struct socket *, struct inpcb **, 180 struct tcpcb **); 181 182 static inline int 183 tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp) 184 { 185 struct inpcb *inp; 186 struct tcpcb *tp; 187 188 /* 189 * When a TCP is attached to a socket, then there will be 190 * a (struct inpcb) pointed at by the socket, and this 191 * structure will point at a subsidiary (struct tcpcb). 192 */ 193 if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) { 194 if (so->so_error) 195 return so->so_error; 196 return EINVAL; 197 } 198 199 *rinp = inp; 200 *rtp = tp; 201 202 return 0; 203 } 204 205 /* 206 * Export internal TCP state information via a struct tcp_info without 207 * leaking any sensitive information. Sequence numbers are reported 208 * relative to the initial sequence number. 209 */ 210 int 211 tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m) 212 { 213 struct proc *p = curproc; 214 struct tcp_info *ti; 215 u_int t = 1000; /* msec => usec */ 216 uint32_t now; 217 218 if (sizeof(*ti) > MLEN) { 219 MCLGETL(m, M_WAITOK, sizeof(*ti)); 220 if (!ISSET(m->m_flags, M_EXT)) 221 return ENOMEM; 222 } 223 ti = mtod(m, struct tcp_info *); 224 m->m_len = sizeof(*ti); 225 memset(ti, 0, sizeof(*ti)); 226 now = tcp_now(); 227 228 ti->tcpi_state = tp->t_state; 229 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 230 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 231 if (tp->t_flags & TF_SACK_PERMIT) 232 ti->tcpi_options |= TCPI_OPT_SACK; 233 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 234 ti->tcpi_options |= TCPI_OPT_WSCALE; 235 ti->tcpi_snd_wscale = tp->snd_scale; 236 ti->tcpi_rcv_wscale = tp->rcv_scale; 237 } 238 #ifdef TCP_ECN 239 if (tp->t_flags & TF_ECN_PERMIT) 240 ti->tcpi_options |= TCPI_OPT_ECN; 241 #endif 242 243 ti->tcpi_rto = tp->t_rxtcur * t; 244 ti->tcpi_snd_mss = tp->t_maxseg; 245 ti->tcpi_rcv_mss = tp->t_peermss; 246 247 ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t; 248 ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t; 249 ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t; 250 ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t; 251 252 ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >> 253 (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 254 ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >> 255 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT); 256 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 257 ti->tcpi_snd_cwnd = tp->snd_cwnd; 258 259 ti->tcpi_rcv_space = tp->rcv_wnd; 260 261 /* 262 * Provide only minimal information for unprivileged processes. 263 */ 264 if (suser(p) != 0) 265 return 0; 266 267 /* FreeBSD-specific extension fields for tcp_info. */ 268 ti->tcpi_snd_wnd = tp->snd_wnd; 269 ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss; 270 ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs; 271 /* missing tcpi_toe_tid */ 272 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 273 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 274 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 275 276 /* OpenBSD extensions */ 277 ti->tcpi_rttmin = tp->t_rttmin * t; 278 ti->tcpi_max_sndwnd = tp->max_sndwnd; 279 ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs; 280 ti->tcpi_rcv_up = tp->rcv_up - tp->irs; 281 ti->tcpi_snd_una = tp->snd_una - tp->iss; 282 ti->tcpi_snd_up = tp->snd_up - tp->iss; 283 ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss; 284 ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss; 285 ti->tcpi_snd_max = tp->snd_max - tp->iss; 286 287 ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */ 288 ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t; 289 ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt; 290 ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t; 291 292 ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc; 293 ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat; 294 ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat; 295 ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat; 296 ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc; 297 ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat; 298 ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat; 299 ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat; 300 301 return 0; 302 } 303 304 int 305 tcp_ctloutput(int op, struct socket *so, int level, int optname, 306 struct mbuf *m) 307 { 308 int error = 0; 309 struct inpcb *inp; 310 struct tcpcb *tp; 311 int i; 312 313 inp = sotoinpcb(so); 314 if (inp == NULL) 315 return (ECONNRESET); 316 if (level != IPPROTO_TCP) { 317 switch (so->so_proto->pr_domain->dom_family) { 318 #ifdef INET6 319 case PF_INET6: 320 error = ip6_ctloutput(op, so, level, optname, m); 321 break; 322 #endif /* INET6 */ 323 case PF_INET: 324 error = ip_ctloutput(op, so, level, optname, m); 325 break; 326 default: 327 error = EAFNOSUPPORT; /*?*/ 328 break; 329 } 330 return (error); 331 } 332 tp = intotcpcb(inp); 333 334 switch (op) { 335 336 case PRCO_SETOPT: 337 switch (optname) { 338 339 case TCP_NODELAY: 340 if (m == NULL || m->m_len < sizeof (int)) 341 error = EINVAL; 342 else if (*mtod(m, int *)) 343 tp->t_flags |= TF_NODELAY; 344 else 345 tp->t_flags &= ~TF_NODELAY; 346 break; 347 348 case TCP_NOPUSH: 349 if (m == NULL || m->m_len < sizeof (int)) 350 error = EINVAL; 351 else if (*mtod(m, int *)) 352 tp->t_flags |= TF_NOPUSH; 353 else if (tp->t_flags & TF_NOPUSH) { 354 tp->t_flags &= ~TF_NOPUSH; 355 if (TCPS_HAVEESTABLISHED(tp->t_state)) 356 error = tcp_output(tp); 357 } 358 break; 359 360 case TCP_MAXSEG: 361 if (m == NULL || m->m_len < sizeof (int)) { 362 error = EINVAL; 363 break; 364 } 365 366 i = *mtod(m, int *); 367 if (i > 0 && i <= tp->t_maxseg) 368 tp->t_maxseg = i; 369 else 370 error = EINVAL; 371 break; 372 373 case TCP_SACK_ENABLE: 374 if (m == NULL || m->m_len < sizeof (int)) { 375 error = EINVAL; 376 break; 377 } 378 379 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 380 error = EPERM; 381 break; 382 } 383 384 if (tp->t_flags & TF_SIGNATURE) { 385 error = EPERM; 386 break; 387 } 388 389 if (*mtod(m, int *)) 390 tp->sack_enable = 1; 391 else 392 tp->sack_enable = 0; 393 break; 394 #ifdef TCP_SIGNATURE 395 case TCP_MD5SIG: 396 if (m == NULL || m->m_len < sizeof (int)) { 397 error = EINVAL; 398 break; 399 } 400 401 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 402 error = EPERM; 403 break; 404 } 405 406 if (*mtod(m, int *)) { 407 tp->t_flags |= TF_SIGNATURE; 408 tp->sack_enable = 0; 409 } else 410 tp->t_flags &= ~TF_SIGNATURE; 411 break; 412 #endif /* TCP_SIGNATURE */ 413 default: 414 error = ENOPROTOOPT; 415 break; 416 } 417 break; 418 419 case PRCO_GETOPT: 420 switch (optname) { 421 case TCP_NODELAY: 422 m->m_len = sizeof(int); 423 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 424 break; 425 case TCP_NOPUSH: 426 m->m_len = sizeof(int); 427 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 428 break; 429 case TCP_MAXSEG: 430 m->m_len = sizeof(int); 431 *mtod(m, int *) = tp->t_maxseg; 432 break; 433 case TCP_SACK_ENABLE: 434 m->m_len = sizeof(int); 435 *mtod(m, int *) = tp->sack_enable; 436 break; 437 case TCP_INFO: 438 error = tcp_fill_info(tp, so, m); 439 break; 440 #ifdef TCP_SIGNATURE 441 case TCP_MD5SIG: 442 m->m_len = sizeof(int); 443 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 444 break; 445 #endif 446 default: 447 error = ENOPROTOOPT; 448 break; 449 } 450 break; 451 } 452 return (error); 453 } 454 455 /* 456 * Attach TCP protocol to socket, allocating 457 * internet protocol control block, tcp control block, 458 * buffer space, and entering LISTEN state to accept connections. 459 */ 460 int 461 tcp_attach(struct socket *so, int proto, int wait) 462 { 463 struct tcpcb *tp; 464 struct inpcb *inp; 465 int error; 466 467 if (so->so_pcb) 468 return EISCONN; 469 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 470 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 471 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 472 error = soreserve(so, tcp_sendspace, tcp_recvspace); 473 if (error) 474 return (error); 475 } 476 477 NET_ASSERT_LOCKED(); 478 error = in_pcballoc(so, &tcbtable, wait); 479 if (error) 480 return (error); 481 inp = sotoinpcb(so); 482 tp = tcp_newtcpcb(inp, wait); 483 if (tp == NULL) { 484 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */ 485 486 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 487 in_pcbdetach(inp); 488 so->so_state |= nofd; 489 return (ENOBUFS); 490 } 491 tp->t_state = TCPS_CLOSED; 492 #ifdef INET6 493 /* we disallow IPv4 mapped address completely. */ 494 if (inp->inp_flags & INP_IPV6) 495 tp->pf = PF_INET6; 496 else 497 tp->pf = PF_INET; 498 #else 499 tp->pf = PF_INET; 500 #endif 501 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 502 so->so_linger = TCP_LINGERTIME; 503 504 if (so->so_options & SO_DEBUG) 505 tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0); 506 return (0); 507 } 508 509 int 510 tcp_detach(struct socket *so) 511 { 512 struct inpcb *inp; 513 struct tcpcb *otp = NULL, *tp; 514 int error = 0; 515 short ostate; 516 517 soassertlocked(so); 518 519 if ((error = tcp_sogetpcb(so, &inp, &tp))) 520 return (error); 521 522 if (so->so_options & SO_DEBUG) { 523 otp = tp; 524 ostate = tp->t_state; 525 } 526 527 /* 528 * Detach the TCP protocol from the socket. 529 * If the protocol state is non-embryonic, then can't 530 * do this directly: have to initiate a PRU_DISCONNECT, 531 * which may finish later; embryonic TCB's can just 532 * be discarded here. 533 */ 534 tp = tcp_dodisconnect(tp); 535 536 if (otp) 537 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0); 538 return (error); 539 } 540 541 /* 542 * Give the socket an address. 543 */ 544 int 545 tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p) 546 { 547 struct inpcb *inp; 548 struct tcpcb *tp; 549 int error; 550 short ostate; 551 552 soassertlocked(so); 553 554 if ((error = tcp_sogetpcb(so, &inp, &tp))) 555 return (error); 556 557 if (so->so_options & SO_DEBUG) 558 ostate = tp->t_state; 559 560 error = in_pcbbind(inp, nam, p); 561 562 if (so->so_options & SO_DEBUG) 563 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0); 564 return (error); 565 } 566 567 /* 568 * Prepare to accept connections. 569 */ 570 int 571 tcp_listen(struct socket *so) 572 { 573 struct inpcb *inp; 574 struct tcpcb *tp, *otp = NULL; 575 int error; 576 short ostate; 577 578 soassertlocked(so); 579 580 if ((error = tcp_sogetpcb(so, &inp, &tp))) 581 return (error); 582 583 if (so->so_options & SO_DEBUG) { 584 otp = tp; 585 ostate = tp->t_state; 586 } 587 588 if (inp->inp_lport == 0) 589 if ((error = in_pcbbind(inp, NULL, curproc))) 590 goto out; 591 592 /* 593 * If the in_pcbbind() above is called, the tp->pf 594 * should still be whatever it was before. 595 */ 596 tp->t_state = TCPS_LISTEN; 597 598 out: 599 if (otp) 600 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0); 601 return (error); 602 } 603 604 /* 605 * Initiate connection to peer. 606 * Create a template for use in transmissions on this connection. 607 * Enter SYN_SENT state, and mark socket as connecting. 608 * Start keep-alive timer, and seed output sequence space. 609 * Send initial segment on connection. 610 */ 611 int 612 tcp_connect(struct socket *so, struct mbuf *nam) 613 { 614 struct inpcb *inp; 615 struct tcpcb *tp, *otp = NULL; 616 int error; 617 short ostate; 618 619 soassertlocked(so); 620 621 if ((error = tcp_sogetpcb(so, &inp, &tp))) 622 return (error); 623 624 if (so->so_options & SO_DEBUG) { 625 otp = tp; 626 ostate = tp->t_state; 627 } 628 629 #ifdef INET6 630 if (inp->inp_flags & INP_IPV6) { 631 struct sockaddr_in6 *sin6; 632 633 if ((error = in6_nam2sin6(nam, &sin6))) 634 goto out; 635 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 636 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 637 error = EINVAL; 638 goto out; 639 } 640 error = in6_pcbconnect(inp, nam); 641 } else 642 #endif /* INET6 */ 643 { 644 struct sockaddr_in *sin; 645 646 if ((error = in_nam2sin(nam, &sin))) 647 goto out; 648 if ((sin->sin_addr.s_addr == INADDR_ANY) || 649 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 650 IN_MULTICAST(sin->sin_addr.s_addr) || 651 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 652 error = EINVAL; 653 goto out; 654 } 655 error = in_pcbconnect(inp, nam); 656 } 657 if (error) 658 goto out; 659 660 tp->t_template = tcp_template(tp); 661 if (tp->t_template == 0) { 662 in_pcbdisconnect(inp); 663 error = ENOBUFS; 664 goto out; 665 } 666 667 so->so_state |= SS_CONNECTOUT; 668 669 /* Compute window scaling to request. */ 670 tcp_rscale(tp, sb_max); 671 672 soisconnecting(so); 673 tcpstat_inc(tcps_connattempt); 674 tp->t_state = TCPS_SYN_SENT; 675 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcptv_keep_init)); 676 tcp_set_iss_tsm(tp); 677 tcp_sendseqinit(tp); 678 tp->snd_last = tp->snd_una; 679 error = tcp_output(tp); 680 681 out: 682 if (otp) 683 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0); 684 return (error); 685 } 686 687 /* 688 * Accept a connection. Essentially all the work is done at higher 689 * levels; just return the address of the peer, storing through addr. 690 */ 691 int 692 tcp_accept(struct socket *so, struct mbuf *nam) 693 { 694 struct inpcb *inp; 695 struct tcpcb *tp; 696 int error; 697 short ostate; 698 699 soassertlocked(so); 700 701 if ((error = tcp_sogetpcb(so, &inp, &tp))) 702 return (error); 703 704 if (so->so_options & SO_DEBUG) 705 ostate = tp->t_state; 706 707 #ifdef INET6 708 if (inp->inp_flags & INP_IPV6) 709 in6_setpeeraddr(inp, nam); 710 else 711 #endif 712 in_setpeeraddr(inp, nam); 713 714 if (so->so_options & SO_DEBUG) 715 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0); 716 return (error); 717 } 718 719 /* 720 * Initiate disconnect from peer. 721 * If connection never passed embryonic stage, just drop; 722 * else if don't need to let data drain, then can just drop anyways, 723 * else have to begin TCP shutdown process: mark socket disconnecting, 724 * drain unread data, state switch to reflect user close, and 725 * send segment (e.g. FIN) to peer. Socket will be really disconnected 726 * when peer sends FIN and acks ours. 727 * 728 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 729 */ 730 int 731 tcp_disconnect(struct socket *so) 732 { 733 struct inpcb *inp; 734 struct tcpcb *tp, *otp = NULL; 735 int error; 736 short ostate; 737 738 soassertlocked(so); 739 740 if ((error = tcp_sogetpcb(so, &inp, &tp))) 741 return (error); 742 743 if (so->so_options & SO_DEBUG) { 744 otp = tp; 745 ostate = tp->t_state; 746 } 747 748 tp = tcp_dodisconnect(tp); 749 750 if (otp) 751 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0); 752 return (0); 753 } 754 755 /* 756 * Mark the connection as being incapable of further output. 757 */ 758 int 759 tcp_shutdown(struct socket *so) 760 { 761 struct inpcb *inp; 762 struct tcpcb *tp, *otp = NULL; 763 int error; 764 short ostate; 765 766 soassertlocked(so); 767 768 if ((error = tcp_sogetpcb(so, &inp, &tp))) 769 return (error); 770 771 if (so->so_options & SO_DEBUG) { 772 otp = tp; 773 ostate = tp->t_state; 774 } 775 776 if (so->so_state & SS_CANTSENDMORE) 777 goto out; 778 779 socantsendmore(so); 780 tp = tcp_usrclosed(tp); 781 if (tp) 782 error = tcp_output(tp); 783 784 out: 785 if (otp) 786 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0); 787 return (error); 788 } 789 790 /* 791 * After a receive, possibly send window update to peer. 792 */ 793 void 794 tcp_rcvd(struct socket *so) 795 { 796 struct inpcb *inp; 797 struct tcpcb *tp; 798 short ostate; 799 800 soassertlocked(so); 801 802 if (tcp_sogetpcb(so, &inp, &tp)) 803 return; 804 805 if (so->so_options & SO_DEBUG) 806 ostate = tp->t_state; 807 808 /* 809 * soreceive() calls this function when a user receives 810 * ancillary data on a listening socket. We don't call 811 * tcp_output in such a case, since there is no header 812 * template for a listening socket and hence the kernel 813 * will panic. 814 */ 815 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 816 (void) tcp_output(tp); 817 818 if (so->so_options & SO_DEBUG) 819 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0); 820 } 821 822 /* 823 * Do a send by putting data in output queue and updating urgent 824 * marker if URG set. Possibly send more data. 825 */ 826 int 827 tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam, 828 struct mbuf *control) 829 { 830 struct inpcb *inp; 831 struct tcpcb *tp; 832 int error; 833 short ostate; 834 835 soassertlocked(so); 836 837 if (control && control->m_len) { 838 error = EINVAL; 839 goto out; 840 } 841 842 if ((error = tcp_sogetpcb(so, &inp, &tp))) 843 goto out; 844 845 if (so->so_options & SO_DEBUG) 846 ostate = tp->t_state; 847 848 sbappendstream(so, &so->so_snd, m); 849 m = NULL; 850 851 error = tcp_output(tp); 852 853 if (so->so_options & SO_DEBUG) 854 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0); 855 856 out: 857 m_freem(control); 858 m_freem(m); 859 860 return (error); 861 } 862 863 /* 864 * Abort the TCP. 865 */ 866 void 867 tcp_abort(struct socket *so) 868 { 869 struct inpcb *inp; 870 struct tcpcb *tp, *otp = NULL; 871 short ostate; 872 873 soassertlocked(so); 874 875 if (tcp_sogetpcb(so, &inp, &tp)) 876 return; 877 878 if (so->so_options & SO_DEBUG) { 879 otp = tp; 880 ostate = tp->t_state; 881 } 882 883 tp = tcp_drop(tp, ECONNABORTED); 884 885 if (otp) 886 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0); 887 } 888 889 int 890 tcp_sense(struct socket *so, struct stat *ub) 891 { 892 struct inpcb *inp; 893 struct tcpcb *tp; 894 int error; 895 896 soassertlocked(so); 897 898 if ((error = tcp_sogetpcb(so, &inp, &tp))) 899 return (error); 900 901 ub->st_blksize = so->so_snd.sb_hiwat; 902 903 if (so->so_options & SO_DEBUG) 904 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0); 905 return (0); 906 } 907 908 int 909 tcp_rcvoob(struct socket *so, struct mbuf *m, int flags) 910 { 911 struct inpcb *inp; 912 struct tcpcb *tp; 913 int error; 914 915 soassertlocked(so); 916 917 if ((error = tcp_sogetpcb(so, &inp, &tp))) 918 return (error); 919 920 if ((so->so_oobmark == 0 && 921 (so->so_state & SS_RCVATMARK) == 0) || 922 so->so_options & SO_OOBINLINE || 923 tp->t_oobflags & TCPOOB_HADDATA) { 924 error = EINVAL; 925 goto out; 926 } 927 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 928 error = EWOULDBLOCK; 929 goto out; 930 } 931 m->m_len = 1; 932 *mtod(m, caddr_t) = tp->t_iobc; 933 if ((flags & MSG_PEEK) == 0) 934 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 935 out: 936 if (so->so_options & SO_DEBUG) 937 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0); 938 return (error); 939 } 940 941 int 942 tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam, 943 struct mbuf *control) 944 { 945 struct inpcb *inp; 946 struct tcpcb *tp; 947 int error; 948 short ostate; 949 950 soassertlocked(so); 951 952 if (control && control->m_len) { 953 error = EINVAL; 954 goto release; 955 } 956 957 if ((error = tcp_sogetpcb(so, &inp, &tp))) 958 goto release; 959 960 if (so->so_options & SO_DEBUG) 961 ostate = tp->t_state; 962 963 if (sbspace(so, &so->so_snd) < -512) { 964 error = ENOBUFS; 965 goto out; 966 } 967 968 /* 969 * According to RFC961 (Assigned Protocols), 970 * the urgent pointer points to the last octet 971 * of urgent data. We continue, however, 972 * to consider it to indicate the first octet 973 * of data past the urgent section. 974 * Otherwise, snd_up should be one lower. 975 */ 976 sbappendstream(so, &so->so_snd, m); 977 m = NULL; 978 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 979 tp->t_force = 1; 980 error = tcp_output(tp); 981 tp->t_force = 0; 982 983 out: 984 if (so->so_options & SO_DEBUG) 985 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0); 986 987 release: 988 m_freem(control); 989 m_freem(m); 990 991 return (error); 992 } 993 994 int 995 tcp_sockaddr(struct socket *so, struct mbuf *nam) 996 { 997 struct inpcb *inp; 998 struct tcpcb *tp; 999 int error; 1000 1001 soassertlocked(so); 1002 1003 if ((error = tcp_sogetpcb(so, &inp, &tp))) 1004 return (error); 1005 1006 #ifdef INET6 1007 if (inp->inp_flags & INP_IPV6) 1008 in6_setsockaddr(inp, nam); 1009 else 1010 #endif 1011 in_setsockaddr(inp, nam); 1012 1013 if (so->so_options & SO_DEBUG) 1014 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, 1015 PRU_SOCKADDR, 0); 1016 return (0); 1017 } 1018 1019 int 1020 tcp_peeraddr(struct socket *so, struct mbuf *nam) 1021 { 1022 struct inpcb *inp; 1023 struct tcpcb *tp; 1024 int error; 1025 1026 soassertlocked(so); 1027 1028 if ((error = tcp_sogetpcb(so, &inp, &tp))) 1029 return (error); 1030 1031 #ifdef INET6 1032 if (inp->inp_flags & INP_IPV6) 1033 in6_setpeeraddr(inp, nam); 1034 else 1035 #endif 1036 in_setpeeraddr(inp, nam); 1037 1038 if (so->so_options & SO_DEBUG) 1039 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, 1040 PRU_PEERADDR, 0); 1041 return (0); 1042 } 1043 1044 /* 1045 * Initiate (or continue) disconnect. 1046 * If embryonic state, just send reset (once). 1047 * If in ``let data drain'' option and linger null, just drop. 1048 * Otherwise (hard), mark socket disconnecting and drop 1049 * current input data; switch states based on user close, and 1050 * send segment to peer (with FIN). 1051 */ 1052 struct tcpcb * 1053 tcp_dodisconnect(struct tcpcb *tp) 1054 { 1055 struct socket *so = tp->t_inpcb->inp_socket; 1056 1057 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 1058 tp = tcp_close(tp); 1059 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 1060 tp = tcp_drop(tp, 0); 1061 else { 1062 soisdisconnecting(so); 1063 sbflush(so, &so->so_rcv); 1064 tp = tcp_usrclosed(tp); 1065 if (tp) 1066 (void) tcp_output(tp); 1067 } 1068 return (tp); 1069 } 1070 1071 /* 1072 * User issued close, and wish to trail through shutdown states: 1073 * if never received SYN, just forget it. If got a SYN from peer, 1074 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1075 * If already got a FIN from peer, then almost done; go to LAST_ACK 1076 * state. In all other cases, have already sent FIN to peer (e.g. 1077 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1078 * for peer to send FIN or not respond to keep-alives, etc. 1079 * We can let the user exit from the close as soon as the FIN is acked. 1080 */ 1081 struct tcpcb * 1082 tcp_usrclosed(struct tcpcb *tp) 1083 { 1084 1085 switch (tp->t_state) { 1086 1087 case TCPS_CLOSED: 1088 case TCPS_LISTEN: 1089 case TCPS_SYN_SENT: 1090 tp->t_state = TCPS_CLOSED; 1091 tp = tcp_close(tp); 1092 break; 1093 1094 case TCPS_SYN_RECEIVED: 1095 case TCPS_ESTABLISHED: 1096 tp->t_state = TCPS_FIN_WAIT_1; 1097 break; 1098 1099 case TCPS_CLOSE_WAIT: 1100 tp->t_state = TCPS_LAST_ACK; 1101 break; 1102 } 1103 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1104 soisdisconnected(tp->t_inpcb->inp_socket); 1105 /* 1106 * If we are in FIN_WAIT_2, we arrived here because the 1107 * application did a shutdown of the send side. Like the 1108 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 1109 * a full close, we start a timer to make sure sockets are 1110 * not left in FIN_WAIT_2 forever. 1111 */ 1112 if (tp->t_state == TCPS_FIN_WAIT_2) 1113 TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(tcp_maxidle)); 1114 } 1115 return (tp); 1116 } 1117 1118 /* 1119 * Look up a socket for ident or tcpdrop, ... 1120 */ 1121 int 1122 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 1123 { 1124 int error = 0; 1125 struct tcp_ident_mapping tir; 1126 struct inpcb *inp; 1127 struct tcpcb *tp = NULL; 1128 struct sockaddr_in *fin, *lin; 1129 #ifdef INET6 1130 struct sockaddr_in6 *fin6, *lin6; 1131 struct in6_addr f6, l6; 1132 #endif 1133 1134 NET_ASSERT_LOCKED(); 1135 1136 if (dodrop) { 1137 if (oldp != NULL || *oldlenp != 0) 1138 return (EINVAL); 1139 if (newp == NULL) 1140 return (EPERM); 1141 if (newlen < sizeof(tir)) 1142 return (ENOMEM); 1143 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 1144 return (error); 1145 } else { 1146 if (oldp == NULL) 1147 return (EINVAL); 1148 if (*oldlenp < sizeof(tir)) 1149 return (ENOMEM); 1150 if (newp != NULL || newlen != 0) 1151 return (EINVAL); 1152 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 1153 return (error); 1154 } 1155 switch (tir.faddr.ss_family) { 1156 #ifdef INET6 1157 case AF_INET6: 1158 fin6 = (struct sockaddr_in6 *)&tir.faddr; 1159 error = in6_embedscope(&f6, fin6, NULL); 1160 if (error) 1161 return EINVAL; /*?*/ 1162 lin6 = (struct sockaddr_in6 *)&tir.laddr; 1163 error = in6_embedscope(&l6, lin6, NULL); 1164 if (error) 1165 return EINVAL; /*?*/ 1166 break; 1167 #endif 1168 case AF_INET: 1169 fin = (struct sockaddr_in *)&tir.faddr; 1170 lin = (struct sockaddr_in *)&tir.laddr; 1171 break; 1172 default: 1173 return (EINVAL); 1174 } 1175 1176 switch (tir.faddr.ss_family) { 1177 #ifdef INET6 1178 case AF_INET6: 1179 inp = in6_pcblookup(&tcbtable, &f6, 1180 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 1181 break; 1182 #endif 1183 case AF_INET: 1184 inp = in_pcblookup(&tcbtable, fin->sin_addr, 1185 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 1186 break; 1187 default: 1188 unhandled_af(tir.faddr.ss_family); 1189 } 1190 1191 if (dodrop) { 1192 if (inp && (tp = intotcpcb(inp)) && 1193 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 1194 tp = tcp_drop(tp, ECONNABORTED); 1195 else 1196 error = ESRCH; 1197 in_pcbunref(inp); 1198 return (error); 1199 } 1200 1201 if (inp == NULL) { 1202 tcpstat_inc(tcps_pcbhashmiss); 1203 switch (tir.faddr.ss_family) { 1204 #ifdef INET6 1205 case AF_INET6: 1206 inp = in6_pcblookup_listen(&tcbtable, 1207 &l6, lin6->sin6_port, NULL, tir.rdomain); 1208 break; 1209 #endif 1210 case AF_INET: 1211 inp = in_pcblookup_listen(&tcbtable, 1212 lin->sin_addr, lin->sin_port, NULL, tir.rdomain); 1213 break; 1214 } 1215 } 1216 1217 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 1218 tir.ruid = inp->inp_socket->so_ruid; 1219 tir.euid = inp->inp_socket->so_euid; 1220 } else { 1221 tir.ruid = -1; 1222 tir.euid = -1; 1223 } 1224 1225 *oldlenp = sizeof (tir); 1226 error = copyout((void *)&tir, oldp, sizeof (tir)); 1227 in_pcbunref(inp); 1228 return (error); 1229 } 1230 1231 int 1232 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 1233 { 1234 uint64_t counters[tcps_ncounters]; 1235 struct tcpstat tcpstat; 1236 struct syn_cache_set *set; 1237 int i = 0; 1238 1239 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 1240 1241 memset(&tcpstat, 0, sizeof tcpstat); 1242 counters_read(tcpcounters, counters, nitems(counters)); 1243 ASSIGN(tcps_connattempt); 1244 ASSIGN(tcps_accepts); 1245 ASSIGN(tcps_connects); 1246 ASSIGN(tcps_drops); 1247 ASSIGN(tcps_conndrops); 1248 ASSIGN(tcps_closed); 1249 ASSIGN(tcps_segstimed); 1250 ASSIGN(tcps_rttupdated); 1251 ASSIGN(tcps_delack); 1252 ASSIGN(tcps_timeoutdrop); 1253 ASSIGN(tcps_rexmttimeo); 1254 ASSIGN(tcps_persisttimeo); 1255 ASSIGN(tcps_persistdrop); 1256 ASSIGN(tcps_keeptimeo); 1257 ASSIGN(tcps_keepprobe); 1258 ASSIGN(tcps_keepdrops); 1259 ASSIGN(tcps_sndtotal); 1260 ASSIGN(tcps_sndpack); 1261 ASSIGN(tcps_sndbyte); 1262 ASSIGN(tcps_sndrexmitpack); 1263 ASSIGN(tcps_sndrexmitbyte); 1264 ASSIGN(tcps_sndrexmitfast); 1265 ASSIGN(tcps_sndacks); 1266 ASSIGN(tcps_sndprobe); 1267 ASSIGN(tcps_sndurg); 1268 ASSIGN(tcps_sndwinup); 1269 ASSIGN(tcps_sndctrl); 1270 ASSIGN(tcps_rcvtotal); 1271 ASSIGN(tcps_rcvpack); 1272 ASSIGN(tcps_rcvbyte); 1273 ASSIGN(tcps_rcvbadsum); 1274 ASSIGN(tcps_rcvbadoff); 1275 ASSIGN(tcps_rcvmemdrop); 1276 ASSIGN(tcps_rcvnosec); 1277 ASSIGN(tcps_rcvshort); 1278 ASSIGN(tcps_rcvduppack); 1279 ASSIGN(tcps_rcvdupbyte); 1280 ASSIGN(tcps_rcvpartduppack); 1281 ASSIGN(tcps_rcvpartdupbyte); 1282 ASSIGN(tcps_rcvoopack); 1283 ASSIGN(tcps_rcvoobyte); 1284 ASSIGN(tcps_rcvpackafterwin); 1285 ASSIGN(tcps_rcvbyteafterwin); 1286 ASSIGN(tcps_rcvafterclose); 1287 ASSIGN(tcps_rcvwinprobe); 1288 ASSIGN(tcps_rcvdupack); 1289 ASSIGN(tcps_rcvacktoomuch); 1290 ASSIGN(tcps_rcvacktooold); 1291 ASSIGN(tcps_rcvackpack); 1292 ASSIGN(tcps_rcvackbyte); 1293 ASSIGN(tcps_rcvwinupd); 1294 ASSIGN(tcps_pawsdrop); 1295 ASSIGN(tcps_predack); 1296 ASSIGN(tcps_preddat); 1297 ASSIGN(tcps_pcbhashmiss); 1298 ASSIGN(tcps_noport); 1299 ASSIGN(tcps_badsyn); 1300 ASSIGN(tcps_dropsyn); 1301 ASSIGN(tcps_rcvbadsig); 1302 ASSIGN(tcps_rcvgoodsig); 1303 ASSIGN(tcps_inswcsum); 1304 ASSIGN(tcps_outswcsum); 1305 ASSIGN(tcps_ecn_accepts); 1306 ASSIGN(tcps_ecn_rcvece); 1307 ASSIGN(tcps_ecn_rcvcwr); 1308 ASSIGN(tcps_ecn_rcvce); 1309 ASSIGN(tcps_ecn_sndect); 1310 ASSIGN(tcps_ecn_sndece); 1311 ASSIGN(tcps_ecn_sndcwr); 1312 ASSIGN(tcps_cwr_ecn); 1313 ASSIGN(tcps_cwr_frecovery); 1314 ASSIGN(tcps_cwr_timeout); 1315 ASSIGN(tcps_sc_added); 1316 ASSIGN(tcps_sc_completed); 1317 ASSIGN(tcps_sc_timed_out); 1318 ASSIGN(tcps_sc_overflowed); 1319 ASSIGN(tcps_sc_reset); 1320 ASSIGN(tcps_sc_unreach); 1321 ASSIGN(tcps_sc_bucketoverflow); 1322 ASSIGN(tcps_sc_aborted); 1323 ASSIGN(tcps_sc_dupesyn); 1324 ASSIGN(tcps_sc_dropped); 1325 ASSIGN(tcps_sc_collisions); 1326 ASSIGN(tcps_sc_retransmitted); 1327 ASSIGN(tcps_sc_seedrandom); 1328 ASSIGN(tcps_sc_hash_size); 1329 ASSIGN(tcps_sc_entry_count); 1330 ASSIGN(tcps_sc_entry_limit); 1331 ASSIGN(tcps_sc_bucket_maxlen); 1332 ASSIGN(tcps_sc_bucket_limit); 1333 ASSIGN(tcps_sc_uses_left); 1334 ASSIGN(tcps_conndrained); 1335 ASSIGN(tcps_sack_recovery_episode); 1336 ASSIGN(tcps_sack_rexmits); 1337 ASSIGN(tcps_sack_rexmit_bytes); 1338 ASSIGN(tcps_sack_rcv_opts); 1339 ASSIGN(tcps_sack_snd_opts); 1340 ASSIGN(tcps_sack_drop_opts); 1341 1342 #undef ASSIGN 1343 1344 set = &tcp_syn_cache[tcp_syn_cache_active]; 1345 tcpstat.tcps_sc_hash_size = set->scs_size; 1346 tcpstat.tcps_sc_entry_count = set->scs_count; 1347 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 1348 tcpstat.tcps_sc_bucket_maxlen = 0; 1349 for (i = 0; i < set->scs_size; i++) { 1350 if (tcpstat.tcps_sc_bucket_maxlen < 1351 set->scs_buckethead[i].sch_length) 1352 tcpstat.tcps_sc_bucket_maxlen = 1353 set->scs_buckethead[i].sch_length; 1354 } 1355 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 1356 tcpstat.tcps_sc_uses_left = set->scs_use; 1357 1358 return (sysctl_rdstruct(oldp, oldlenp, newp, 1359 &tcpstat, sizeof(tcpstat))); 1360 } 1361 1362 /* 1363 * Sysctl for tcp variables. 1364 */ 1365 int 1366 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 1367 size_t newlen) 1368 { 1369 int error, nval; 1370 1371 /* All sysctl names at this level are terminal. */ 1372 if (namelen != 1) 1373 return (ENOTDIR); 1374 1375 switch (name[0]) { 1376 case TCPCTL_BADDYNAMIC: 1377 NET_LOCK(); 1378 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1379 baddynamicports.tcp, sizeof(baddynamicports.tcp)); 1380 NET_UNLOCK(); 1381 return (error); 1382 1383 case TCPCTL_ROOTONLY: 1384 if (newp && securelevel > 0) 1385 return (EPERM); 1386 NET_LOCK(); 1387 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1388 rootonlyports.tcp, sizeof(rootonlyports.tcp)); 1389 NET_UNLOCK(); 1390 return (error); 1391 1392 case TCPCTL_IDENT: 1393 NET_LOCK(); 1394 error = tcp_ident(oldp, oldlenp, newp, newlen, 0); 1395 NET_UNLOCK(); 1396 return (error); 1397 1398 case TCPCTL_DROP: 1399 NET_LOCK(); 1400 error = tcp_ident(oldp, oldlenp, newp, newlen, 1); 1401 NET_UNLOCK(); 1402 return (error); 1403 1404 case TCPCTL_REASS_LIMIT: 1405 NET_LOCK(); 1406 nval = tcp_reass_limit; 1407 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1408 if (!error && nval != tcp_reass_limit) { 1409 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1410 if (!error) 1411 tcp_reass_limit = nval; 1412 } 1413 NET_UNLOCK(); 1414 return (error); 1415 1416 case TCPCTL_SACKHOLE_LIMIT: 1417 NET_LOCK(); 1418 nval = tcp_sackhole_limit; 1419 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1420 if (!error && nval != tcp_sackhole_limit) { 1421 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1422 if (!error) 1423 tcp_sackhole_limit = nval; 1424 } 1425 NET_UNLOCK(); 1426 return (error); 1427 1428 case TCPCTL_STATS: 1429 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1430 1431 case TCPCTL_SYN_USE_LIMIT: 1432 NET_LOCK(); 1433 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1434 &tcp_syn_use_limit, 0, INT_MAX); 1435 if (!error && newp != NULL) { 1436 /* 1437 * Global tcp_syn_use_limit is used when reseeding a 1438 * new cache. Also update the value in active cache. 1439 */ 1440 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1441 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1442 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1443 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1444 } 1445 NET_UNLOCK(); 1446 return (error); 1447 1448 case TCPCTL_SYN_HASH_SIZE: 1449 NET_LOCK(); 1450 nval = tcp_syn_hash_size; 1451 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1452 &nval, 1, 100000); 1453 if (!error && nval != tcp_syn_hash_size) { 1454 /* 1455 * If global hash size has been changed, 1456 * switch sets as soon as possible. Then 1457 * the actual hash array will be reallocated. 1458 */ 1459 if (tcp_syn_cache[0].scs_size != nval) 1460 tcp_syn_cache[0].scs_use = 0; 1461 if (tcp_syn_cache[1].scs_size != nval) 1462 tcp_syn_cache[1].scs_use = 0; 1463 tcp_syn_hash_size = nval; 1464 } 1465 NET_UNLOCK(); 1466 return (error); 1467 1468 default: 1469 NET_LOCK(); 1470 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name, 1471 namelen, oldp, oldlenp, newp, newlen); 1472 NET_UNLOCK(); 1473 return (error); 1474 } 1475 /* NOTREACHED */ 1476 } 1477 1478 /* 1479 * Scale the send buffer so that inflight data is not accounted against 1480 * the limit. The buffer will scale with the congestion window, if the 1481 * the receiver stops acking data the window will shrink and therefore 1482 * the buffer size will shrink as well. 1483 * In low memory situation try to shrink the buffer to the initial size 1484 * disabling the send buffer scaling as long as the situation persists. 1485 */ 1486 void 1487 tcp_update_sndspace(struct tcpcb *tp) 1488 { 1489 struct socket *so = tp->t_inpcb->inp_socket; 1490 u_long nmax = so->so_snd.sb_hiwat; 1491 1492 if (sbchecklowmem()) { 1493 /* low on memory try to get rid of some */ 1494 if (tcp_sendspace < nmax) 1495 nmax = tcp_sendspace; 1496 } else if (so->so_snd.sb_wat != tcp_sendspace) 1497 /* user requested buffer size, auto-scaling disabled */ 1498 nmax = so->so_snd.sb_wat; 1499 else 1500 /* automatic buffer scaling */ 1501 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1502 tp->snd_una); 1503 1504 /* a writable socket must be preserved because of poll(2) semantics */ 1505 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1506 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1507 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1508 /* keep in sync with sbreserve() calculation */ 1509 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1510 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8; 1511 } 1512 1513 /* round to MSS boundary */ 1514 nmax = roundup(nmax, tp->t_maxseg); 1515 1516 if (nmax != so->so_snd.sb_hiwat) 1517 sbreserve(so, &so->so_snd, nmax); 1518 } 1519 1520 /* 1521 * Scale the recv buffer by looking at how much data was transferred in 1522 * one approximated RTT. If more than a big part of the recv buffer was 1523 * transferred during that time we increase the buffer by a constant. 1524 * In low memory situation try to shrink the buffer to the initial size. 1525 */ 1526 void 1527 tcp_update_rcvspace(struct tcpcb *tp) 1528 { 1529 struct socket *so = tp->t_inpcb->inp_socket; 1530 u_long nmax = so->so_rcv.sb_hiwat; 1531 1532 if (sbchecklowmem()) { 1533 /* low on memory try to get rid of some */ 1534 if (tcp_recvspace < nmax) 1535 nmax = tcp_recvspace; 1536 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1537 /* user requested buffer size, auto-scaling disabled */ 1538 nmax = so->so_rcv.sb_wat; 1539 else { 1540 /* automatic buffer scaling */ 1541 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1542 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1543 tcp_autorcvbuf_inc); 1544 } 1545 1546 /* a readable socket must be preserved because of poll(2) semantics */ 1547 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1548 nmax < so->so_snd.sb_lowat) 1549 nmax = so->so_snd.sb_lowat; 1550 1551 if (nmax == so->so_rcv.sb_hiwat) 1552 return; 1553 1554 /* round to MSS boundary */ 1555 nmax = roundup(nmax, tp->t_maxseg); 1556 sbreserve(so, &so->so_rcv, nmax); 1557 } 1558