/*	$OpenBSD: tcp_usrreq.c,v 1.208 2022/09/13 09:05:47 mvs Exp $	*/
/*	$NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/proc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
u_int	tcp_autorcvbuf_inc = 16 * 1024;

const struct pr_usrreqs tcp_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};

#ifdef INET6
const struct pr_usrreqs tcp6_usrreqs = {
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in6_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
#endif

static int pr_slowhz = PR_SLOWHZ;
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_SLOWHZ, &pr_slowhz, SYSCTL_INT_READONLY },
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_KEEPINITTIME, &tcptv_keep_init, 1, 3 * TCPTV_KEEP_INIT },
	{ TCPCTL_KEEPIDLE, &tcp_keepidle, 1, 5 * TCPTV_KEEP_IDLE },
	{ TCPCTL_KEEPINTVL, &tcp_keepintvl, 1, 3 * TCPTV_KEEPINTVL },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
};

struct	inpcbtable tcbtable;

int	tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int	tcp_ident(void *, size_t *, void *, size_t, int);

static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
    struct tcpcb **);

static inline int
tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
{
	struct inpcb *inp;
	struct tcpcb *tp;

	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
		if (so->so_error)
			return so->so_error;
		return EINVAL;
	}

	*rinp = inp;
	*rtp = tp;

	return 0;
}

/*
 * Export internal TCP state information via a struct tcp_info without
 * leaking any sensitive information. Sequence numbers are reported
 * relative to the initial sequence number.
 */
int
tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
{
	struct proc *p = curproc;
	struct tcp_info *ti;
	u_int t = 1000000 / PR_SLOWHZ;
	uint32_t now;

	if (sizeof(*ti) > MLEN) {
		MCLGETL(m, M_WAITOK, sizeof(*ti));
		if (!ISSET(m->m_flags, M_EXT))
			return ENOMEM;
	}
	ti = mtod(m, struct tcp_info *);
	m->m_len = sizeof(*ti);
	memset(ti, 0, sizeof(*ti));
	now = READ_ONCE(tcp_now);

	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
#ifdef TCP_ECN
	if (tp->t_flags & TF_ECN_PERMIT)
		ti->tcpi_options |= TCPI_OPT_ECN;
#endif

	ti->tcpi_rto = tp->t_rxtcur * t;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_peermss;

	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;

	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	ti->tcpi_rcv_space = tp->rcv_wnd;

	/*
	 * Provide only minimal information for unprivileged processes.
	 */
	if (suser(p) != 0)
		return 0;

	/* FreeBSD-specific extension fields for tcp_info. */
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
	/* missing tcpi_toe_tid */
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;

	/* OpenBSD extensions */
	ti->tcpi_rttmin = tp->t_rttmin * t;
	ti->tcpi_max_sndwnd = tp->max_sndwnd;
	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
	ti->tcpi_snd_una = tp->snd_una - tp->iss;
	ti->tcpi_snd_up = tp->snd_up - tp->iss;
	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
	ti->tcpi_snd_max = tp->snd_max - tp->iss;

	ti->tcpi_ts_recent = tp->ts_recent;	/* XXX value from the wire */
	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;

	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;

	return 0;
}

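/*
 * Illustrative sketch (not part of the kernel build): the tcp_info filled
 * in by tcp_fill_info() above can be read from userland with getsockopt(2)
 * at level IPPROTO_TCP and option TCP_INFO.  The helper below is only an
 * example; its name is invented and error handling is minimal.  The fields
 * printed are limited to those this file populates for unprivileged
 * callers.
 */
#if 0	/* userland example, not compiled into the kernel */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

static void
print_tcp_info(int s)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	/* Ends up in tcp_ctloutput() -> tcp_fill_info() above. */
	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len) == -1) {
		perror("getsockopt TCP_INFO");
		return;
	}
	printf("state %u rto %u us rtt %u us rttvar %u us cwnd %u mss %u\n",
	    ti.tcpi_state, ti.tcpi_rto, ti.tcpi_rtt, ti.tcpi_rttvar,
	    ti.tcpi_snd_cwnd, ti.tcpi_snd_mss);
}
#endif
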
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case TCP_NODELAY:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->sack_enable;
			break;
		case TCP_INFO:
			error = tcp_fill_info(tp, so, m);
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

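/*
 * Illustrative sketch (not compiled here): the PRCO_SETOPT path above is
 * what a userland setsockopt(2) call at level IPPROTO_TCP ends up in.
 * Disabling the Nagle algorithm via TCP_NODELAY, for example, simply sets
 * TF_NODELAY on the tcpcb.  The helper name below is invented for
 * illustration.
 */
#if 0	/* userland example */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
disable_nagle(int s)
{
	int on = 1;

	/* Handled by the TCP_NODELAY case in tcp_ctloutput() above. */
	return setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
}
#endif
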
/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}

int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp;
	int error = 0;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (error);
}

/*
 * Give the socket an address.
 */
int
tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	error = in_pcbbind(inp, nam, p);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
	return (error);
}

/*
 * Prepare to accept connections.
 */
int
tcp_listen(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	if (inp->inp_lport == 0)
		if ((error = in_pcbbind(inp, NULL, curproc)))
			goto out;

	/*
	 * If the in_pcbbind() above is called, the tp->pf
	 * should still be whatever it was before.
	 */
	tp->t_state = TCPS_LISTEN;

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
	return (error);
}

/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
 */
int
tcp_connect(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

#ifdef INET6
	if (inp->inp_flags & INP_IPV6) {
		struct sockaddr_in6 *sin6;

		if ((error = in6_nam2sin6(nam, &sin6)))
			goto out;
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
			error = EINVAL;
			goto out;
		}
		error = in6_pcbconnect(inp, nam);
	} else
#endif /* INET6 */
	{
		struct sockaddr_in *sin;

		if ((error = in_nam2sin(nam, &sin)))
			goto out;
		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
		    IN_MULTICAST(sin->sin_addr.s_addr) ||
		    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
			error = EINVAL;
			goto out;
		}
		error = in_pcbconnect(inp, nam);
	}
	if (error)
		goto out;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		in_pcbdisconnect(inp);
		error = ENOBUFS;
		goto out;
	}

	so->so_state |= SS_CONNECTOUT;

	/* Compute window scaling to request.  */
	tcp_rscale(tp, sb_max);

	soisconnecting(so);
	tcpstat_inc(tcps_connattempt);
	tp->t_state = TCPS_SYN_SENT;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcp_set_iss_tsm(tp);
	tcp_sendseqinit(tp);
	tp->snd_last = tp->snd_una;
	error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
	return (error);
}

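/*
 * Illustrative sketch (not compiled here): tcp_connect() above moves the
 * connection to SYN_SENT and marks the socket connecting via
 * soisconnecting(), so a non-blocking connect(2) from userland typically
 * returns EINPROGRESS and the descriptor becomes writable once the
 * handshake completes.  The helper name and timeout are invented for
 * illustration.
 */
#if 0	/* userland example */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <poll.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static int
connect_nonblock(const char *addr, uint16_t port)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	int s;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);
	sin.sin_port = htons(port);
	inet_pton(AF_INET, addr, &sin.sin_addr);

	s = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);
	if (s == -1)
		return -1;
	if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) == -1 &&
	    errno != EINPROGRESS) {
		close(s);
		return -1;
	}
	/* Wait for the SYN_SENT -> ESTABLISHED transition (writability). */
	pfd.fd = s;
	pfd.events = POLLOUT;
	(void)poll(&pfd, 1, 5000);
	return s;
}
#endif
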
/*
 * Accept a connection.  Essentially all the work is done at higher
 * levels; just return the address of the peer, storing through addr.
 */
int
tcp_accept(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setpeeraddr(inp, nam);
	else
#endif
		in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0);
	return (error);
}

/*
 * Initiate disconnect from peer.
 * If connection never passed embryonic stage, just drop;
 * else if don't need to let data drain, then can just drop anyways,
 * else have to begin TCP shutdown process: mark socket disconnecting,
 * drain unread data, state switch to reflect user close, and
 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
 * when peer sends FIN and acks ours.
 *
 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
 */
int
tcp_disconnect(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
	return (0);
}

/*
 * Mark the connection as being incapable of further output.
 */
int
tcp_shutdown(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	if (so->so_state & SS_CANTSENDMORE)
		goto out;

	socantsendmore(so);
	tp = tcp_usrclosed(tp);
	if (tp)
		error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
	return (error);
}

/*
 * After a receive, possibly send window update to peer.
 */
void
tcp_rcvd(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/*
	 * soreceive() calls this function when a user receives
	 * ancillary data on a listening socket. We don't call
	 * tcp_output in such a case, since there is no header
	 * template for a listening socket and hence the kernel
	 * will panic.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
		(void) tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
}

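/*
 * Illustrative sketch (not compiled here): tcp_shutdown() above is the
 * handler behind shutdown(2) with SHUT_WR.  It sends a FIN via
 * tcp_usrclosed()/tcp_output() but leaves the receive side usable, so a
 * client can half-close and still drain the peer's reply.  The helper
 * name is invented for illustration.
 */
#if 0	/* userland example */
#include <sys/socket.h>
#include <unistd.h>

static ssize_t
send_request_and_halfclose(int s, const void *req, size_t reqlen,
    void *reply, size_t replylen)
{
	if (write(s, req, reqlen) == -1)
		return -1;
	/* Reaches tcp_shutdown()/tcp_usrclosed(): the FIN goes out now. */
	if (shutdown(s, SHUT_WR) == -1)
		return -1;
	return read(s, reply, replylen);
}
#endif
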
/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.
 */
int
tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if (control && control->m_len) {
		error = EINVAL;
		goto out;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto out;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	sbappendstream(so, &so->so_snd, m);
	m = NULL;

	error = tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}

/*
 * Abort the TCP.
 */
int
tcp_abort(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_drop(tp, ECONNABORTED);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
	return (0);
}

int
tcp_sense(struct socket *so, struct stat *ub)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	ub->st_blksize = so->so_snd.sb_hiwat;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
	return (0);
}

int
tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if ((so->so_oobmark == 0 &&
	    (so->so_state & SS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    tp->t_oobflags & TCPOOB_HADDATA) {
		error = EINVAL;
		goto out;
	}
	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = tp->t_iobc;
	if ((flags & MSG_PEEK) == 0)
		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
	return (error);
}

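/*
 * Illustrative sketch (not compiled here): tcp_rcvoob() above serves
 * recv(2) with MSG_OOB (unless SO_OOBINLINE is set), and tcp_sendoob()
 * below serves send(2) with MSG_OOB.  Only a single out-of-band byte is
 * kept (t_iobc).  The helper names are invented for illustration.
 */
#if 0	/* userland example */
#include <sys/socket.h>

static int
send_oob_byte(int s, char c)
{
	/* Goes through tcp_sendoob(); snd_up is advanced past the byte. */
	return send(s, &c, 1, MSG_OOB);
}

static int
recv_oob_byte(int s, char *c)
{
	/* Goes through tcp_rcvoob() once the urgent byte has arrived. */
	return recv(s, c, 1, MSG_OOB);
}
#endif
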
int
tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if (control && control->m_len) {
		error = EINVAL;
		goto release;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto release;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	if (sbspace(so, &so->so_snd) < -512) {
		error = ENOBUFS;
		goto out;
	}

	/*
	 * According to RFC961 (Assigned Protocols),
	 * the urgent pointer points to the last octet
	 * of urgent data.  We continue, however,
	 * to consider it to indicate the first octet
	 * of data past the urgent section.
	 * Otherwise, snd_up should be one lower.
	 */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;
	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
	tp->t_force = 1;
	error = tcp_output(tp);
	tp->t_force = 0;

out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);

release:
	m_freem(control);
	m_freem(m);

	return (error);
}

int
tcp_sockaddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setsockaddr(inp, nam);
	else
#endif
		in_setsockaddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
		    PRU_SOCKADDR, 0);
	return (0);
}

int
tcp_peeraddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setpeeraddr(inp, nam);
	else
#endif
		in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
		    PRU_PEERADDR, 0);
	return (0);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_dodisconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

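/*
 * Illustrative sketch (not compiled here): the SO_LINGER branch in
 * tcp_dodisconnect() above means that closing a socket which has
 * SO_LINGER enabled with l_linger == 0 drops the connection through
 * tcp_drop() instead of walking the FIN handshake.  The helper name is
 * invented for illustration.
 */
#if 0	/* userland example */
#include <sys/socket.h>
#include <unistd.h>

static void
abortive_close(int s)
{
	struct linger l = { .l_onoff = 1, .l_linger = 0 };

	/* Makes the subsequent close() take the tcp_drop() path above. */
	(void)setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
	(void)close(s);
}
#endif
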
/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcblookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		in_pcbunref(inp);
		return (error);
	}

	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	in_pcbunref(inp);
	return (error);
}

int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);

#undef ASSIGN

	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_REASS_LIMIT:
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit, 0, INT_MAX);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache.  Also update the value in active cache.
			 */
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 1, 100000);
		if (!error && nval != tcp_syn_hash_size) {
			/*
			 * If global hash size has been changed,
			 * switch sets as soon as possible.  Then
			 * the actual hash array will be reallocated.
			 */
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
		}
		NET_UNLOCK();
		return (error);

	default:
		NET_LOCK();
		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
		NET_UNLOCK();
		return (error);
	}
	/* NOTREACHED */
}

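/*
 * Illustrative sketch (not compiled here): the TCPCTL_STATS branch above
 * is how struct tcpstat reaches userland (this is the path netstat uses).
 * A hedged example of reading it directly with sysctl(2); the helper name
 * is invented and the header list may need adjusting.
 */
#if 0	/* userland example */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <stdio.h>

static int
print_tcp_counters(void)
{
	int mib[4] = { CTL_NET, PF_INET, IPPROTO_TCP, TCPCTL_STATS };
	struct tcpstat st;
	size_t len = sizeof(st);

	/* Served by tcp_sysctl() -> tcp_sysctl_tcpstat() above. */
	if (sysctl(mib, 4, &st, &len, NULL, 0) == -1)
		return -1;
	printf("connections initiated: %llu, accepted: %llu\n",
	    (unsigned long long)st.tcps_connattempt,
	    (unsigned long long)st.tcps_accepts);
	return 0;
}
#endif
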
/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window; if the
 * receiver stops acking data the window will shrink and therefore the
 * buffer size will shrink as well.
 * In low memory situations try to shrink the buffer to the initial size,
 * disabling send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		/* keep in sync with sbreserve() calculation */
		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(so, &so->so_snd, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If more than a big part of the recv buffer was
 * transferred during that time, increase the buffer by a constant.
 * In low memory situations try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/* automatic buffer scaling */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(so, &so->so_rcv, nmax);
}
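
/*
 * Illustrative sketch (not compiled here): the sb_wat checks above mean
 * that a buffer size explicitly requested from userland pins the socket
 * buffers and turns automatic scaling off.  The helper name below is
 * invented for illustration.
 */
#if 0	/* userland example */
#include <sys/socket.h>

static int
pin_socket_buffers(int s, int sndbytes, int rcvbytes)
{
	/*
	 * After these calls sb_wat differs from tcp_sendspace/tcp_recvspace,
	 * so tcp_update_sndspace()/tcp_update_rcvspace() stop auto-scaling.
	 */
	if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbytes,
	    sizeof(sndbytes)) == -1)
		return -1;
	return setsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbytes,
	    sizeof(rcvbytes));
}
#endif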