1 /* $OpenBSD: tcp_usrreq.c,v 1.227 2023/12/03 20:24:17 bluhm Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/proc.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef INET6
#include <netinet6/in6_var.h>
#endif

/*
 * Default socket buffer reservations used by tcp_attach(); may be
 * overridden at build time via the TCP_SENDSPACE/TCP_RECVSPACE defines.
 */
#ifndef TCP_SENDSPACE
#define TCP_SENDSPACE	1024*16
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define TCP_RECVSPACE	1024*16
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
/* Step size when growing the receive buffer automatically. */
u_int	tcp_autorcvbuf_inc = 16 * 1024;

/* User-request dispatch table for IPv4 TCP sockets. */
const struct pr_usrreqs tcp_usrreqs = {
	.pru_attach = tcp_attach,
	.pru_detach = tcp_detach,
	.pru_bind = tcp_bind,
	.pru_listen = tcp_listen,
	.pru_connect = tcp_connect,
	.pru_accept = tcp_accept,
	.pru_disconnect = tcp_disconnect,
	.pru_shutdown = tcp_shutdown,
	.pru_rcvd = tcp_rcvd,
	.pru_send = tcp_send,
	.pru_abort = tcp_abort,
	.pru_sense = tcp_sense,
	.pru_rcvoob = tcp_rcvoob,
	.pru_sendoob = tcp_sendoob,
	.pru_control = in_control,
	.pru_sockaddr = tcp_sockaddr,
	.pru_peeraddr = tcp_peeraddr,
};

#ifdef INET6
/* Dispatch table for IPv6 TCP sockets; only pru_control differs. */
const struct pr_usrreqs tcp6_usrreqs = {
	.pru_attach =
	    tcp_attach,
	.pru_detach = tcp_detach,
	.pru_bind = tcp_bind,
	.pru_listen = tcp_listen,
	.pru_connect = tcp_connect,
	.pru_accept = tcp_accept,
	.pru_disconnect = tcp_disconnect,
	.pru_shutdown = tcp_shutdown,
	.pru_rcvd = tcp_rcvd,
	.pru_send = tcp_send,
	.pru_abort = tcp_abort,
	.pru_sense = tcp_sense,
	.pru_rcvoob = tcp_rcvoob,
	.pru_sendoob = tcp_sendoob,
	.pru_control = in6_control,
	.pru_sockaddr = tcp_sockaddr,
	.pru_peeraddr = tcp_peeraddr,
};
#endif

/* Allowed ranges for the integer-valued TCP sysctl variables. */
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
	{ TCPCTL_TSO, &tcp_do_tso, 0, 1 },
};

/* Table of all TCP protocol control blocks. */
struct inpcbtable tcbtable;

int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int tcp_ident(void *, size_t *, void *, size_t, int);

static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
    struct tcpcb **);

/*
 * Fetch the inpcb and tcpcb attached to a socket.  Returns the pending
 * socket error, or EINVAL, if the protocol state has been detached.
 */
static inline int
tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
{
	struct inpcb *inp;
	struct tcpcb *tp;

	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
		if (so->so_error)
			return so->so_error;
		return EINVAL;
	}

	*rinp = inp;
	*rtp = tp;

	return 0;
}

/*
 * Export internal TCP state information via a struct tcp_info without
 * leaking any sensitive information. Sequence numbers are reported
 * relative to the initial sequence number.
 */
int
tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
{
	struct proc *p = curproc;
	struct tcp_info *ti;
	u_int t = 1000;		/* msec => usec */
	uint64_t now;

	/* struct tcp_info may not fit into a plain mbuf; get a cluster. */
	if (sizeof(*ti) > MLEN) {
		MCLGETL(m, M_WAITOK, sizeof(*ti));
		if (!ISSET(m->m_flags, M_EXT))
			return ENOMEM;
	}
	ti = mtod(m, struct tcp_info *);
	m->m_len = sizeof(*ti);
	memset(ti, 0, sizeof(*ti));
	now = tcp_now();

	ti->tcpi_state = tp->t_state;
	/* An option counts as active only if requested and received. */
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
#ifdef TCP_ECN
	if (tp->t_flags & TF_ECN_PERMIT)
		ti->tcpi_options |= TCPI_OPT_ECN;
#endif

	ti->tcpi_rto = tp->t_rxtcur * t;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_peermss;

	/* Timestamps are reported as microseconds before "now". */
	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;

	/* Undo the fixed-point scaling of the smoothed RTT values. */
	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	ti->tcpi_rcv_space = tp->rcv_wnd;

	/*
	 * Provide only minimal information for unprivileged processes.
	 */
	if (suser(p) != 0)
		return 0;

	/* FreeBSD-specific extension fields for tcp_info. */
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
	/* missing tcpi_toe_tid */
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;

	/* OpenBSD extensions */
	ti->tcpi_rttmin = tp->t_rttmin * t;
	ti->tcpi_max_sndwnd = tp->max_sndwnd;
	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
	ti->tcpi_snd_una = tp->snd_una - tp->iss;
	ti->tcpi_snd_up = tp->snd_up - tp->iss;
	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
	ti->tcpi_snd_max = tp->snd_max - tp->iss;

	ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;

	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;

	return 0;
}

/*
 * Handle TCP-level socket options; requests for any other level are
 * passed down to the IP layer.
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	if (level != IPPROTO_TCP) {
#ifdef INET6
		if (ISSET(inp->inp_flags,
		    INP_IPV6))
			error = ip6_ctloutput(op, so, level, optname, m);
		else
#endif /* INET6 */
			error = ip_ctloutput(op, so, level, optname, m);
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				tp->t_flags &= ~TF_NOPUSH;
				/* Push out any data held back by NOPUSH. */
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			i = *mtod(m, int *);
			/* The maximum segment size may only be lowered. */
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* Too late to change once a connection exists. */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* SACK cannot be combined with MD5 signatures. */
			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				tp->t_flags |= TF_SIGNATURE;
				/* Enabling signatures turns off SACK. */
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case TCP_NODELAY:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->sack_enable;
			break;
		case TCP_INFO:
			error = tcp_fill_info(tp, so, m);
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto, int wait)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	/* Reserve default buffer space unless already configured. */
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable, wait);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp, wait);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely.
	 */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	/* SO_LINGER with zero time gets the default linger period. */
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}

/*
 * Detach the TCP protocol state from the socket, initiating a
 * disconnect if the connection is past the embryonic stage.
 */
int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp;
	int error = 0;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (error);
}

/*
 * Give the socket an address.
 */
int
tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	error = in_pcbbind(inp, nam, p);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
	return (error);
}

/*
 * Prepare to accept connections.
 */
int
tcp_listen(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* An unbound socket gets an ephemeral port first. */
	if (inp->inp_lport == 0)
		if ((error = in_pcbbind(inp, NULL, curproc)))
			goto out;

	/*
	 * If the in_pcbbind() above is called, the tp->pf
	 * should still be whatever it was before.
	 */
	tp->t_state = TCPS_LISTEN;

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
	return (error);
}

/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
 */
int
tcp_connect(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

#ifdef INET6
	if (inp->inp_flags & INP_IPV6) {
		struct sockaddr_in6 *sin6;

		/* Reject unspecified and multicast destinations. */
		if ((error = in6_nam2sin6(nam, &sin6)))
			goto out;
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
			error = EINVAL;
			goto out;
		}
	} else
#endif /* INET6 */
	{
		struct sockaddr_in *sin;

		/* Reject wildcard, broadcast and multicast destinations. */
		if ((error = in_nam2sin(nam, &sin)))
			goto out;
		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
		    IN_MULTICAST(sin->sin_addr.s_addr) ||
		    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
			error = EINVAL;
			goto out;
		}
	}
	error =
	    in_pcbconnect(inp, nam);
	if (error)
		goto out;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		/* Undo the pcb connection set up above. */
		in_pcbunset_faddr(inp);
		in_pcbdisconnect(inp);
		error = ENOBUFS;
		goto out;
	}

	so->so_state |= SS_CONNECTOUT;

	/* Compute window scaling to request.  */
	tcp_rscale(tp, sb_max);

	soisconnecting(so);
	tcpstat_inc(tcps_connattempt);
	tp->t_state = TCPS_SYN_SENT;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcp_set_iss_tsm(tp);
	tcp_sendseqinit(tp);
	tp->snd_last = tp->snd_una;
	error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher
 * levels; just return the address of the peer, storing through addr.
 */
int
tcp_accept(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setpeeraddr(inp, nam);
	else
#endif
		in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0);
	return (error);
}

/*
 * Initiate disconnect from peer.
 * If connection never passed embryonic stage, just drop;
 * else if don't need to let data drain, then can just drop anyways,
 * else have to begin TCP shutdown process: mark socket disconnecting,
 * drain unread data, state switch to reflect user close, and
 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
 * when peer sends FIN and acks ours.
 *
 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
 */
int
tcp_disconnect(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
	return (0);
}

/*
 * Mark the connection as being incapable of further output.
 */
int
tcp_shutdown(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* Nothing to do if the send side is already shut down. */
	if (so->so_snd.sb_state & SS_CANTSENDMORE)
		goto out;

	socantsendmore(so);
	tp = tcp_usrclosed(tp);
	if (tp)
		error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
	return (error);
}

/*
 * After a receive, possibly send window update to peer.
 */
void
tcp_rcvd(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/*
	 * soreceive() calls this function when a user receives
	 * ancillary data on a listening socket. We don't call
	 * tcp_output in such a case, since there is no header
	 * template for a listening socket and hence the kernel
	 * will panic.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
		(void) tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.
 */
int
tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	/* TCP takes no control data on send. */
	if (control && control->m_len) {
		error = EINVAL;
		goto out;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto out;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/* The send buffer takes over the mbuf chain; don't free it below. */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;

	error = tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}

/*
 * Abort the TCP.
 */
void
tcp_abort(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_drop(tp, ECONNABORTED);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
}

/*
 * Report status; only the send buffer high-water mark is filled in,
 * as the stat(2) block size.
 */
int
tcp_sense(struct socket *so, struct stat *ub)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	ub->st_blksize = so->so_snd.sb_hiwat;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
	return (0);
}

/*
 * Receive the single byte of out-of-band data, if any is pending.
 */
int
tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	/* OOB data must exist, not be inline, and not be consumed yet. */
	if ((so->so_oobmark == 0 &&
	    (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    tp->t_oobflags & TCPOOB_HADDATA) {
		error = EINVAL;
		goto out;
	}
	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = tp->t_iobc;
	/* Unless peeking, flip HAVEDATA off and HADDATA on. */
	if ((flags & MSG_PEEK) == 0)
		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
	return (error);
}

/*
 * Send out-of-band data and advance the urgent pointer.
 */
int
tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if (control && control->m_len) {
		error = EINVAL;
		goto release;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto release;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/* Allow the send buffer to overshoot by up to 512 bytes. */
	if (sbspace(so, &so->so_snd) < -512) {
		error = ENOBUFS;
		goto out;
	}

	/*
	 * According to RFC961 (Assigned Protocols),
	 * the urgent pointer points to the last octet
	 * of urgent data.  We continue, however,
	 * to consider it to indicate the first octet
	 * of data past the urgent section.
	 * Otherwise, snd_up should be one lower.
	 */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;
	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
	/* Force out a segment even with a closed window. */
	tp->t_force = 1;
	error = tcp_output(tp);
	tp->t_force = 0;

out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);

release:
	m_freem(control);
	m_freem(m);

	return (error);
}

/*
 * Return the local address of the socket.
 */
int
tcp_sockaddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setsockaddr(inp, nam);
	else
#endif
		in_setsockaddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
		    PRU_SOCKADDR, 0);
	return (0);
}

/*
 * Return the address of the connected peer.
 */
int
tcp_peeraddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setpeeraddr(inp, nam);
	else
#endif
		in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
		    PRU_PEERADDR, 0);
	return (0);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_dodisconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		/* Discard anything the user has not read. */
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.
		 * Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/*
	 * tcpdrop (dodrop) passes the mapping in the new buffer;
	 * ident reads it from, and writes the result to, the old buffer.
	 */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
break; 1173 #endif 1174 case AF_INET: 1175 inp = in_pcblookup(&tcbtable, fin->sin_addr, 1176 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 1177 break; 1178 default: 1179 unhandled_af(tir.faddr.ss_family); 1180 } 1181 1182 if (dodrop) { 1183 if (inp && (tp = intotcpcb(inp)) && 1184 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 1185 tp = tcp_drop(tp, ECONNABORTED); 1186 else 1187 error = ESRCH; 1188 in_pcbunref(inp); 1189 return (error); 1190 } 1191 1192 if (inp == NULL) { 1193 tcpstat_inc(tcps_pcbhashmiss); 1194 switch (tir.faddr.ss_family) { 1195 #ifdef INET6 1196 case AF_INET6: 1197 inp = in6_pcblookup_listen(&tcbtable, 1198 &l6, lin6->sin6_port, NULL, tir.rdomain); 1199 break; 1200 #endif 1201 case AF_INET: 1202 inp = in_pcblookup_listen(&tcbtable, 1203 lin->sin_addr, lin->sin_port, NULL, tir.rdomain); 1204 break; 1205 } 1206 } 1207 1208 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 1209 tir.ruid = inp->inp_socket->so_ruid; 1210 tir.euid = inp->inp_socket->so_euid; 1211 } else { 1212 tir.ruid = -1; 1213 tir.euid = -1; 1214 } 1215 1216 *oldlenp = sizeof (tir); 1217 error = copyout((void *)&tir, oldp, sizeof (tir)); 1218 in_pcbunref(inp); 1219 return (error); 1220 } 1221 1222 int 1223 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 1224 { 1225 uint64_t counters[tcps_ncounters]; 1226 struct tcpstat tcpstat; 1227 struct syn_cache_set *set; 1228 int i = 0; 1229 1230 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 1231 1232 memset(&tcpstat, 0, sizeof tcpstat); 1233 counters_read(tcpcounters, counters, nitems(counters), NULL); 1234 ASSIGN(tcps_connattempt); 1235 ASSIGN(tcps_accepts); 1236 ASSIGN(tcps_connects); 1237 ASSIGN(tcps_drops); 1238 ASSIGN(tcps_conndrops); 1239 ASSIGN(tcps_closed); 1240 ASSIGN(tcps_segstimed); 1241 ASSIGN(tcps_rttupdated); 1242 ASSIGN(tcps_delack); 1243 ASSIGN(tcps_timeoutdrop); 1244 ASSIGN(tcps_rexmttimeo); 1245 ASSIGN(tcps_persisttimeo); 1246 ASSIGN(tcps_persistdrop); 
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);
	ASSIGN(tcps_outswtso);
	ASSIGN(tcps_outhwtso);
	ASSIGN(tcps_outpkttso);
	ASSIGN(tcps_outbadtso);
	ASSIGN(tcps_inswlro);
	ASSIGN(tcps_inhwlro);
	ASSIGN(tcps_inpktlro);
	ASSIGN(tcps_inbadlro);

#undef ASSIGN

	/*
	 * Overwrite the SYN-cache fields assigned above with live values
	 * taken from the currently active cache set, under the cache
	 * mutex so the set cannot change underneath us.
	 */
	mtx_enter(&syn_cache_mtx);
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
	tcpstat.tcps_sc_bucket_maxlen = 0;
	/* Report the longest bucket chain in the active hash array. */
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
	tcpstat.tcps_sc_uses_left = set->scs_use;
	mtx_leave(&syn_cache_mtx);

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, nval;

	/* All sysctl names at this level are terminal
	 */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_KEEPINITTIME:
		/*
		 * Keepalive-style timers: the user-visible value is the
		 * internal one divided by TCP_TIME(1); writes convert
		 * back with TCP_TIME().  Bounds are expressed in the
		 * same user units.
		 */
		NET_LOCK();
		nval = tcptv_keep_init / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
		if (!error)
			tcptv_keep_init = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPIDLE:
		NET_LOCK();
		nval = tcp_keepidle / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
		if (!error)
			tcp_keepidle = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_KEEPINTVL:
		NET_LOCK();
		nval = tcp_keepintvl / TCP_TIME(1);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
		    1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
		if (!error)
			tcp_keepintvl = TCP_TIME(nval);
		NET_UNLOCK();
		return (error);

	case TCPCTL_BADDYNAMIC:
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    baddynamicports.tcp, sizeof(baddynamicports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_ROOTONLY:
		/* The root-only port table may not be changed at securelevel > 0. */
		if (newp && securelevel > 0)
			return (EPERM);
		NET_LOCK();
		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    rootonlyports.tcp, sizeof(rootonlyports.tcp));
		NET_UNLOCK();
		return (error);

	case TCPCTL_IDENT:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
		NET_UNLOCK();
		return (error);

	case TCPCTL_DROP:
		NET_LOCK();
		error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
		NET_UNLOCK();
		return (error);

	case TCPCTL_REASS_LIMIT:
		/*
		 * Apply the new value to the pool hard limit first and
		 * only commit it to the global if that succeeded.
		 */
		NET_LOCK();
		nval = tcp_reass_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_reass_limit) {
			error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
			if (!error)
				tcp_reass_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SACKHOLE_LIMIT:
		/* Same pattern as TCPCTL_REASS_LIMIT, for the SACK hole pool. */
		NET_LOCK();
		nval = tcp_sackhole_limit;
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
		if (!error && nval != tcp_sackhole_limit) {
			error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
			if (!error)
				tcp_sackhole_limit = nval;
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		NET_LOCK();
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &tcp_syn_use_limit, 0, INT_MAX);
		if (!error && newp != NULL) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache. Also update the value in active cache.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
			if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
				tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		NET_LOCK();
		nval = tcp_syn_hash_size;
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 1, 100000);
		if (!error && nval != tcp_syn_hash_size) {
			/*
			 * If global hash size has been changed,
			 * switch sets as soon as possible.  Then
			 * the actual hash array will be reallocated.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
			mtx_leave(&syn_cache_mtx);
		}
		NET_UNLOCK();
		return (error);

	default:
		/* Remaining integer variables are table-driven with bounds. */
		NET_LOCK();
		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
		NET_UNLOCK();
		return (error);
	}
	/* NOTREACHED */
}

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit.  The buffer will scale with the congestion window: if the
 * receiver stops acking data the window will shrink and therefore
 * the buffer size will shrink as well.
 * In low memory situation try to shrink the buffer to the initial size
 * disabling the send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	else
		/* automatic buffer scaling: watermark plus in-flight data */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		/* keep in sync with sbreserve() calculation */
		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
	}

	/* round to MSS
	   boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	/* only touch the socket buffer if the size actually changes */
	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(so, &so->so_snd, nmax);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT.  If more than a big part of the recv buffer was
 * transferred during that time we increase the buffer by a constant.
 * In low memory situation try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace)
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	else {
		/*
		 * automatic buffer scaling: grow by a constant once more
		 * than 7/8 of the current buffer was used within one
		 * approximated RTT (rfbuf_cnt)
		 */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	/*
	 * NOTE(review): the lower bound below uses so_snd.sb_lowat, not
	 * so_rcv.sb_lowat — this matches the code as written; confirm the
	 * send low-water mark is intended here.
	 */
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;

	if (nmax == so->so_rcv.sb_hiwat)
		return;

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);
	sbreserve(so, &so->so_rcv, nmax);
}