1 /* $OpenBSD: tcp_usrreq.c,v 1.217 2023/03/14 00:24:05 yasuoka Exp $ */ 2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/protosw.h> 77 #include <sys/stat.h> 78 #include <sys/sysctl.h> 79 #include <sys/domain.h> 80 #include <sys/kernel.h> 81 #include <sys/pool.h> 82 #include <sys/proc.h> 83 84 #include <net/if.h> 85 #include <net/if_var.h> 86 #include <net/route.h> 87 88 #include <netinet/in.h> 89 #include <netinet/in_var.h> 90 #include <netinet/ip.h> 91 #include <netinet/in_pcb.h> 92 #include <netinet/ip_var.h> 93 #include <netinet/tcp.h> 94 #include <netinet/tcp_fsm.h> 95 #include <netinet/tcp_seq.h> 96 #include <netinet/tcp_timer.h> 97 #include <netinet/tcp_var.h> 98 #include <netinet/tcp_debug.h> 99 100 #ifdef INET6 101 #include <netinet6/in6_var.h> 102 #endif 103 104 #ifndef TCP_SENDSPACE 105 #define TCP_SENDSPACE 1024*16 106 #endif 107 u_int tcp_sendspace = TCP_SENDSPACE; 108 #ifndef TCP_RECVSPACE 109 #define TCP_RECVSPACE 1024*16 110 #endif 111 u_int tcp_recvspace = TCP_RECVSPACE; 112 u_int tcp_autorcvbuf_inc = 16 * 1024; 113 114 const struct pr_usrreqs tcp_usrreqs = { 115 .pru_attach = tcp_attach, 116 .pru_detach = tcp_detach, 117 .pru_bind = tcp_bind, 118 .pru_listen = tcp_listen, 119 .pru_connect = tcp_connect, 120 .pru_accept = tcp_accept, 121 .pru_disconnect = tcp_disconnect, 122 .pru_shutdown = tcp_shutdown, 123 .pru_rcvd = tcp_rcvd, 124 .pru_send = tcp_send, 125 .pru_abort = tcp_abort, 126 .pru_sense = tcp_sense, 127 .pru_rcvoob = tcp_rcvoob, 128 .pru_sendoob = tcp_sendoob, 129 .pru_control = in_control, 130 .pru_sockaddr = tcp_sockaddr, 131 .pru_peeraddr = tcp_peeraddr, 132 }; 133 134 #ifdef INET6 135 const struct pr_usrreqs tcp6_usrreqs = { 136 .pru_attach = 
tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,
	.pru_rcvd	= tcp_rcvd,
	.pru_send	= tcp_send,
	.pru_abort	= tcp_abort,
	.pru_sense	= tcp_sense,
	.pru_rcvoob	= tcp_rcvoob,
	.pru_sendoob	= tcp_sendoob,
	.pru_control	= in6_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
#endif

/*
 * Bounds (min/max) for the plain-integer TCP sysctl variables that are
 * handled generically by sysctl_bounded_arr().
 */
const struct sysctl_bounded_args tcpctl_vars[] = {
	{ TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
	{ TCPCTL_SACK, &tcp_do_sack, 0, 1 },
	{ TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
	{ TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
	{ TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
#ifdef TCP_ECN
	{ TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
#endif
	{ TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
	{ TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
	{ TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
	{ TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
};

/* Global table of all TCP protocol control blocks. */
struct inpcbtable tcbtable;

int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
int tcp_ident(void *, size_t *, void *, size_t, int);

static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
    struct tcpcb **);

/*
 * Resolve a socket to its inpcb and tcpcb.  Returns 0 and fills in
 * *rinp/*rtp on success; on a detached socket returns the pending
 * so_error if any, otherwise EINVAL.
 */
static inline int
tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
{
	struct inpcb *inp;
	struct tcpcb *tp;

	/*
	 * When a TCP is attached to a socket, then there will be
	 * a (struct inpcb) pointed at by the socket, and this
	 * structure will point at a subsidiary (struct tcpcb).
	 */
	if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
		if (so->so_error)
			return so->so_error;
		return EINVAL;
	}

	*rinp = inp;
	*rtp = tp;

	return 0;
}

/*
 * Export internal TCP state information via a struct tcp_info without
 * leaking any sensitive information.  Sequence numbers are reported
 * relative to the initial sequence number.
 */
int
tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
{
	struct proc *p = curproc;
	struct tcp_info *ti;
	u_int t = 1000;			/* msec => usec */
	uint32_t now;

	/* struct tcp_info may not fit in a regular mbuf; get a cluster. */
	if (sizeof(*ti) > MLEN) {
		MCLGETL(m, M_WAITOK, sizeof(*ti));
		if (!ISSET(m->m_flags, M_EXT))
			return ENOMEM;
	}
	ti = mtod(m, struct tcp_info *);
	m->m_len = sizeof(*ti);
	memset(ti, 0, sizeof(*ti));
	now = tcp_now();

	ti->tcpi_state = tp->t_state;
	/* option bits are reported only when negotiated by both sides */
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
#ifdef TCP_ECN
	if (tp->t_flags & TF_ECN_PERMIT)
		ti->tcpi_options |= TCPI_OPT_ECN;
#endif

	ti->tcpi_rto = tp->t_rxtcur * t;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_peermss;

	/* event timestamps are reported as "microseconds ago" */
	ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
	ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
	ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
	ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;

	/* srtt/rttvar are kept scaled; shift down to plain microseconds */
	ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
	    (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
	    (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	ti->tcpi_rcv_space = tp->rcv_wnd;

	/*
	 * Provide only minimal information for unprivileged processes.
	 */
	if (suser(p) != 0)
		return 0;

	/* FreeBSD-specific extension fields for tcp_info. */
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
	ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
	/* missing tcpi_toe_tid */
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;

	/* OpenBSD extensions */
	ti->tcpi_rttmin = tp->t_rttmin * t;
	ti->tcpi_max_sndwnd = tp->max_sndwnd;
	ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
	ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
	ti->tcpi_snd_una = tp->snd_una - tp->iss;
	ti->tcpi_snd_up = tp->snd_up - tp->iss;
	ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
	ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
	ti->tcpi_snd_max = tp->snd_max - tp->iss;

	ti->tcpi_ts_recent = tp->ts_recent;	/* XXX value from the wire */
	ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
	ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
	ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;

	ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
	ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
	ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
	ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
	ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
	ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
	ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
	ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;

	return 0;
}

/*
 * Get/set TCP-level socket options (setsockopt/getsockopt back end).
 * Options at other levels are forwarded to the IPv4/IPv6 handler.
 */
int
tcp_ctloutput(int op, struct socket *so, int level, int optname,
    struct mbuf *m)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	int i;

	inp = sotoinpcb(so);
	if (inp == NULL)
		return (ECONNRESET);
	/* not for us: hand off to the network-layer option handler */
	if (level != IPPROTO_TCP) {
		switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET6
		case PF_INET6:
			error = ip6_ctloutput(op, so, level, optname, m);
			break;
#endif /* INET6 */
		case PF_INET:
			error = ip_ctloutput(op, so, level, optname, m);
			break;
		default:
			error = EAFNOSUPPORT;	/*?*/
			break;
		}
		return (error);
	}
	tp = intotcpcb(inp);

	switch (op) {

	case PRCO_SETOPT:
		switch (optname) {

		case TCP_NODELAY:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NODELAY;
			else
				tp->t_flags &= ~TF_NODELAY;
			break;

		case TCP_NOPUSH:
			if (m == NULL || m->m_len < sizeof (int))
				error = EINVAL;
			else if (*mtod(m, int *))
				tp->t_flags |= TF_NOPUSH;
			else if (tp->t_flags & TF_NOPUSH) {
				/* clearing NOPUSH may release queued data */
				tp->t_flags &= ~TF_NOPUSH;
				if (TCPS_HAVEESTABLISHED(tp->t_state))
					error = tcp_output(tp);
			}
			break;

		case TCP_MAXSEG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* the MSS may only be lowered, never raised */
			i = *mtod(m, int *);
			if (i > 0 && i <= tp->t_maxseg)
				tp->t_maxseg = i;
			else
				error = EINVAL;
			break;

		case TCP_SACK_ENABLE:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			/* SACK is negotiated at SYN time; too late now */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			/* MD5 signatures and SACK are mutually exclusive */
			if (tp->t_flags & TF_SIGNATURE) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *))
				tp->sack_enable = 1;
			else
				tp->sack_enable = 0;
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			if (m == NULL || m->m_len < sizeof (int)) {
				error = EINVAL;
				break;
			}

			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				error = EPERM;
				break;
			}

			if (*mtod(m, int *)) {
				/* enabling signatures forces SACK off */
				tp->t_flags |= TF_SIGNATURE;
				tp->sack_enable = 0;
			} else
				tp->t_flags &= ~TF_SIGNATURE;
			break;
#endif /* TCP_SIGNATURE */
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		switch (optname) {
		case TCP_NODELAY:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NODELAY;
			break;
		case TCP_NOPUSH:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_NOPUSH;
			break;
		case TCP_MAXSEG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_maxseg;
			break;
		case TCP_SACK_ENABLE:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->sack_enable;
			break;
		case TCP_INFO:
			error = tcp_fill_info(tp, so, m);
			break;
#ifdef TCP_SIGNATURE
		case TCP_MD5SIG:
			m->m_len = sizeof(int);
			*mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
			break;
#endif
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Attach TCP protocol to socket, allocating
 * internet protocol control block, tcp control block,
 * buffer space, and entering LISTEN state to accept connections.
 */
int
tcp_attach(struct socket *so, int proto, int wait)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	int error;

	if (so->so_pcb)
		return EISCONN;
	/* reserve socket buffer space unless already adequately reserved */
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
	    sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
	    sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
		error = soreserve(so, tcp_sendspace, tcp_recvspace);
		if (error)
			return (error);
	}

	NET_ASSERT_LOCKED();
	error = in_pcballoc(so, &tcbtable, wait);
	if (error)
		return (error);
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp, wait);
	if (tp == NULL) {
		unsigned int nofd = so->so_state & SS_NOFDREF;	/* XXX */

		so->so_state &= ~SS_NOFDREF;	/* don't free the socket yet */
		in_pcbdetach(inp);
		so->so_state |= nofd;
		return (ENOBUFS);
	}
	tp->t_state = TCPS_CLOSED;
#ifdef INET6
	/* we disallow IPv4 mapped address completely. */
	if (inp->inp_flags & INP_IPV6)
		tp->pf = PF_INET6;
	else
		tp->pf = PF_INET;
#else
	tp->pf = PF_INET;
#endif
	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		so->so_linger = TCP_LINGERTIME;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
	return (0);
}

/*
 * Detach the TCP protocol from the socket.
 */
int
tcp_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *otp = NULL, *tp;
	int error = 0;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	/* remember pre-call state so the trace shows the transition */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/*
	 * Detach the TCP protocol from the socket.
	 * If the protocol state is non-embryonic, then can't
	 * do this directly: have to initiate a PRU_DISCONNECT,
	 * which may finish later; embryonic TCB's can just
	 * be discarded here.
	 */
	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
	return (error);
}

/*
 * Give the socket an address.
 */
int
tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	error = in_pcbbind(inp, nam, p);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
	return (error);
}

/*
 * Prepare to accept connections.
 */
int
tcp_listen(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* bind to an ephemeral port if the socket is not yet bound */
	if (inp->inp_lport == 0)
		if ((error = in_pcbbind(inp, NULL, curproc)))
			goto out;

	/*
	 * If the in_pcbbind() above is called, the tp->pf
	 * should still be whatever it was before.
	 */
	tp->t_state = TCPS_LISTEN;

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
	return (error);
}

/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
 */
int
tcp_connect(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* validate the destination address, then connect the pcb */
#ifdef INET6
	if (inp->inp_flags & INP_IPV6) {
		struct sockaddr_in6 *sin6;

		if ((error = in6_nam2sin6(nam, &sin6)))
			goto out;
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
			error = EINVAL;
			goto out;
		}
		error = in6_pcbconnect(inp, nam);
	} else
#endif /* INET6 */
	{
		struct sockaddr_in *sin;

		if ((error = in_nam2sin(nam, &sin)))
			goto out;
		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
		    IN_MULTICAST(sin->sin_addr.s_addr) ||
		    in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
			error = EINVAL;
			goto out;
		}
		error = in_pcbconnect(inp, nam);
	}
	if (error)
		goto out;

	tp->t_template = tcp_template(tp);
	if (tp->t_template == 0) {
		/* undo the pcb connect done above before failing */
		in_pcbdisconnect(inp);
		error = ENOBUFS;
		goto out;
	}

	so->so_state |= SS_CONNECTOUT;

	/* Compute window scaling to request. */
	tcp_rscale(tp, sb_max);

	soisconnecting(so);
	tcpstat_inc(tcps_connattempt);
	tp->t_state = TCPS_SYN_SENT;
	TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
	tcp_set_iss_tsm(tp);
	tcp_sendseqinit(tp);
	tp->snd_last = tp->snd_una;
	error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
	return (error);
}

/*
 * Accept a connection.  Essentially all the work is done at higher
 * levels; just return the address of the peer, storing through addr.
 */
int
tcp_accept(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setpeeraddr(inp, nam);
	else
#endif
		in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_ACCEPT, 0);
	return (error);
}

/*
 * Initiate disconnect from peer.
 * If connection never passed embryonic stage, just drop;
 * else if don't need to let data drain, then can just drop anyways,
 * else have to begin TCP shutdown process: mark socket disconnecting,
 * drain unread data, state switch to reflect user close, and
 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
 * when peer sends FIN and acks ours.
 *
 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
 */
int
tcp_disconnect(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_dodisconnect(tp);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
	return (0);
}

/*
 * Mark the connection as being incapable of further output.
 */
int
tcp_shutdown(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	int error;
	short ostate;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	/* nothing to do if the send side was already shut down */
	if (so->so_snd.sb_state & SS_CANTSENDMORE)
		goto out;

	socantsendmore(so);
	/* tcp_usrclosed() may free the tcpcb and return NULL */
	tp = tcp_usrclosed(tp);
	if (tp)
		error = tcp_output(tp);

out:
	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
	return (error);
}

/*
 * After a receive, possibly send window update to peer.
 */
void
tcp_rcvd(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/*
	 * soreceive() calls this function when a user receives
	 * ancillary data on a listening socket.  We don't call
	 * tcp_output in such a case, since there is no header
	 * template for a listening socket and hence the kernel
	 * will panic.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
		(void) tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.
 */
int
tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	/* TCP takes no ancillary data */
	if (control && control->m_len) {
		error = EINVAL;
		goto out;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto out;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/* the send buffer takes ownership of m from here on */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;

	error = tcp_output(tp);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);

out:
	m_freem(control);
	m_freem(m);

	return (error);
}

/*
 * Abort the TCP.
 */
void
tcp_abort(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp, *otp = NULL;
	short ostate;

	soassertlocked(so);

	if (tcp_sogetpcb(so, &inp, &tp))
		return;

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}

	tp = tcp_drop(tp, ECONNABORTED);

	if (otp)
		tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
}

/*
 * fstat(2) on a TCP socket: report the send buffer high-water mark
 * as the preferred block size.
 */
int
tcp_sense(struct socket *so, struct stat *ub)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	ub->st_blksize = so->so_snd.sb_hiwat;

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
	return (0);
}

/*
 * Receive the single byte of out-of-band (urgent) data, if available.
 */
int
tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

	/* no urgent data pending, OOB data is inline, or already consumed */
	if ((so->so_oobmark == 0 &&
	    (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    tp->t_oobflags & TCPOOB_HADDATA) {
		error = EINVAL;
		goto out;
	}
	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = tp->t_iobc;
	/* unless peeking, clear HAVEDATA and set HADDATA (consumed) */
	if ((flags & MSG_PEEK) == 0)
		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
	return (error);
}

/*
 * Send out-of-band (urgent) data: append it to the send buffer and
 * advance the urgent pointer past it.
 */
int
tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
    struct mbuf *control)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;
	short ostate;

	soassertlocked(so);

	if (control && control->m_len) {
		error = EINVAL;
		goto release;
	}

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		goto release;

	if (so->so_options & SO_DEBUG)
		ostate = tp->t_state;

	/* OOB data may overcommit the send buffer slightly, but not by much */
	if (sbspace(so, &so->so_snd) < -512) {
		error = ENOBUFS;
		goto out;
	}

	/*
	 * According to RFC961 (Assigned Protocols),
	 * the urgent pointer points to the last octet
	 * of urgent data.  We continue, however,
	 * to consider it to indicate the first octet
	 * of data past the urgent section.
	 * Otherwise, snd_up should be one lower.
	 */
	sbappendstream(so, &so->so_snd, m);
	m = NULL;
	tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
	/* force output even if the window would otherwise forbid it */
	tp->t_force = 1;
	error = tcp_output(tp);
	tp->t_force = 0;

out:
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);

release:
	m_freem(control);
	m_freem(m);

	return (error);
}

/*
 * Return the local address of the socket.
 */
int
tcp_sockaddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setsockaddr(inp, nam);
	else
#endif
		in_setsockaddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
		    PRU_SOCKADDR, 0);
	return (0);
}

/*
 * Return the address of the connected peer.
 */
int
tcp_peeraddr(struct socket *so, struct mbuf *nam)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	soassertlocked(so);

	if ((error = tcp_sogetpcb(so, &inp, &tp)))
		return (error);

#ifdef INET6
	if (inp->inp_flags & INP_IPV6)
		in6_setpeeraddr(inp, nam);
	else
#endif
		in_setpeeraddr(inp, nam);

	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
		    PRU_PEERADDR, 0);
	return (0);
}

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
struct tcpcb *
tcp_dodisconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);
	else {
		soisdisconnecting(so);
		sbflush(so, &so->so_rcv);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side.  Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2)
			TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	NET_ASSERT_LOCKED();

	/* tcpdrop passes the mapping in newp; ident reads it from oldp */
	if (dodrop) {
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL);
		if (error)
			return EINVAL;	/*?*/
		break;
#endif
	case AF_INET:
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		return (EINVAL);
	}

	/* look up the fully-specified connection first */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcbtable, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcblookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		/* never drop a listening socket */
		if (inp && (tp = intotcpcb(inp)) &&
		    ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;
		in_pcbunref(inp);
		return (error);
	}

	/* fall back to a listening socket on the local address */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcbtable,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	/* report credentials only for outbound connections */
	if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
		tir.ruid = inp->inp_socket->so_ruid;
		tir.euid = inp->inp_socket->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	*oldlenp = sizeof (tir);
	error = copyout((void *)&tir, oldp, sizeof (tir));
	in_pcbunref(inp);
	return (error);
}

/*
 * Copy the per-CPU TCP counters into a struct tcpstat for the
 * net.inet.tcp.stats sysctl.  The ASSIGN() order must match the
 * tcps_* counter enumeration.
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters));
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
ASSIGN(tcps_keeptimeo); 1254 ASSIGN(tcps_keepprobe); 1255 ASSIGN(tcps_keepdrops); 1256 ASSIGN(tcps_sndtotal); 1257 ASSIGN(tcps_sndpack); 1258 ASSIGN(tcps_sndbyte); 1259 ASSIGN(tcps_sndrexmitpack); 1260 ASSIGN(tcps_sndrexmitbyte); 1261 ASSIGN(tcps_sndrexmitfast); 1262 ASSIGN(tcps_sndacks); 1263 ASSIGN(tcps_sndprobe); 1264 ASSIGN(tcps_sndurg); 1265 ASSIGN(tcps_sndwinup); 1266 ASSIGN(tcps_sndctrl); 1267 ASSIGN(tcps_rcvtotal); 1268 ASSIGN(tcps_rcvpack); 1269 ASSIGN(tcps_rcvbyte); 1270 ASSIGN(tcps_rcvbadsum); 1271 ASSIGN(tcps_rcvbadoff); 1272 ASSIGN(tcps_rcvmemdrop); 1273 ASSIGN(tcps_rcvnosec); 1274 ASSIGN(tcps_rcvshort); 1275 ASSIGN(tcps_rcvduppack); 1276 ASSIGN(tcps_rcvdupbyte); 1277 ASSIGN(tcps_rcvpartduppack); 1278 ASSIGN(tcps_rcvpartdupbyte); 1279 ASSIGN(tcps_rcvoopack); 1280 ASSIGN(tcps_rcvoobyte); 1281 ASSIGN(tcps_rcvpackafterwin); 1282 ASSIGN(tcps_rcvbyteafterwin); 1283 ASSIGN(tcps_rcvafterclose); 1284 ASSIGN(tcps_rcvwinprobe); 1285 ASSIGN(tcps_rcvdupack); 1286 ASSIGN(tcps_rcvacktoomuch); 1287 ASSIGN(tcps_rcvacktooold); 1288 ASSIGN(tcps_rcvackpack); 1289 ASSIGN(tcps_rcvackbyte); 1290 ASSIGN(tcps_rcvwinupd); 1291 ASSIGN(tcps_pawsdrop); 1292 ASSIGN(tcps_predack); 1293 ASSIGN(tcps_preddat); 1294 ASSIGN(tcps_pcbhashmiss); 1295 ASSIGN(tcps_noport); 1296 ASSIGN(tcps_badsyn); 1297 ASSIGN(tcps_dropsyn); 1298 ASSIGN(tcps_rcvbadsig); 1299 ASSIGN(tcps_rcvgoodsig); 1300 ASSIGN(tcps_inswcsum); 1301 ASSIGN(tcps_outswcsum); 1302 ASSIGN(tcps_ecn_accepts); 1303 ASSIGN(tcps_ecn_rcvece); 1304 ASSIGN(tcps_ecn_rcvcwr); 1305 ASSIGN(tcps_ecn_rcvce); 1306 ASSIGN(tcps_ecn_sndect); 1307 ASSIGN(tcps_ecn_sndece); 1308 ASSIGN(tcps_ecn_sndcwr); 1309 ASSIGN(tcps_cwr_ecn); 1310 ASSIGN(tcps_cwr_frecovery); 1311 ASSIGN(tcps_cwr_timeout); 1312 ASSIGN(tcps_sc_added); 1313 ASSIGN(tcps_sc_completed); 1314 ASSIGN(tcps_sc_timed_out); 1315 ASSIGN(tcps_sc_overflowed); 1316 ASSIGN(tcps_sc_reset); 1317 ASSIGN(tcps_sc_unreach); 1318 ASSIGN(tcps_sc_bucketoverflow); 1319 ASSIGN(tcps_sc_aborted); 1320 
ASSIGN(tcps_sc_dupesyn); 1321 ASSIGN(tcps_sc_dropped); 1322 ASSIGN(tcps_sc_collisions); 1323 ASSIGN(tcps_sc_retransmitted); 1324 ASSIGN(tcps_sc_seedrandom); 1325 ASSIGN(tcps_sc_hash_size); 1326 ASSIGN(tcps_sc_entry_count); 1327 ASSIGN(tcps_sc_entry_limit); 1328 ASSIGN(tcps_sc_bucket_maxlen); 1329 ASSIGN(tcps_sc_bucket_limit); 1330 ASSIGN(tcps_sc_uses_left); 1331 ASSIGN(tcps_conndrained); 1332 ASSIGN(tcps_sack_recovery_episode); 1333 ASSIGN(tcps_sack_rexmits); 1334 ASSIGN(tcps_sack_rexmit_bytes); 1335 ASSIGN(tcps_sack_rcv_opts); 1336 ASSIGN(tcps_sack_snd_opts); 1337 ASSIGN(tcps_sack_drop_opts); 1338 1339 #undef ASSIGN 1340 1341 set = &tcp_syn_cache[tcp_syn_cache_active]; 1342 tcpstat.tcps_sc_hash_size = set->scs_size; 1343 tcpstat.tcps_sc_entry_count = set->scs_count; 1344 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 1345 tcpstat.tcps_sc_bucket_maxlen = 0; 1346 for (i = 0; i < set->scs_size; i++) { 1347 if (tcpstat.tcps_sc_bucket_maxlen < 1348 set->scs_buckethead[i].sch_length) 1349 tcpstat.tcps_sc_bucket_maxlen = 1350 set->scs_buckethead[i].sch_length; 1351 } 1352 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 1353 tcpstat.tcps_sc_uses_left = set->scs_use; 1354 1355 return (sysctl_rdstruct(oldp, oldlenp, newp, 1356 &tcpstat, sizeof(tcpstat))); 1357 } 1358 1359 /* 1360 * Sysctl for tcp variables. 1361 */ 1362 int 1363 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 1364 size_t newlen) 1365 { 1366 int error, nval; 1367 1368 /* All sysctl names at this level are terminal. 
*/ 1369 if (namelen != 1) 1370 return (ENOTDIR); 1371 1372 switch (name[0]) { 1373 case TCPCTL_KEEPINITTIME: 1374 NET_LOCK(); 1375 nval = tcptv_keep_init / TCP_TIME(1); 1376 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, 1377 1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1))); 1378 if (!error) 1379 tcptv_keep_init = TCP_TIME(nval); 1380 NET_UNLOCK(); 1381 return (error); 1382 1383 case TCPCTL_KEEPIDLE: 1384 NET_LOCK(); 1385 nval = tcp_keepidle / TCP_TIME(1); 1386 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, 1387 1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1))); 1388 if (!error) 1389 tcp_keepidle = TCP_TIME(nval); 1390 NET_UNLOCK(); 1391 return (error); 1392 1393 case TCPCTL_KEEPINTVL: 1394 NET_LOCK(); 1395 nval = tcp_keepintvl / TCP_TIME(1); 1396 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, 1397 1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1))); 1398 if (!error) 1399 tcp_keepintvl = TCP_TIME(nval); 1400 NET_UNLOCK(); 1401 return (error); 1402 1403 case TCPCTL_BADDYNAMIC: 1404 NET_LOCK(); 1405 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1406 baddynamicports.tcp, sizeof(baddynamicports.tcp)); 1407 NET_UNLOCK(); 1408 return (error); 1409 1410 case TCPCTL_ROOTONLY: 1411 if (newp && securelevel > 0) 1412 return (EPERM); 1413 NET_LOCK(); 1414 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1415 rootonlyports.tcp, sizeof(rootonlyports.tcp)); 1416 NET_UNLOCK(); 1417 return (error); 1418 1419 case TCPCTL_IDENT: 1420 NET_LOCK(); 1421 error = tcp_ident(oldp, oldlenp, newp, newlen, 0); 1422 NET_UNLOCK(); 1423 return (error); 1424 1425 case TCPCTL_DROP: 1426 NET_LOCK(); 1427 error = tcp_ident(oldp, oldlenp, newp, newlen, 1); 1428 NET_UNLOCK(); 1429 return (error); 1430 1431 case TCPCTL_REASS_LIMIT: 1432 NET_LOCK(); 1433 nval = tcp_reass_limit; 1434 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1435 if (!error && nval != tcp_reass_limit) { 1436 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1437 if (!error) 1438 tcp_reass_limit = 
nval; 1439 } 1440 NET_UNLOCK(); 1441 return (error); 1442 1443 case TCPCTL_SACKHOLE_LIMIT: 1444 NET_LOCK(); 1445 nval = tcp_sackhole_limit; 1446 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1447 if (!error && nval != tcp_sackhole_limit) { 1448 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1449 if (!error) 1450 tcp_sackhole_limit = nval; 1451 } 1452 NET_UNLOCK(); 1453 return (error); 1454 1455 case TCPCTL_STATS: 1456 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1457 1458 case TCPCTL_SYN_USE_LIMIT: 1459 NET_LOCK(); 1460 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1461 &tcp_syn_use_limit, 0, INT_MAX); 1462 if (!error && newp != NULL) { 1463 /* 1464 * Global tcp_syn_use_limit is used when reseeding a 1465 * new cache. Also update the value in active cache. 1466 */ 1467 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1468 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1469 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1470 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1471 } 1472 NET_UNLOCK(); 1473 return (error); 1474 1475 case TCPCTL_SYN_HASH_SIZE: 1476 NET_LOCK(); 1477 nval = tcp_syn_hash_size; 1478 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1479 &nval, 1, 100000); 1480 if (!error && nval != tcp_syn_hash_size) { 1481 /* 1482 * If global hash size has been changed, 1483 * switch sets as soon as possible. Then 1484 * the actual hash array will be reallocated. 
1485 */ 1486 if (tcp_syn_cache[0].scs_size != nval) 1487 tcp_syn_cache[0].scs_use = 0; 1488 if (tcp_syn_cache[1].scs_size != nval) 1489 tcp_syn_cache[1].scs_use = 0; 1490 tcp_syn_hash_size = nval; 1491 } 1492 NET_UNLOCK(); 1493 return (error); 1494 1495 default: 1496 NET_LOCK(); 1497 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), name, 1498 namelen, oldp, oldlenp, newp, newlen); 1499 NET_UNLOCK(); 1500 return (error); 1501 } 1502 /* NOTREACHED */ 1503 } 1504 1505 /* 1506 * Scale the send buffer so that inflight data is not accounted against 1507 * the limit. The buffer will scale with the congestion window, if the 1508 * the receiver stops acking data the window will shrink and therefore 1509 * the buffer size will shrink as well. 1510 * In low memory situation try to shrink the buffer to the initial size 1511 * disabling the send buffer scaling as long as the situation persists. 1512 */ 1513 void 1514 tcp_update_sndspace(struct tcpcb *tp) 1515 { 1516 struct socket *so = tp->t_inpcb->inp_socket; 1517 u_long nmax = so->so_snd.sb_hiwat; 1518 1519 if (sbchecklowmem()) { 1520 /* low on memory try to get rid of some */ 1521 if (tcp_sendspace < nmax) 1522 nmax = tcp_sendspace; 1523 } else if (so->so_snd.sb_wat != tcp_sendspace) 1524 /* user requested buffer size, auto-scaling disabled */ 1525 nmax = so->so_snd.sb_wat; 1526 else 1527 /* automatic buffer scaling */ 1528 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1529 tp->snd_una); 1530 1531 /* a writable socket must be preserved because of poll(2) semantics */ 1532 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1533 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1534 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1535 /* keep in sync with sbreserve() calculation */ 1536 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1537 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8; 1538 } 1539 1540 /* round to MSS boundary */ 1541 nmax = roundup(nmax, tp->t_maxseg); 1542 1543 if 
(nmax != so->so_snd.sb_hiwat) 1544 sbreserve(so, &so->so_snd, nmax); 1545 } 1546 1547 /* 1548 * Scale the recv buffer by looking at how much data was transferred in 1549 * one approximated RTT. If more than a big part of the recv buffer was 1550 * transferred during that time we increase the buffer by a constant. 1551 * In low memory situation try to shrink the buffer to the initial size. 1552 */ 1553 void 1554 tcp_update_rcvspace(struct tcpcb *tp) 1555 { 1556 struct socket *so = tp->t_inpcb->inp_socket; 1557 u_long nmax = so->so_rcv.sb_hiwat; 1558 1559 if (sbchecklowmem()) { 1560 /* low on memory try to get rid of some */ 1561 if (tcp_recvspace < nmax) 1562 nmax = tcp_recvspace; 1563 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1564 /* user requested buffer size, auto-scaling disabled */ 1565 nmax = so->so_rcv.sb_wat; 1566 else { 1567 /* automatic buffer scaling */ 1568 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1569 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1570 tcp_autorcvbuf_inc); 1571 } 1572 1573 /* a readable socket must be preserved because of poll(2) semantics */ 1574 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1575 nmax < so->so_snd.sb_lowat) 1576 nmax = so->so_snd.sb_lowat; 1577 1578 if (nmax == so->so_rcv.sb_hiwat) 1579 return; 1580 1581 /* round to MSS boundary */ 1582 nmax = roundup(nmax, tp->t_maxseg); 1583 sbreserve(so, &so->so_rcv, nmax); 1584 } 1585