/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");
connection"); 99 100 int tcp_keepidle; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, 102 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 103 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", 104 "time before keepalive probes begin"); 105 106 int tcp_keepintvl; 107 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, 108 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 109 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", 110 "time between keepalive probes"); 111 112 int tcp_delacktime; 113 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 114 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 115 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 116 "Time before a delayed ACK is sent"); 117 118 VNET_DEFINE(int, tcp_msl); 119 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, 120 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, 121 &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I", 122 "Maximum segment lifetime"); 123 124 int tcp_rexmit_initial; 125 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, 126 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 127 &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", 128 "Initial Retransmission Timeout"); 129 130 int tcp_rexmit_min; 131 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, 132 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 133 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 134 "Minimum Retransmission Timeout"); 135 136 int tcp_rexmit_slop; 137 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, 138 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 139 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 140 "Retransmission Timer Slop"); 141 142 VNET_DEFINE(int, tcp_always_keepalive) = 1; 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW, 144 &VNET_NAME(tcp_always_keepalive) , 0, 145 "Assume SO_KEEPALIVE on all TCP connections"); 146 147 int tcp_fast_finwait2_recycle = 0; 148 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 149 &tcp_fast_finwait2_recycle, 0, 150 "Recycle closed FIN_WAIT_2 connections faster"); 151 152 int tcp_finwait2_timeout; 153 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, 154 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 155 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", 156 "FIN-WAIT2 timeout"); 157 158 int tcp_keepcnt = TCPTV_KEEPCNT; 159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 160 "Number of keepalive probes to send"); 161 162 /* max idle probes */ 163 int tcp_maxpersistidle; 164 165 int tcp_rexmit_drop_options = 0; 166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 167 &tcp_rexmit_drop_options, 0, 168 "Drop TCP options from 3rd and later retransmitted SYN"); 169 170 VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 172 CTLFLAG_RW|CTLFLAG_VNET, 173 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 174 "Path MTU Discovery Black Hole Detection Enabled"); 175 176 #ifdef INET 177 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 178 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 179 CTLFLAG_RW|CTLFLAG_VNET, 180 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 181 "Path MTU Discovery Black Hole Detection lowered MSS"); 182 #endif 183 184 #ifdef INET6 185 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 187 CTLFLAG_RW|CTLFLAG_VNET, 188 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 189 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 190 #endif 191 192 #ifdef RSS 193 static int per_cpu_timers = 1; 194 

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

	if (per_cpu_timers) {
#ifdef RSS
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
#endif
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	} else {
		return (0);
	}
}

/*
 * Legacy TCP global callout routine called every 500 ms.
 * Used to clean up timewait states, which lack their own callouts.
 */
static struct callout tcpslow_callout;
static void
tcp_slowtimo(void *arg __unused)
{
	struct epoch_tracker et;
	VNET_ITERATOR_DECL(vnet_iter);

	NET_EPOCH_ENTER(et);
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	NET_EPOCH_EXIT(et);

	callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
	    tcp_slowtimo, NULL, 0);
}

static void
tcp_slowtimo_init(void *arg __unused)
{

	callout_init(&tcpslow_callout, 1);
	callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
	    tcp_slowtimo, NULL, 0);
}
SYSINIT(tcp_timer, SI_SUB_VNET_DONE, SI_ORDER_ANY, tcp_slowtimo_init, NULL);

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
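
/*
 * The sum works out as 1 + 2 + 4 + ... + 256 = 511 for the first nine
 * entries, plus four capped entries of 512 (2048), giving 2559.
 */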

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	NET_EPOCH_ENTER(et);
	(void) tcp_output_unlock(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}
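
/*
 * A note on the callout_pending()/callout_active() test used by every
 * timer handler in this file: callout_pending() is true when the
 * callout was rescheduled after this invocation was already queued,
 * and callout_active() is false when it was stopped; in either case
 * this invocation is stale and must return without touching the tcpcb.
 */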

void
tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
{
	if (inp && tp != NULL)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(tp->t_inpcb);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close, and the connection has
	 * been idle too long, delete the connection control block.
	 * Otherwise, check again in a bit.
	 *
	 * If fast recycling of FIN_WAIT_2 sockets is enabled, we are in
	 * FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto the FIN_WAIT_2 socket.  Just close it.  Ignore
	 * the fact that there were recent incoming segments.
	 */
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		NET_EPOCH_ENTER(et);
		tp = tcp_close(tp);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			NET_EPOCH_ENTER(et);
			tp = tcp_close(tp);
			NET_EPOCH_EXIT(et);
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}
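
/*
 * Keepalive arithmetic, as a worked example under the default sysctl
 * settings: probing starts after TP_KEEPIDLE(tp) of idle time, and the
 * connection is dropped once it has been idle for TP_KEEPIDLE(tp) +
 * TP_MAXIDLE(tp), i.e. after tcp_keepcnt probes spaced tcp_keepintvl
 * apart have gone unanswered.  With the classic defaults (7200 s idle,
 * 75 s interval, 8 probes) a dead peer is detected after
 * 7200 + 8 * 75 = 7800 seconds.
 */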

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			NET_EPOCH_ENTER(et);
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			NET_EPOCH_EXIT(et);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	NET_EPOCH_ENTER(et);
	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	NET_EPOCH_EXIT(et);
	tcp_inpinfo_lock_del(inp, tp);
	CURVNET_RESTORE();
}
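
/*
 * Background for the persist timer below: when the peer advertises a
 * zero window, the sender must periodically probe with one byte of
 * data, because a later window update from the receiver is a bare ACK
 * and is not retransmitted if lost.  Without these probes both sides
 * could wait on each other forever.
 */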

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	int outrv;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		NET_EPOCH_ENTER(et);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		NET_EPOCH_ENTER(et);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
out:
	CURVNET_RESTORE();
}
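
/*
 * Backoff arithmetic used by the retransmit handler below, as a worked
 * example: the timeout for shift n is TCP_REXMTVAL(tp) * tcp_backoff[n],
 * clamped by TCPT_RANGESET() to [t_rttmin, TCPTV_REXMTMAX].  With an
 * RTT-derived base timeout of 200 ms, the third timeout (t_rxtshift ==
 * 3) would be 200 * 8 = 1600 ms, and growth stops once the 512 cap in
 * tcp_backoff[] or TCPTV_REXMTMAX is reached.
 */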

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt, outrv;
	struct inpcb *inp;
	struct epoch_tracker et;
	bool isipv6;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		NET_EPOCH_ENTER(et);
		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks +
			    (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * If we have negotiated timestamps, badrxtwin will
		 * instead be set to the to_tsval that tcp_output()
		 * stamps on the retransmitted packet.
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the PLMTUD blackhole-detection path if the connection
	 * is established or in FIN_WAIT_1; the reason for the latter is
	 * that if the amount of data we send is very small, we could
	 * send it in a couple of packets and proceed straight to FIN,
	 * in which case we never see the ESTABLISHED state.
	 */
#ifdef INET6
	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
#else
	isipv6 = false;
#endif
	if (((V_tcp_pmtud_blackhole_detect == 1) ||
	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
		if (tp->t_rxtshift == 1) {
			/*
			 * We enter blackhole detection after the first
			 * unsuccessful timer-based retransmission.  We
			 * then lower the MSS up to two times, giving each
			 * candidate value two retransmission tries; a
			 * candidate only gets those tries if it actually
			 * reduces the MSS.
			 */
			tp->t_blackhole_enter = 2;
			tp->t_blackhole_exit = tp->t_blackhole_enter;
			if (isipv6) {
#ifdef INET6
				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_v6mssdflt &&
				    V_tcp_v6pmtud_blackhole_mss >
				    V_tcp_v6mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			} else {
#ifdef INET
				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_mssdflt &&
				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			}
		}
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
		    tp->t_rxtshift < tp->t_blackhole_exit &&
		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so restore the previous MSS and the blackhole
			 * detection flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN, to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
out:
	CURVNET_RESTORE();
}
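
/*
 * MSS ladder from the blackhole logic above, as a worked IPv4 example
 * with the defaults (blackhole MSS 1200, V_tcp_mssdflt 536) and an
 * initial MSS of 1460: the first timeout (shift 1) retransmits at
 * 1460, shifts 2-3 retransmit at 1200, shifts 4-5 at 536 with PMTUD
 * disabled, and from shift 6 onward the saved MSS of 1460 and PMTUD
 * are restored.
 */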

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	callout_func_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}

/*
 * Stop the timer from running, and set a flag in the timer flags that
 * forces the timer never to run again.  The flag is needed to ensure
 * that a race does not leave the timer running and let it restart
 * itself (the keep and persist timers especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
	case TT_DELACK:
		t_flags = TT_DELACK_SUS;
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_flags = TT_REXMT_SUS;
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_flags = TT_PERSIST_SUS;
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_flags = TT_KEEP_SUS;
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_flags = TT_2MSL_SUS;
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}

void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* A delayed ACK is due; activate the timer. */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data; activate the timer. */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persist timer. */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    ((tp->t_inpcb->inp_socket == NULL) ||
			    (tp->t_inpcb->inp_socket->so_rcv.sb_state &
			    SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer. */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}

static void
tcp_timer_discard(void *ptp)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;

	tp = (struct tcpcb *)ptp;
	CURVNET_SET(tp->t_vnet);
	NET_EPOCH_ENTER(et);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
	    __func__, tp));
	INP_WLOCK(inp);
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
	    ("%s: tcpcb has to be stopped here", __func__));
	if (--tp->t_timers->tt_draincnt > 0 ||
	    tcp_freecb(tp) == false)
		INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout; defer the actual tcpcb deletion
		 * to the last handler to run.  We arrange this with
		 * callout_async_drain() and by counting the outstanding
		 * drains in tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}