1 /* $OpenBSD: tcp_timer.c,v 1.81 2025/01/14 13:49:44 bluhm Exp $ */ 2 /* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
 *
 *	@(#)tcp_timer.c	8.1 (Berkeley) 6/10/93
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp_seq.h>

/*
 * Locks used to protect struct members in this file:
 *	T	tcp_timer_mtx		global tcp timer data structures
 */

int	tcp_always_keepalive;
int	tcp_keepidle;
int	tcp_keepintvl;
int	tcp_maxpersistidle;	/* max idle time in persist */
int	tcp_maxidle;		/* [T] max idle time for keep alive */

/*
 * Time to delay the ACK.  This is initialized in tcp_init(), unless
 * it's patched.
 */
int	tcp_delack_msecs;

void	tcp_timer_rexmt(void *);
void	tcp_timer_persist(void *);
void	tcp_timer_keep(void *);
void	tcp_timer_2msl(void *);
void	tcp_timer_delack(void *);

/*
 * Timer callout table.  NOTE(review): entries appear to be indexed by the
 * TCPT_* timer constants (tcp_timer_enter() below uses the timer number as
 * an index), so the order here must match tcp_timer.h — confirm there.
 */
const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS] = {
	tcp_timer_rexmt,
	tcp_timer_persist,
	tcp_timer_keep,
	tcp_timer_2msl,
	tcp_timer_delack,
};

/*
 * Timer state initialization, called from tcp_init().
 */
void
tcp_timer_init(void)
{

	/* Only fill in defaults that have not been set (patched) already. */
	if (tcp_keepidle == 0)
		tcp_keepidle = TCPTV_KEEP_IDLE;

	if (tcp_keepintvl == 0)
		tcp_keepintvl = TCPTV_KEEPINTVL;

	if (tcp_maxpersistidle == 0)
		tcp_maxpersistidle = TCPTV_KEEP_IDLE;

	if (tcp_delack_msecs == 0)
		tcp_delack_msecs = TCP_DELACK_MSECS;
}

/*
 * Common prologue for the timer callouts below: take the shared net lock
 * and a referenced, locked socket, then validate that this firing of
 * `timer' is still wanted.  Returns 0 with *so and *tp set on success.
 * Returns -1 when the callout must do nothing: socket or tcpcb is gone,
 * the timeout was canceled (TF_TIMER flag cleared), or it has been
 * rescheduled (timeout_pending).  On failure *so may be NULL; callers
 * still pass it to tcp_timer_leave(), which must cope with that.
 */
static inline int
tcp_timer_enter(struct inpcb *inp, struct socket **so, struct tcpcb **tp,
    u_int timer)
{
	KASSERT(timer < TCPT_NTIMERS);

	NET_LOCK_SHARED();
	*so = in_pcbsolock_ref(inp);
	if (*so == NULL) {
		*tp = NULL;
		return -1;
	}
	*tp = intotcpcb(inp);
	/* Ignore canceled timeouts or timeouts that have been rescheduled. */
	if (*tp == NULL || !ISSET((*tp)->t_flags, TF_TIMER << timer) ||
	    timeout_pending(&(*tp)->t_timer[timer]))
		return -1;
	/* Claim this firing: clear the armed flag for this timer. */
	CLR((*tp)->t_flags, TF_TIMER << timer);

	return 0;
}

/*
 * Epilogue matching tcp_timer_enter(): release the socket lock and its
 * reference, drop the net lock, and release the inpcb reference that was
 * held while the timeout was armed.
 */
static inline void
tcp_timer_leave(struct inpcb *inp, struct socket *so)
{
	in_pcbsounlock_rele(inp, so);
	NET_UNLOCK_SHARED();
	in_pcbunref(inp);
}

/*
 * Callout to process delayed ACKs for a TCPCB.
 */
void
tcp_timer_delack(void *arg)
{
	struct inpcb *inp = arg;
	struct socket *so;
	struct tcpcb *otp = NULL, *tp;
	short ostate;

	/*
	 * If tcp_output() wasn't able to transmit the ACK
	 * for whatever reason, it will restart the delayed
	 * ACK callout.
	 */
	if (tcp_timer_enter(inp, &so, &tp, TCPT_DELACK))
		goto out;

	/* Remember pre-timer state for tracing when socket debugging is on. */
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	/* Force an immediate ACK on the next output. */
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_DELACK, 0);
 out:
	tcp_timer_leave(inp, so);
}

/*
 * Tcp protocol timeout routine called every 500 ms.
 * Updates the timers in all active tcb's and
 * causes finite state machine actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	mtx_enter(&tcp_timer_mtx);
	tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
	tcp_iss += TCP_ISSINCR2/PR_SLOWHZ;		/* increment iss */
	mtx_leave(&tcp_timer_mtx);
}

/*
 * Cancel all timers for TCP tp.
 */
void
tcp_canceltimers(struct tcpcb *tp)
{
	int i;

	for (i = 0; i < TCPT_NTIMERS; i++)
		TCP_TIMER_DISARM(tp, i);
}

/*
 * Per-shift backoff multipliers for the retransmission timeout,
 * indexed by t_rxtshift; capped at 64 for the later retransmissions.
 */
const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

const int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 */

void	tcp_timer_freesack(struct tcpcb *);

/*
 * Return every SACK hole on tp's list to sackhl_pool and clear the list.
 */
void
tcp_timer_freesack(struct tcpcb *tp)
{
	struct sackhole *p, *q;

	/*
	 * Free SACK holes for 2MSL and REXMT timers.
	 */
	q = tp->snd_holes;
	while (q != NULL) {
		p = q;
		q = q->next;
		pool_put(&sackhl_pool, p);
	}
	tp->snd_holes = 0;
}

/*
 * Retransmission timeout.  Finishes deferred path-MTU-discovery work if
 * one is pending, drops the connection after TCP_MAXRXTSHIFT consecutive
 * timeouts, and otherwise backs off the RTO, collapses the congestion
 * window to one segment and retransmits from snd_una.
 */
void
tcp_timer_rexmt(void *arg)
{
	struct inpcb *inp = arg;
	struct socket *so;
	struct tcpcb *otp = NULL, *tp;
	short ostate;
	uint32_t rto;

	if (tcp_timer_enter(inp, &so, &tp, TCPT_REXMT))
		goto out;

	/*
	 * A "packet too big" ICMP arrived for a segment still in flight:
	 * process the deferred MTU update instead of retransmitting now.
	 */
	if ((tp->t_flags & TF_PMTUD_PEND) &&
	    SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
	    SEQ_LT(tp->t_pmtud_th_seq, (int)(tp->snd_una + tp->t_maxseg))) {
		struct sockaddr_in sin;
		struct icmp icmp;

		/* TF_PMTUD_PEND is set in tcp_ctlinput() which is IPv4 only */
		KASSERT(!ISSET(inp->inp_flags, INP_IPV6));
		tp->t_flags &= ~TF_PMTUD_PEND;

		/* XXX create fake icmp message with relevant entries */
		icmp.icmp_nextmtu = tp->t_pmtud_nextmtu;
		icmp.icmp_ip.ip_len = tp->t_pmtud_ip_len;
		icmp.icmp_ip.ip_hl = tp->t_pmtud_ip_hl;
		icmp.icmp_ip.ip_dst = inp->inp_faddr;
		icmp_mtudisc(&icmp, inp->inp_rtableid);

		/*
		 * Notify all connections to the same peer about
		 * new mss and trigger retransmit.
		 */
		bzero(&sin, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = inp->inp_faddr;
		in_pcbnotifyall(&tcbtable, &sin, inp->inp_rtableid, EMSGSIZE,
		    tcp_mtudisc);
		goto out;
	}

	tcp_timer_freesack(tp);
	/* Too many consecutive timeouts: give up on the connection. */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		tcpstat_inc(tcps_timeoutdrop);
		tp = tcp_drop(tp, tp->t_softerror ?
		    tp->t_softerror : ETIMEDOUT);
		goto out;
	}
	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcpstat_inc(tcps_rexmttimeo);
	/* Back off the RTO exponentially, bounded by t_rttmin/TCPTV_REXMTMAX. */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	TCPT_RANGESET(tp->t_rxtcur,
	    rto * tcp_backoff[tp->t_rxtshift],
	    tp->t_rttmin, TCPTV_REXMTMAX);
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

	/*
	 * If we are losing and we are trying path MTU discovery,
	 * try turning it off.  This will avoid black holes in
	 * the network which suppress or fail to send "packet
	 * too big" ICMP messages.  We should ideally do
	 * lots more sophisticated searching to find the right
	 * value here...
	 */
	if (ip_mtudisc &&
	    TCPS_HAVEESTABLISHED(tp->t_state) &&
	    tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
		struct rtentry *rt = NULL;

		/* No data to send means path mtu is not a problem */
		if (!READ_ONCE(so->so_snd.sb_cc))
			goto leave;

		rt = in_pcbrtentry(inp);
		/* Check if path MTU discovery is disabled already */
		if (rt && (rt->rt_flags & RTF_HOST) &&
		    (rt->rt_locks & RTV_MTU))
			goto leave;

		rt = NULL;
		switch(tp->pf) {
#ifdef INET6
		case PF_INET6:
			/*
			 * We can not turn off path MTU for IPv6.
			 * Do nothing for now, maybe lower to
			 * minimum MTU.
			 */
			break;
#endif
		case PF_INET:
			rt = icmp_mtudisc_clone(inp->inp_faddr,
			    inp->inp_rtableid, 0);
			break;
		}
		if (rt != NULL) {
			/* Disable path MTU discovery */
			if ((rt->rt_locks & RTV_MTU) == 0) {
				rt->rt_locks |= RTV_MTU;
				in_rtchange(inp, 0);
			}

			rtfree(rt);
		}
 leave:
		;
	}

	/*
	 * If losing, let the lower level know and try for
	 * a better route.  Also, if we backed off this far,
	 * our srtt estimate is probably bogus.  Clobber it
	 * so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
		in_losing(inp);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Retransmit from the oldest unacknowledged byte. */
	tp->snd_nxt = tp->snd_una;
	/*
	 * Note:  We overload snd_last to function also as the
	 * snd_last variable described in RFC 2582
	 */
	tp->snd_last = tp->snd_max;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;
#ifdef TCP_ECN
	/*
	 * if ECN is enabled, there might be a broken firewall which
	 * blocks ecn packets.  fall back to non-ecn.
	 */
	if ((tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED)
	    && atomic_load_int(&tcp_do_ecn) && !(tp->t_flags & TF_DISABLE_ECN))
		tp->t_flags |= TF_DISABLE_ECN;
#endif
	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */
	{
		u_long win;

		win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
		if (win < 2)
			win = 2;
		tp->snd_cwnd = tp->t_maxseg;
		tp->snd_ssthresh = win * tp->t_maxseg;
		tp->t_dupacks = 0;
#ifdef TCP_ECN
		tp->snd_last = tp->snd_max;
		tp->t_flags |= TF_SEND_CWR;
#endif
#if 1 /* TCP_ECN */
		tcpstat_inc(tcps_cwr_timeout);
#endif
	}
	(void) tcp_output(tp);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_REXMT, 0);
 out:
	tcp_timer_leave(inp, so);
}

/*
 * Persist timeout: periodically probe a peer that advertises a zero
 * window so we learn when it opens again.  Does nothing while the
 * retransmit timer is armed, since the two are mutually exclusive.
 */
void
tcp_timer_persist(void *arg)
{
	struct inpcb *inp = arg;
	struct socket *so;
	struct tcpcb *otp = NULL, *tp;
	short ostate;
	uint64_t now;
	uint32_t rto;

	if (tcp_timer_enter(inp, &so, &tp, TCPT_PERSIST))
		goto out;

	/* Retransmission in progress takes precedence over persisting. */
	if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
		goto out;

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcpstat_inc(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	now = tcp_now();
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    ((now - tp->t_rcvtime) >= tcp_maxpersistidle ||
	    (now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
		tcpstat_inc(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	tcp_setpersist(tp);
	/*
	 * t_force is set around the output call so a probe segment goes
	 * out despite the closed window — see tcp_output().
	 */
	tp->t_force = 1;
	(void) tcp_output(tp);
	tp->t_force = 0;
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_PERSIST, 0);
 out:
	tcp_timer_leave(inp, so);
}

/*
 * Keepalive timeout: drop connections that never reached the established
 * state; on established ones with keepalive enabled, drop after too much
 * idle time or send a probe, otherwise just re-arm the timer.
 */
void
tcp_timer_keep(void *arg)
{
	struct inpcb *inp = arg;
	struct socket *so;
	struct tcpcb *otp = NULL, *tp;
	short ostate;

	if (tcp_timer_enter(inp, &so, &tp, TCPT_KEEP))
		goto out;

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcpstat_inc(tcps_keeptimeo);
	/* Connection-establishment timer: handshake took too long. */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) {
		tcpstat_inc(tcps_keepdrops);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	if ((atomic_load_int(&tcp_always_keepalive) ||
	    so->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		int maxidle;
		uint64_t now;

		maxidle = READ_ONCE(tcp_maxidle);
		now = tcp_now();
		/* All probes went unanswered: declare the peer dead. */
		if ((maxidle > 0) &&
		    ((now - tp->t_rcvtime) >= tcp_keepidle + maxidle)) {
			tcpstat_inc(tcps_keepdrops);
			tp = tcp_drop(tp, ETIMEDOUT);
			goto out;
		}
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		tcpstat_inc(tcps_keepprobe);
		tcp_respond(tp, mtod(tp->t_template, caddr_t),
		    NULL, tp->rcv_nxt, tp->snd_una - 1, 0, 0, now);
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
	} else
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_KEEP, 0);
 out:
	tcp_timer_leave(inp, so);
}

/*
 * 2MSL timeout: close a connection in TIME_WAIT, or one in another state
 * that has been idle longer than tcp_maxidle; otherwise re-arm the timer
 * for another tcp_keepintvl interval.
 */
void
tcp_timer_2msl(void *arg)
{
	struct inpcb *inp = arg;
	struct socket *so;
	struct tcpcb *otp = NULL, *tp;
	short ostate;
	uint64_t now;
	int maxidle;

	if (tcp_timer_enter(inp, &so, &tp, TCPT_2MSL))
		goto out;

	if (so->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcp_timer_freesack(tp);

	maxidle = READ_ONCE(tcp_maxidle);
	now = tcp_now();
	if (tp->t_state != TCPS_TIME_WAIT &&
	    ((maxidle == 0) || ((now - tp->t_rcvtime) <= maxidle)))
		TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
	else
		tp = tcp_close(tp);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_2MSL, 0);
 out:
	tcp_timer_leave(inp, so);
}

/*
 * Final callout: return the tcpcb to its pool once no timer can touch it.
 */
void
tcp_timer_reaper(void *arg)
{
	struct tcpcb *tp = arg;

	/*
	 * This timer is necessary to delay the pool_put() after all timers
	 * have finished, even if they were sleeping to grab the net lock.
	 * Putting the pool_put() in a timer is sufficient as all timers run
	 * from the same timeout thread.  Note that neither softnet thread nor
	 * user process may access the tcpcb after arming the reaper timer.
	 * Freeing may run in parallel as it does not grab the net lock.
	 */
	pool_put(&tcpcb_pool, tp);
}