xref: /openbsd-src/sys/netinet/tcp_timer.c (revision 8ead0783a05eee83ab02af2c7b14b10fbcdce47d)
1 /*	$OpenBSD: tcp_timer.c,v 1.59 2017/10/25 12:38:21 job Exp $	*/
2 /*	$NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)tcp_timer.c	8.1 (Berkeley) 6/10/93
33  */
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/mbuf.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/protosw.h>
41 #include <sys/kernel.h>
42 #include <sys/pool.h>
43 
44 #include <net/route.h>
45 
46 #include <netinet/in.h>
47 #include <netinet/ip.h>
48 #include <netinet/in_pcb.h>
49 #include <netinet/ip_var.h>
50 #include <netinet/tcp.h>
51 #include <netinet/tcp_fsm.h>
52 #include <netinet/tcp_timer.h>
53 #include <netinet/tcp_var.h>
54 #include <netinet/ip_icmp.h>
55 #include <netinet/tcp_seq.h>
56 
int	tcp_always_keepalive;	/* nonzero: probe all conns, not only SO_KEEPALIVE */
int	tcp_keepidle;		/* idle time before first keepalive probe */
int	tcp_keepintvl;		/* interval between keepalive probes */
int	tcp_maxpersistidle;	/* max idle time in persist */
int	tcp_maxidle;		/* probe budget; recomputed each tcp_slowtimo() tick */

/*
 * Time to delay the ACK.  This is initialized in tcp_init(), unless
 * it's patched.
 */
int	tcp_delack_ticks;
68 
void	tcp_timer_rexmt(void *);
void	tcp_timer_persist(void *);
void	tcp_timer_keep(void *);
void	tcp_timer_2msl(void *);

/*
 * Per-timer callout handlers, indexed by timer type; the order here
 * presumably must match the TCPT_* constants (TODO confirm against
 * tcp_timer.h) and the table size is fixed at TCPT_NTIMERS.
 */
const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS] = {
	tcp_timer_rexmt,
	tcp_timer_persist,
	tcp_timer_keep,
	tcp_timer_2msl,
};
80 
81 /*
82  * Timer state initialization, called from tcp_init().
83  */
84 void
85 tcp_timer_init(void)
86 {
87 
88 	if (tcp_keepidle == 0)
89 		tcp_keepidle = TCPTV_KEEP_IDLE;
90 
91 	if (tcp_keepintvl == 0)
92 		tcp_keepintvl = TCPTV_KEEPINTVL;
93 
94 	if (tcp_maxpersistidle == 0)
95 		tcp_maxpersistidle = TCPTV_KEEP_IDLE;
96 
97 	if (tcp_delack_ticks == 0)
98 		tcp_delack_ticks = TCP_DELACK_TICKS;
99 }
100 
101 /*
102  * Callout to process delayed ACKs for a TCPCB.
103  */
104 void
105 tcp_delack(void *arg)
106 {
107 	struct tcpcb *tp = arg;
108 
109 	/*
110 	 * If tcp_output() wasn't able to transmit the ACK
111 	 * for whatever reason, it will restart the delayed
112 	 * ACK callout.
113 	 */
114 	NET_LOCK();
115 	if (tp->t_flags & TF_DEAD)
116 		goto out;
117 	tp->t_flags |= TF_ACKNOW;
118 	(void) tcp_output(tp);
119  out:
120 	NET_UNLOCK();
121 }
122 
123 /*
124  * Tcp protocol timeout routine called every 500 ms.
125  * Updates the timers in all active tcb's and
126  * causes finite state machine actions if timers expire.
127  */
128 void
129 tcp_slowtimo(void)
130 {
131 	NET_ASSERT_LOCKED();
132 
133 	tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
134 	tcp_iss += TCP_ISSINCR2/PR_SLOWHZ;		/* increment iss */
135 	tcp_now++;					/* for timestamps */
136 }
137 
138 /*
139  * Cancel all timers for TCP tp.
140  */
141 void
142 tcp_canceltimers(struct tcpcb *tp)
143 {
144 	int i;
145 
146 	for (i = 0; i < TCPT_NTIMERS; i++)
147 		TCP_TIMER_DISARM(tp, i);
148 }
149 
/*
 * Exponential retransmit backoff multipliers, indexed by t_rxtshift
 * (0..TCP_MAXRXTSHIFT) and capped at 64.
 */
int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
154 
155 /*
156  * TCP timer processing.
157  */
158 
159 void	tcp_timer_freesack(struct tcpcb *);
160 
161 void
162 tcp_timer_freesack(struct tcpcb *tp)
163 {
164 	struct sackhole *p, *q;
165 	/*
166 	 * Free SACK holes for 2MSL and REXMT timers.
167 	 */
168 	q = tp->snd_holes;
169 	while (q != NULL) {
170 		p = q;
171 		q = q->next;
172 		pool_put(&sackhl_pool, p);
173 	}
174 	tp->snd_holes = 0;
175 }
176 
/*
 * Retransmission timer.  Either finish a pending path-MTU discovery
 * transaction for this connection, or back off the RTO and retransmit
 * everything from snd_una, collapsing the congestion window.
 */
void
tcp_timer_rexmt(void *arg)
{
	struct tcpcb *tp = arg;
	uint32_t rto;

	NET_LOCK();
	/* PCB is being torn down; nothing to do. */
	if (tp->t_flags & TF_DEAD)
		goto out;

	/*
	 * A PMTU probe is outstanding and the probed sequence is still
	 * within the unacknowledged window: treat the timeout like a
	 * "packet too big" indication instead of a loss.
	 */
	if ((tp->t_flags & TF_PMTUD_PEND) && tp->t_inpcb &&
	    SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
	    SEQ_LT(tp->t_pmtud_th_seq, (int)(tp->snd_una + tp->t_maxseg))) {
		struct sockaddr_in sin;
		struct icmp icmp;

		tp->t_flags &= ~TF_PMTUD_PEND;

		/* XXX create fake icmp message with relevant entries */
		icmp.icmp_nextmtu = tp->t_pmtud_nextmtu;
		icmp.icmp_ip.ip_len = tp->t_pmtud_ip_len;
		icmp.icmp_ip.ip_hl = tp->t_pmtud_ip_hl;
		icmp.icmp_ip.ip_dst = tp->t_inpcb->inp_faddr;
		icmp_mtudisc(&icmp, tp->t_inpcb->inp_rtableid);

		/*
		 * Notify all connections to the same peer about
		 * new mss and trigger retransmit.
		 */
		bzero(&sin, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = tp->t_inpcb->inp_faddr;
		in_pcbnotifyall(&tcbtable, sintosa(&sin),
		    tp->t_inpcb->inp_rtableid, EMSGSIZE, tcp_mtudisc);
		goto out;
	}

	tcp_timer_freesack(tp);
	/* Out of retries: drop the connection with the (soft) error seen. */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		tcpstat_inc(tcps_timeoutdrop);
		(void)tcp_drop(tp, tp->t_softerror ?
		    tp->t_softerror : ETIMEDOUT);
		goto out;
	}
	tcpstat_inc(tcps_rexmttimeo);
	/*
	 * Back off: scale the RTO by tcp_backoff[t_rxtshift] and clamp
	 * the result to [t_rttmin, TCPTV_REXMTMAX] before re-arming.
	 */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	TCPT_RANGESET(tp->t_rxtcur,
	    rto * tcp_backoff[tp->t_rxtshift],
	    tp->t_rttmin, TCPTV_REXMTMAX);
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

	/*
	 * If we are losing and we are trying path MTU discovery,
	 * try turning it off.  This will avoid black holes in
	 * the network which suppress or fail to send "packet
	 * too big" ICMP messages.  We should ideally do
	 * lots more sophisticated searching to find the right
	 * value here...
	 */
	if (ip_mtudisc && tp->t_inpcb &&
	    TCPS_HAVEESTABLISHED(tp->t_state) &&
	    tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
		struct inpcb *inp = tp->t_inpcb;
		struct rtentry *rt = NULL;

		/* No data to send means path mtu is not a problem */
		if (!inp->inp_socket->so_snd.sb_cc)
			goto leave;

		rt = in_pcbrtentry(inp);
		/* Check if path MTU discovery is disabled already */
		if (rt && (rt->rt_flags & RTF_HOST) &&
		    (rt->rt_locks & RTV_MTU))
			goto leave;

		rt = NULL;
		switch(tp->pf) {
#ifdef INET6
		case PF_INET6:
			/*
			 * We can not turn off path MTU for IPv6.
			 * Do nothing for now, maybe lower to
			 * minimum MTU.
			 */
			break;
#endif
		case PF_INET:
			rt = icmp_mtudisc_clone(inp->inp_faddr,
			    inp->inp_rtableid);
			break;
		}
		if (rt != NULL) {
			/* Disable path MTU discovery */
			if ((rt->rt_locks & RTV_MTU) == 0) {
				rt->rt_locks |= RTV_MTU;
				in_rtchange(inp, 0);
			}

			rtfree(rt);
		}
	leave:
		;	/* empty statement: a label must precede a statement */
	}

	/*
	 * If losing, let the lower level know and try for
	 * a better route.  Also, if we backed off this far,
	 * our srtt estimate is probably bogus.  Clobber it
	 * so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
		in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Go back N: restart the send window from the oldest unacked byte. */
	tp->snd_nxt = tp->snd_una;
	/*
	 * Note:  We overload snd_last to function also as the
	 * snd_last variable described in RFC 2582
	 */
	tp->snd_last = tp->snd_max;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;
#ifdef TCP_ECN
	/*
	 * if ECN is enabled, there might be a broken firewall which
	 * blocks ecn packets.  fall back to non-ecn.
	 */
	if ((tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED)
	    && tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
		tp->t_flags |= TF_DISABLE_ECN;
#endif
	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */
	{
		u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
		if (win < 2)
			win = 2;
		tp->snd_cwnd = tp->t_maxseg;
		tp->snd_ssthresh = win * tp->t_maxseg;
		tp->t_dupacks = 0;
#ifdef TCP_ECN
		tp->snd_last = tp->snd_max;
		tp->t_flags |= TF_SEND_CWR;
#endif
#if 1 /* TCP_ECN */
		/*
		 * NOTE(review): "#if 1" is always true, so this stat is
		 * bumped even in non-TCP_ECN kernels — looks intentional
		 * upstream, but worth confirming.
		 */
		tcpstat_inc(tcps_cwr_timeout);
#endif
	}
	(void) tcp_output(tp);

 out:
	NET_UNLOCK();
}
361 
/*
 * Persist timer: force out a window probe while the send window is
 * closed, or drop the connection after prolonged silence.
 */
void
tcp_timer_persist(void *arg)
{
	struct tcpcb *tp = arg;
	uint32_t rto;

	NET_LOCK();
	/* Skip if the PCB is dying or the retransmit timer is running. */
	if ((tp->t_flags & TF_DEAD) ||
            TCP_TIMER_ISARMED(tp, TCPT_REXMT)) {
		goto out;
	}
	tcpstat_inc(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    ((tcp_now - tp->t_rcvtime) >= tcp_maxpersistidle ||
	    (tcp_now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
		tcpstat_inc(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * Re-arm via tcp_setpersist(), then set t_force around the
	 * tcp_output() call so a probe segment goes out even though
	 * the window is closed.
	 */
	tcp_setpersist(tp);
	tp->t_force = 1;
	(void) tcp_output(tp);
	tp->t_force = 0;
 out:
	NET_UNLOCK();
}
398 
/*
 * Keepalive timer: drop a connection that never completed its
 * handshake or has stopped answering probes; otherwise, when
 * keepalives apply, probe the idle peer and re-arm.
 */
void
tcp_timer_keep(void *arg)
{
	struct tcpcb *tp = arg;

	NET_LOCK();
	if (tp->t_flags & TF_DEAD)
		goto out;

	tcpstat_inc(tcps_keeptimeo);
	/* Never reached ESTABLISHED: the handshake has timed out. */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		goto dropit;
	/* Probe only when keepalives apply and the conn can still carry data. */
	if ((tcp_always_keepalive ||
	    tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keepidle plus the whole probe budget: give up. */
		if ((tcp_maxidle > 0) &&
		    ((tcp_now - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		tcpstat_inc(tcps_keepprobe);
		tcp_respond(tp, mtod(tp->t_template, caddr_t),
		    NULL, tp->rcv_nxt, tp->snd_una - 1, 0, 0);
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
	} else
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
 out:
	NET_UNLOCK();
	return;

	/* Separate exit path: drop the connection, then release the lock. */
 dropit:
	tcpstat_inc(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_UNLOCK();
}
444 
445 void
446 tcp_timer_2msl(void *arg)
447 {
448 	struct tcpcb *tp = arg;
449 
450 	NET_LOCK();
451 	if (tp->t_flags & TF_DEAD)
452 		goto out;
453 
454 	tcp_timer_freesack(tp);
455 
456 	if (tp->t_state != TCPS_TIME_WAIT &&
457 	    ((tcp_maxidle == 0) || ((tcp_now - tp->t_rcvtime) <= tcp_maxidle)))
458 		TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
459 	else
460 		tp = tcp_close(tp);
461 
462  out:
463 	NET_UNLOCK();
464 }
465