xref: /openbsd-src/sys/netinet/tcp_timer.c (revision 3a3fbb3f2e2521ab7c4a56b7ff7462ebd9095ec5)
1 /*	$OpenBSD: tcp_timer.c,v 1.23 2002/01/02 20:35:40 deraadt Exp $	*/
2 /*	$NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)tcp_timer.c	8.1 (Berkeley) 6/10/93
37  */
38 
39 #ifndef TUBA_INCLUDE
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/protosw.h>
46 
47 #include <net/route.h>
48 
49 #include <netinet/in.h>
50 #include <netinet/in_systm.h>
51 #include <netinet/ip.h>
52 #include <netinet/in_pcb.h>
53 #include <netinet/ip_var.h>
54 #include <netinet/tcp.h>
55 #include <netinet/tcp_fsm.h>
56 #include <netinet/tcp_timer.h>
57 #include <netinet/tcp_var.h>
58 #include <netinet/ip_icmp.h>
59 
int	tcp_keepidle = TCPTV_KEEP_IDLE;		/* idle time before first keepalive probe */
int	tcp_keepintvl = TCPTV_KEEPINTVL;	/* interval between keepalive probes */
int	tcp_maxpersistidle = TCPTV_KEEP_IDLE;	/* max idle time in persist */
int	tcp_maxidle;	/* keepalive probe budget; recomputed each tick in tcp_slowtimo() */
64 #endif /* TUBA_INCLUDE */
65 /*
66  * Fast timeout routine for processing delayed acks
67  */
68 void
69 tcp_fasttimo()
70 {
71 	register struct inpcb *inp;
72 	register struct tcpcb *tp;
73 	int s;
74 
75 	s = splsoftnet();
76 	inp = tcbtable.inpt_queue.cqh_first;
77 	if (inp)						/* XXX */
78 	for (; inp != (struct inpcb *)&tcbtable.inpt_queue;
79 	    inp = inp->inp_queue.cqe_next) {
80 		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
81 		    (tp->t_flags & TF_DELACK)) {
82 			tp->t_flags &= ~TF_DELACK;
83 			tp->t_flags |= TF_ACKNOW;
84 			tcpstat.tcps_delack++;
85 			(void) tcp_output(tp);
86 		}
87 	}
88 	splx(s);
89 }
90 
/*
 * Tcp protocol timeout routine called every 500 ms.
 * Updates the timers in all active tcb's and
 * causes finite state machine actions if timers expire.
 */
void
tcp_slowtimo()
{
	register struct inpcb *ip, *ipnxt;
	register struct tcpcb *tp;
	int s;
	register long i;

	s = splsoftnet();
	/* Refresh the keepalive probe budget from the current tunables. */
	tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
	/*
	 * Search through tcb's and update active timers.
	 */
	ip = tcbtable.inpt_queue.cqh_first;
	if (ip == (struct inpcb *)0) {				/* XXX */
		splx(s);
		return;
	}
	for (; ip != (struct inpcb *)&tcbtable.inpt_queue; ip = ipnxt) {
		/*
		 * Grab the successor before processing: the PRU_SLOWTIMO
		 * action below may delete this pcb out from under us.
		 */
		ipnxt = ip->inp_queue.cqe_next;
		tp = intotcpcb(ip);
		/* Listening sockets run no timers. */
		if (tp == 0 || tp->t_state == TCPS_LISTEN)
			continue;
		for (i = 0; i < TCPT_NTIMERS; i++) {
			if (tp->t_timer[i] && --tp->t_timer[i] == 0) {
				/* Timer expired; the timer index is smuggled
				 * through the (struct mbuf *) argument. */
				(void) tcp_usrreq(tp->t_inpcb->inp_socket,
				    PRU_SLOWTIMO, (struct mbuf *)0,
				    (struct mbuf *)i, (struct mbuf *)0);
				/* XXX NOT MP SAFE */
				/*
				 * Detect whether the timer action freed this
				 * pcb by checking the queue links: either
				 * ipnxt is the queue head but ip is no longer
				 * the tail, or ipnxt's back link no longer
				 * points at ip.  If so, skip the idle/rtt
				 * updates on the now-freed tcb.
				 */
				if ((ipnxt == (void *)&tcbtable.inpt_queue &&
				    tcbtable.inpt_queue.cqh_last != ip) ||
				    ipnxt->inp_queue.cqe_prev != ip)
					goto tpgone;
			}
		}
		/* Age the connection and any in-progress RTT measurement. */
		tp->t_idle++;
		if (tp->t_rtt)
			tp->t_rtt++;
tpgone:
		;
	}
#ifdef TCP_COMPAT_42
	tcp_iss += TCP_ISSINCR/PR_SLOWHZ;		/* increment iss */
	if ((int)tcp_iss < 0)
		tcp_iss = 0;				/* XXX */
#endif /* TCP_COMPAT_42 */
	tcp_now++;					/* for timestamps */
	splx(s);
}
145 #ifndef TUBA_INCLUDE
146 
147 /*
148  * Cancel all timers for TCP tp.
149  */
150 void
151 tcp_canceltimers(tp)
152 	struct tcpcb *tp;
153 {
154 	register int i;
155 
156 	for (i = 0; i < TCPT_NTIMERS; i++)
157 		tp->t_timer[i] = 0;
158 }
159 
/*
 * Retransmit backoff multipliers, indexed by t_rxtshift
 * (0..TCP_MAXRXTSHIFT): exponential doubling, capped at 64.
 */
int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

int tcp_totbackoff = 511;	/* sum of tcp_backoff[] */
164 
/*
 * TCP timer processing: invoked (via tcp_usrreq PRU_SLOWTIMO) when
 * tp->t_timer[timer] has counted down to zero in tcp_slowtimo().
 *
 * Returns tp, or whatever tcp_drop()/tcp_close() returned when the
 * connection was torn down (NOTE(review): presumably NULL once the tcb
 * is freed -- confirm against tcp_close(); callers must not assume tp
 * is still valid).
 */
struct tcpcb *
tcp_timers(tp, timer)
	register struct tcpcb *tp;
	int timer;
{
	short rto;
#ifdef TCP_SACK
	struct sackhole *p, *q;
	/*
	 * Free SACK holes for 2MSL and REXMT timers.
	 */
	if (timer == TCPT_2MSL || timer == TCPT_REXMT) {
		q = p = tp->snd_holes;
		while (p != 0) {
			q = p->next;
			free(p, M_PCB);
			p = q;
		}
		tp->snd_holes = 0;
#if defined(TCP_SACK) && defined(TCP_FACK)
		/* Reset FACK bookkeeping along with the hole list. */
		tp->snd_fack = tp->snd_una;
		tp->retran_data = 0;
		tp->snd_awnd = 0;
#endif /* TCP_FACK */
	}
#endif /* TCP_SACK */

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT, delete connection
	 * control block.  Otherwise, check again in a bit.
	 */
	case TCPT_2MSL:
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_idle <= tcp_maxidle)
			tp->t_timer[TCPT_2MSL] = tcp_keepintvl;
		else
			tp = tcp_close(tp);
		break;

	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
			/*
			 * Retried too many times: drop the connection,
			 * reporting any pending soft error (e.g. an ICMP
			 * error) in preference to plain ETIMEDOUT.
			 */
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			tcpstat.tcps_timeoutdrop++;
			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);
			break;
		}
		tcpstat.tcps_rexmttimeo++;
		rto = TCP_REXMTVAL(tp);
		if (rto < tp->t_rttmin)
			rto = tp->t_rttmin;
		/*
		 * Apply the exponential backoff multiplier, clamping the
		 * result into [t_rttmin, TCPTV_REXMTMAX].
		 */
		TCPT_RANGESET((long) tp->t_rxtcur,
		    rto * tcp_backoff[tp->t_rxtshift],
		    tp->t_rttmin, TCPTV_REXMTMAX);
		tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

		/*
		 * If we are losing and we are trying path MTU discovery,
		 * try turning it off.  This will avoid black holes in
		 * the network which suppress or fail to send "packet
		 * too big" ICMP messages.  We should ideally do
		 * lots more sophisticated searching to find the right
		 * value here...
		 */
		if (ip_mtudisc && tp->t_inpcb &&
		    TCPS_HAVEESTABLISHED(tp->t_state) &&
		    tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
			struct inpcb *inp = tp->t_inpcb;
			struct rtentry *rt = NULL;
			struct sockaddr_in sin;

			/* No data to send means path mtu is not a problem */
			if (!inp->inp_socket->so_snd.sb_cc)
				goto out;

			rt = in_pcbrtentry(inp);
			/* Check if path MTU discovery is disabled already */
			if (rt && (rt->rt_flags & RTF_HOST) &&
			    (rt->rt_rmx.rmx_locks & RTV_MTU))
				goto out;

			rt = NULL;
			switch(tp->pf) {
#ifdef INET6
			case PF_INET6:
				/*
				 * We can not turn off path MTU for IPv6.
				 * Do nothing for now, maybe lower to
				 * minimum MTU.
				 */
				break;
#endif
			case PF_INET:
				/* Clone a host route so the MTU lock only
				 * affects this destination. */
				bzero(&sin, sizeof(struct sockaddr_in));
				sin.sin_family = AF_INET;
				sin.sin_len = sizeof(struct sockaddr_in);
				sin.sin_addr = inp->inp_faddr;
				rt = icmp_mtudisc_clone(sintosa(&sin));
				break;
			}
			if (rt != NULL) {
				/* Disable path MTU discovery */
				if ((rt->rt_rmx.rmx_locks & RTV_MTU) == 0) {
					rt->rt_rmx.rmx_locks |= RTV_MTU;
					in_rtchange(inp, 0);
				}

				rtfree(rt);
			}
			out:
				;
		}

		/*
		 * If losing, let the lower level know and try for
		 * a better route.  Also, if we backed off this far,
		 * our srtt estimate is probably bogus.  Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
			in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		/* Retransmit from the oldest unacknowledged byte. */
		tp->snd_nxt = tp->snd_una;
#if defined(TCP_SACK)
		/*
		 * Note:  We overload snd_last to function also as the
		 * snd_last variable described in RFC 2582
		 */
		tp->snd_last = tp->snd_max;
#endif /* TCP_SACK */
		/*
		 * If timing a segment in this window, stop the timer.
		 */
		tp->t_rtt = 0;
		/*
		 * Close the congestion window down to one segment
		 * (we'll open it by one segment for each ack we get).
		 * Since we probably have a window's worth of unacked
		 * data accumulated, this "slow start" keeps us from
		 * dumping all that data as back-to-back packets (which
		 * might overwhelm an intermediate gateway).
		 *
		 * There are two phases to the opening: Initially we
		 * open by one mss on each ack.  This makes the window
		 * size increase exponentially with time.  If the
		 * window is larger than the path can handle, this
		 * exponential growth results in dropped packet(s)
		 * almost immediately.  To get more time between
		 * drops but still "push" the network to take advantage
		 * of improving conditions, we switch from exponential
		 * to linear window opening at some threshhold size.
		 * For a threshhold, we use half the current window
		 * size, truncated to a multiple of the mss.
		 *
		 * (the minimum cwnd that will give us exponential
		 * growth is 2 mss.  We don't allow the threshhold
		 * to go below this.)
		 */
		{
		u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
		if (win < 2)
			win = 2;
		tp->snd_cwnd = tp->t_maxseg;
		tp->snd_ssthresh = win * tp->t_maxseg;
		tp->t_dupacks = 0;
		}
		(void) tcp_output(tp);
		break;

	/*
	 * Persistance timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed.  After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 */
		rto = TCP_REXMTVAL(tp);
		if (rto < tp->t_rttmin)
			rto = tp->t_rttmin;
		if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (tp->t_idle >= tcp_maxpersistidle ||
		     tp->t_idle >= rto * tcp_totbackoff)) {
			tcpstat.tcps_persistdrop++;
			tp = tcp_drop(tp, ETIMEDOUT);
			break;
		}
		/* Rearm the persist timer, then force one probe byte out
		 * despite the closed window. */
		tcp_setpersist(tp);
		tp->t_force = 1;
		(void) tcp_output(tp);
		tp->t_force = 0;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		tcpstat.tcps_keeptimeo++;
		if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
			goto dropit;
		if (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE &&
		    tp->t_state <= TCPS_CLOSING) {
			/* Idle past the probe budget: give up. */
			if (tp->t_idle >= tcp_keepidle + tcp_maxidle)
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			tcpstat.tcps_keepprobe++;
#ifdef TCP_COMPAT_42
			/*
			 * The keepalive packet must have nonzero length
			 * to get a 4.2 host to respond.
			 */
			tcp_respond(tp,
				mtod(tp->t_template, caddr_t),
				(struct mbuf *)NULL,
				tp->rcv_nxt - 1, tp->snd_una - 1, 0);
#else
			tcp_respond(tp,
				mtod(tp->t_template, caddr_t),
				(struct mbuf *)NULL,
				tp->rcv_nxt, tp->snd_una - 1, 0);
#endif
			tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
		} else
			tp->t_timer[TCPT_KEEP] = tcp_keepidle;
		break;
	dropit:
		tcpstat.tcps_keepdrops++;
		tp = tcp_drop(tp, ETIMEDOUT);
		break;
	}
	return (tp);
}
430 #endif /* TUBA_INCLUDE */
431