xref: /netbsd-src/sys/netinet/tcp_sack.c (revision c9496f6b604074a9451a67df576a5b423068e71e)
1 /* $NetBSD: tcp_sack.c,v 1.33 2016/12/13 08:29:03 ozaki-r Exp $ */
2 
3 /*
4  * Copyright (c) 2005 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Kentaro A. Kurahone.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
34  *	The Regents of the University of California.  All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 4. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *	@(#)tcp_sack.c	8.12 (Berkeley) 5/24/95
61  * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
62  */
63 
64 /*
65  *	@@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
66  *
67  * NRL grants permission for redistribution and use in source and binary
68  * forms, with or without modification, of the software and documentation
69  * created at NRL provided that the following conditions are met:
70  *
71  * 1. Redistributions of source code must retain the above copyright
72  *    notice, this list of conditions and the following disclaimer.
73  * 2. Redistributions in binary form must reproduce the above copyright
74  *    notice, this list of conditions and the following disclaimer in the
75  *    documentation and/or other materials provided with the distribution.
76  * 3. All advertising materials mentioning features or use of this software
77  *    must display the following acknowledgements:
78  *	This product includes software developed by the University of
79  *	California, Berkeley and its contributors.
80  *	This product includes software developed at the Information
81  *	Technology Division, US Naval Research Laboratory.
82  * 4. Neither the name of the NRL nor the names of its contributors
83  *    may be used to endorse or promote products derived from this software
84  *    without specific prior written permission.
85  *
86  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
87  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
88  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
89  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
90  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
91  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
92  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
93  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
94  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
95  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
96  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97  *
98  * The views and conclusions contained in the software and documentation
99  * are those of the authors and should not be interpreted as representing
100  * official policies, either expressed or implied, of the US Naval
101  * Research Laboratory (NRL).
102  */
103 
104 #include <sys/cdefs.h>
105 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.33 2016/12/13 08:29:03 ozaki-r Exp $");
106 
107 #ifdef _KERNEL_OPT
108 #include "opt_inet.h"
109 #include "opt_inet_csum.h"
110 #include "opt_tcp_debug.h"
111 #include "opt_ddb.h"
112 #endif
113 
114 #include <sys/param.h>
115 #include <sys/systm.h>
116 #include <sys/mbuf.h>
117 #include <sys/protosw.h>
118 #include <sys/socket.h>
119 #include <sys/socketvar.h>
120 #include <sys/errno.h>
121 #include <sys/syslog.h>
122 #include <sys/pool.h>
123 #include <sys/domain.h>
124 #include <sys/kernel.h>
125 
126 #include <net/if.h>
127 #include <net/route.h>
128 #include <net/if_types.h>
129 
130 #include <netinet/in.h>
131 #include <netinet/in_systm.h>
132 #include <netinet/ip.h>
133 #include <netinet/in_pcb.h>
134 #include <netinet/in_var.h>
135 #include <netinet/ip_var.h>
136 
137 #ifdef INET6
138 #ifndef INET
139 #include <netinet/in.h>
140 #endif
141 #include <netinet/ip6.h>
142 #include <netinet6/ip6_var.h>
143 #include <netinet6/in6_pcb.h>
144 #include <netinet6/ip6_var.h>
145 #include <netinet6/in6_var.h>
146 #include <netinet/icmp6.h>
147 #endif
148 
149 #ifndef INET6
150 /* always need ip6.h for IP6_EXTHDR_GET */
151 #include <netinet/ip6.h>
152 #endif
153 
154 #include <netinet/tcp.h>
155 #include <netinet/tcp_fsm.h>
156 #include <netinet/tcp_seq.h>
157 #include <netinet/tcp_timer.h>
158 #include <netinet/tcp_var.h>
159 #include <netinet/tcpip.h>
160 #include <netinet/tcp_debug.h>
161 
/*
 * Pool backing the SACK scoreboard: items are struct sackhole sized.
 * It is set up with pool_init() in tcp_sack_init(); holes are taken from
 * it with pool_get() and handed back with pool_put() in this file.
 */
static struct pool sackhole_pool;
164 
165 void
166 tcp_sack_init(void)
167 {
168 
169 	pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0,
170 	    "sackholepl", NULL, IPL_SOFTNET);
171 }
172 
173 static struct sackhole *
174 sack_allochole(struct tcpcb *tp)
175 {
176 	struct sackhole *hole;
177 
178 	if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
179 	    tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
180 		return NULL;
181 	}
182 	hole = pool_get(&sackhole_pool, PR_NOWAIT);
183 	if (hole == NULL) {
184 		return NULL;
185 	}
186 	tp->snd_numholes++;
187 	tcp_sack_globalholes++;
188 
189 	return hole;
190 }
191 
192 static struct sackhole *
193 sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
194     struct sackhole *prev)
195 {
196 	struct sackhole *hole;
197 
198 	hole = sack_allochole(tp);
199 	if (hole == NULL) {
200 		return NULL;
201 	}
202 	hole->start = hole->rxmit = start;
203 	hole->end = end;
204 	if (prev != NULL) {
205 		TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
206 	} else {
207 		TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
208 	}
209 	return hole;
210 }
211 
212 static struct sackhole *
213 sack_removehole(struct tcpcb *tp, struct sackhole *hole)
214 {
215 	struct sackhole *next;
216 
217 	next = TAILQ_NEXT(hole, sackhole_q);
218 	tp->snd_numholes--;
219 	tcp_sack_globalholes--;
220 	TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
221 	pool_put(&sackhole_pool, hole);
222 
223 	return next;
224 }
225 
226 /*
227  * tcp_new_dsack: record the reception of a duplicated segment.
228  */
229 
230 void
231 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
232 {
233 
234 	if (TCP_SACK_ENABLED(tp)) {
235 		tp->rcv_dsack_block.left = seq;
236 		tp->rcv_dsack_block.right = seq + len;
237 		tp->rcv_sack_flags |= TCPSACK_HAVED;
238 	}
239 }
240 
241 /*
242  * tcp_sack_option: parse the given SACK option and update the scoreboard.
243  */
244 
245 void
246 tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
247     int optlen)
248 {
249 	struct sackblk
250 	    t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
251 	struct sackblk *sack = NULL;
252 	struct sackhole *cur = NULL;
253 	struct sackhole *tmp = NULL;
254 	const char *lp = cp + 2;
255 	int i, j, num_sack_blks;
256 	tcp_seq left, right, acked;
257 
258 	/*
259 	 * If we aren't processing SACK responses, this is not an ACK
260 	 * or the peer sends us a sack option with invalid length, don't
261 	 * update the scoreboard.
262 	 */
263 	if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
264 			(optlen % 8 != 2 || optlen < 10)) {
265 		return;
266 	}
267 
268 	/*
269 	 * If we don't want any SACK holes to be allocated, just return.
270 	 */
271 	if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
272 		return;
273 	}
274 
275 	/* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
276 	if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
277 		return;
278 
279 	/*
280 	 * Extract SACK blocks.
281 	 *
282 	 * Note that t_sack_block is sorted so that we only need to do
283 	 * one pass over the sequence number space. (SACK "fast-path")
284 	 */
285 	num_sack_blks = optlen / 8;
286 	acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
287 	for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
288 		memcpy(&left, lp, sizeof(uint32_t));
289 		memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
290 		left = ntohl(left);
291 		right = ntohl(right);
292 
293 		if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
294 		    SEQ_GEQ(left, right)) {
295 			/* SACK entry that's old, or invalid. */
296 			i--;
297 			num_sack_blks--;
298 			continue;
299 		}
300 
301 		/* Insertion sort. */
302 		for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
303 		    j--) {
304 			t_sack_block[j].left = t_sack_block[j - 1].left;
305 			t_sack_block[j].right = t_sack_block[j - 1].right;
306 		}
307 		t_sack_block[j].left = left;
308 		t_sack_block[j].right = right;
309 	}
310 
311 	/* Update the scoreboard. */
312 	cur = TAILQ_FIRST(&tp->snd_holes);
313 	for (i = 0; i < num_sack_blks; i++) {
314 		sack = &t_sack_block[i];
315 		/*
316 		 * FACK TCP.  Update snd_fack so we can enter Fast
317 		 * Recovery early.
318 		 */
319 		if (SEQ_GEQ(sack->right, tp->snd_fack))
320 			tp->snd_fack = sack->right;
321 
322 		if (TAILQ_EMPTY(&tp->snd_holes)) {
323 			/* First hole. */
324 			cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
325 			if (cur == NULL) {
326 				/* ENOBUFS, bail out*/
327 				return;
328 			}
329 			tp->rcv_lastsack = sack->right;
330 			continue; /* With next sack block */
331 		}
332 
333 		/* Go through the list of holes. */
334 		while (cur) {
335 			if (SEQ_LEQ(sack->right, cur->start))
336 				/* SACKs data before the current hole */
337 				break; /* No use going through more holes */
338 
339 			if (SEQ_GEQ(sack->left, cur->end)) {
340 				/* SACKs data beyond the current hole */
341 				cur = TAILQ_NEXT(cur, sackhole_q);
342 				continue;
343 			}
344 
345 			if (SEQ_LEQ(sack->left, cur->start)) {
346 				/* Data acks at least the beginning of hole */
347 				if (SEQ_GEQ(sack->right, cur->end)) {
348 					/* Acks entire hole, so delete hole */
349 					cur = sack_removehole(tp, cur);
350 					break;
351 				}
352 
353 				/* Otherwise, move start of hole forward */
354 				cur->start = sack->right;
355 				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
356 				break;
357 			}
358 
359 			if (SEQ_GEQ(sack->right, cur->end)) {
360 				/* Move end of hole backward. */
361 				cur->end = sack->left;
362 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
363 				cur = TAILQ_NEXT(cur, sackhole_q);
364 				break;
365 			}
366 
367 			if (SEQ_LT(cur->start, sack->left) &&
368 			    SEQ_GT(cur->end, sack->right)) {
369 				/*
370 				 * ACKs some data in middle of a hole; need to
371 				 * split current hole
372 				 */
373 				tmp = sack_inserthole(tp, sack->right, cur->end,
374 				    cur);
375 				if (tmp == NULL) {
376 					return;
377 				}
378 				tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
379 				cur->end = sack->left;
380 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
381 				cur = tmp;
382 				break;
383 			}
384 		}
385 
386 		/* At this point, we have reached the tail of the list. */
387 		if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
388 			/*
389 			 * Need to append new hole at end.
390 			 */
391 			cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
392 			    NULL);
393 			if (cur == NULL) {
394 				return;
395 			}
396 		}
397 		if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
398 			tp->rcv_lastsack = sack->right;
399 		}
400 	}
401 }
402 
403 /*
404  * tcp_del_sackholes: remove holes covered by a cumulative ACK.
405  */
406 
407 void
408 tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
409 {
410 	/* Max because this could be an older ack that just arrived. */
411 	tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
412 		th->th_ack : tp->snd_una;
413 	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
414 
415 	while (cur) {
416 		if (SEQ_LEQ(cur->end, lastack)) {
417 			cur = sack_removehole(tp, cur);
418 		} else if (SEQ_LT(cur->start, lastack)) {
419 			cur->start = lastack;
420 			if (SEQ_LT(cur->rxmit, cur->start))
421 				cur->rxmit = cur->start;
422 			break;
423 		} else
424 			break;
425 	}
426 }
427 
428 /*
429  * tcp_free_sackholes: clear the scoreboard.
430  */
431 
432 void
433 tcp_free_sackholes(struct tcpcb *tp)
434 {
435 	struct sackhole *sack;
436 
437 	/* Free up the SACK hole list. */
438 	while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
439 		sack_removehole(tp, sack);
440 	}
441 	KASSERT(tp->snd_numholes == 0);
442 }
443 
444 /*
445  * Returns pointer to a sackhole if there are any pending retransmissions;
446  * NULL otherwise.
447  */
448 struct sackhole *
449 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
450 {
451 	struct sackhole *cur = NULL;
452 
453 	if (!TCP_SACK_ENABLED(tp))
454 		return (NULL);
455 
456 	*sack_bytes_rexmt = 0;
457 	TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
458 		if (SEQ_LT(cur->rxmit, cur->end)) {
459 			if (SEQ_LT(cur->rxmit, tp->snd_una)) {
460 				/* old SACK hole */
461 				continue;
462 			}
463 			*sack_bytes_rexmt += (cur->rxmit - cur->start);
464 			break;
465 		}
466 		*sack_bytes_rexmt += (cur->rxmit - cur->start);
467 	}
468 
469 	return (cur);
470 }
471 
472 /*
473  * After a timeout, the SACK list may be rebuilt.  This SACK information
474  * should be used to avoid retransmitting SACKed data.  This function
475  * traverses the SACK list to see if snd_nxt should be moved forward.
476  */
477 void
478 tcp_sack_adjust(struct tcpcb *tp)
479 {
480 	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
481 	struct sackhole *n = NULL;
482 
483 	if (TAILQ_EMPTY(&tp->snd_holes))
484 		return; /* No holes */
485 	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
486 		return; /* We're already beyond any SACKed blocks */
487 
488 	/*
489 	 * Two cases for which we want to advance snd_nxt:
490 	 * i) snd_nxt lies between end of one hole and beginning of another
491 	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
492 	 */
493 	while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
494 		if (SEQ_LT(tp->snd_nxt, cur->end))
495 			return;
496 		if (SEQ_GEQ(tp->snd_nxt, n->start))
497 			cur = n;
498 		else {
499 			tp->snd_nxt = n->start;
500 			return;
501 		}
502 	}
503 	if (SEQ_LT(tp->snd_nxt, cur->end))
504 		return;
505 	tp->snd_nxt = tp->rcv_lastsack;
506 
507 	return;
508 }
509 
510 /*
511  * tcp_sack_numblks: return the number of SACK blocks to send.
512  */
513 
514 int
515 tcp_sack_numblks(const struct tcpcb *tp)
516 {
517 	int numblks;
518 
519 	if (!TCP_SACK_ENABLED(tp)) {
520 		return 0;
521 	}
522 
523 	numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
524 	    tp->t_segqlen;
525 
526 	if (numblks == 0) {
527 		return 0;
528 	}
529 
530 	if (numblks > TCP_SACK_MAX) {
531 		numblks = TCP_SACK_MAX;
532 	}
533 
534 	return numblks;
535 }
536 
537 #if defined(DDB)
538 void sack_dump(const struct tcpcb *);
539 
540 void
541 sack_dump(const struct tcpcb *tp)
542 {
543 	const struct sackhole *cur;
544 
545 	printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
546 	    tp->snd_una, tp->snd_max);
547 	printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
548 	    tp->rcv_lastsack, tp->snd_fack);
549 	printf("numholes=%d\n", tp->snd_numholes);
550 	TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
551 		printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
552 		    cur->start, cur->end, cur->rxmit);
553 	}
554 }
555 #endif /* defined(DDB) */
556