xref: /dflybsd-src/sys/netinet/tcp_sack.c (revision e12d3396c777165504d60d2a1408dcd7cb63660d)
1 /*
2  * Copyright (c) 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Jeffrey M. Hsu.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of The DragonFly Project nor the names of its
17  *    contributors may be used to endorse or promote products derived
18  *    from this software without specific, prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * $DragonFly: src/sys/netinet/tcp_sack.c,v 1.8 2008/08/15 21:37:16 nth Exp $
34  */
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/queue.h>
41 #include <sys/thread.h>
42 #include <sys/types.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 
46 #include <net/if.h>
47 
48 #include <netinet/in.h>
49 #include <netinet/in_systm.h>
50 #include <netinet/ip.h>
51 #include <netinet/in_var.h>
52 #include <netinet/in_pcb.h>
53 #include <netinet/ip_var.h>
54 #include <netinet/tcp.h>
55 #include <netinet/tcp_seq.h>
56 #include <netinet/tcp_var.h>
57 
58 /*
59  * Implemented:
60  *
61  * RFC 2018
62  * RFC 2883
63  * RFC 3517
64  */
65 
66 struct sackblock {
67 	tcp_seq			sblk_start;
68 	tcp_seq			sblk_end;
69 	TAILQ_ENTRY(sackblock)	sblk_list;
70 };
71 
72 #define	MAXSAVEDBLOCKS	8			/* per connection limit */
73 
74 static int insert_block(struct scoreboard *scb,
75 			const struct raw_sackblock *raw_sb, boolean_t *update);
76 
77 static MALLOC_DEFINE(M_SACKBLOCK, "sblk", "sackblock struct");
78 
79 /*
80  * Per-tcpcb initialization.
81  */
82 void
83 tcp_sack_tcpcb_init(struct tcpcb *tp)
84 {
85 	struct scoreboard *scb = &tp->scb;
86 
87 	scb->nblocks = 0;
88 	TAILQ_INIT(&scb->sackblocks);
89 	scb->lastfound = NULL;
90 }
91 
92 /*
93  * Find the SACK block containing or immediately preceding "seq".
94  * The boolean result indicates whether the sequence is actually
95  * contained in the SACK block.
96  */
97 static boolean_t
98 sack_block_lookup(struct scoreboard *scb, tcp_seq seq, struct sackblock **sb)
99 {
100 	struct sackblock *hint = scb->lastfound;
101 	struct sackblock *cur, *last, *prev;
102 
103 	if (TAILQ_EMPTY(&scb->sackblocks)) {
104 		*sb = NULL;
105 		return FALSE;
106 	}
107 
108 	if (hint == NULL) {
109 		/* No hint.  Search from start to end. */
110 		cur = TAILQ_FIRST(&scb->sackblocks);
111 		last = NULL;
112 		prev = TAILQ_LAST(&scb->sackblocks, sackblock_list);
113 	} else  {
114 		if (SEQ_GEQ(seq, hint->sblk_start)) {
115 			/* Search from hint to end of list. */
116 			cur = hint;
117 			last = NULL;
118 			prev = TAILQ_LAST(&scb->sackblocks, sackblock_list);
119 		} else {
120 			/* Search from front of list to hint. */
121 			cur = TAILQ_FIRST(&scb->sackblocks);
122 			last = hint;
123 			prev = TAILQ_PREV(hint, sackblock_list, sblk_list);
124 		}
125 	}
126 
127 	do {
128 		if (SEQ_GT(cur->sblk_end, seq)) {
129 			if (SEQ_GEQ(seq, cur->sblk_start)) {
130 				*sb = scb->lastfound = cur;
131 				return TRUE;
132 			} else {
133 				*sb = scb->lastfound =
134 				    TAILQ_PREV(cur, sackblock_list, sblk_list);
135 				return FALSE;
136 			}
137 		}
138 		cur = TAILQ_NEXT(cur, sblk_list);
139 	} while (cur != last);
140 
141 	*sb = scb->lastfound = prev;
142 	return FALSE;
143 }
144 
145 /*
146  * Allocate a SACK block.
147  */
148 static __inline struct sackblock *
149 alloc_sackblock(struct scoreboard *scb, const struct raw_sackblock *raw_sb)
150 {
151 	struct sackblock *sb;
152 
153 	if (scb->freecache != NULL) {
154 		sb = scb->freecache;
155 		scb->freecache = NULL;
156 		tcpstat.tcps_sacksbfast++;
157 	} else {
158 		sb = kmalloc(sizeof(struct sackblock), M_SACKBLOCK, M_NOWAIT);
159 		if (sb == NULL) {
160 			tcpstat.tcps_sacksbfailed++;
161 			return NULL;
162 		}
163 	}
164 	sb->sblk_start = raw_sb->rblk_start;
165 	sb->sblk_end = raw_sb->rblk_end;
166 	return sb;
167 }
168 
169 static __inline struct sackblock *
170 alloc_sackblock_limit(struct scoreboard *scb,
171     const struct raw_sackblock *raw_sb)
172 {
173 	if (scb->nblocks == MAXSAVEDBLOCKS) {
174 		/*
175 		 * Should try to kick out older blocks XXX JH
176 		 * May be able to coalesce with existing block.
177 		 * Or, go other way and free all blocks if we hit
178 		 * this limit.
179 		 */
180 		tcpstat.tcps_sacksboverflow++;
181 		return NULL;
182 	}
183 	return alloc_sackblock(scb, raw_sb);
184 }
185 
186 /*
187  * Free a SACK block.
188  */
189 static __inline void
190 free_sackblock(struct scoreboard *scb, struct sackblock *s)
191 {
192 	if (scb->freecache == NULL) {
193 		/* YYY Maybe use the latest freed block? */
194 		scb->freecache = s;
195 		return;
196 	}
197 	kfree(s, M_SACKBLOCK);
198 }
199 
200 /*
201  * Free up SACK blocks for data that's been acked.
202  */
203 static void
204 tcp_sack_ack_blocks(struct scoreboard *scb, tcp_seq th_ack)
205 {
206 	struct sackblock *sb, *nb;
207 
208 	sb = TAILQ_FIRST(&scb->sackblocks);
209 	while (sb && SEQ_LEQ(sb->sblk_end, th_ack)) {
210 		nb = TAILQ_NEXT(sb, sblk_list);
211 		if (scb->lastfound == sb)
212 			scb->lastfound = NULL;
213 		TAILQ_REMOVE(&scb->sackblocks, sb, sblk_list);
214 		free_sackblock(scb, sb);
215 		--scb->nblocks;
216 		KASSERT(scb->nblocks >= 0,
217 		    ("SACK block count underflow: %d < 0", scb->nblocks));
218 		sb = nb;
219 	}
220 	if (sb && SEQ_GT(th_ack, sb->sblk_start))
221 		sb->sblk_start = th_ack;	/* other side reneged? XXX */
222 }
223 
224 /*
225  * Delete and free SACK blocks saved in scoreboard.
226  */
227 void
228 tcp_sack_cleanup(struct scoreboard *scb)
229 {
230 	struct sackblock *sb, *nb;
231 
232 	TAILQ_FOREACH_MUTABLE(sb, &scb->sackblocks, sblk_list, nb) {
233 		free_sackblock(scb, sb);
234 		--scb->nblocks;
235 	}
236 	KASSERT(scb->nblocks == 0,
237 	    ("SACK block %d count not zero", scb->nblocks));
238 	TAILQ_INIT(&scb->sackblocks);
239 	scb->lastfound = NULL;
240 }
241 
242 /*
243  * Delete and free SACK blocks saved in scoreboard.
244  * Delete the one slot block cache.
245  */
246 void
247 tcp_sack_destroy(struct scoreboard *scb)
248 {
249 	tcp_sack_cleanup(scb);
250 	if (scb->freecache != NULL) {
251 		kfree(scb->freecache, M_SACKBLOCK);
252 		scb->freecache = NULL;
253 	}
254 }
255 
256 /*
257  * Cleanup the reported SACK block information
258  */
259 void
260 tcp_sack_report_cleanup(struct tcpcb *tp)
261 {
262 	tp->sack_flags &=
263 	    ~(TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG | TSACK_F_SACKLEFT);
264 	tp->reportblk.rblk_start = tp->reportblk.rblk_end;
265 }
266 
267 /*
268  * Returns	0 if not D-SACK block,
269  *		1 if D-SACK,
270  *		2 if duplicate of out-of-order D-SACK block.
271  */
272 int
273 tcp_sack_ndsack_blocks(struct raw_sackblock *blocks, const int numblocks,
274 		       tcp_seq snd_una)
275 {
276 	if (numblocks == 0)
277 		return 0;
278 
279 	if (SEQ_LT(blocks[0].rblk_start, snd_una))
280 		return 1;
281 
282 	/* block 0 inside block 1 */
283 	if (numblocks > 1 &&
284 	    SEQ_GEQ(blocks[0].rblk_start, blocks[1].rblk_start) &&
285 	    SEQ_LEQ(blocks[0].rblk_end, blocks[1].rblk_end))
286 		return 2;
287 
288 	return 0;
289 }
290 
291 /*
292  * Update scoreboard on new incoming ACK.
293  */
294 static void
295 tcp_sack_add_blocks(struct tcpcb *tp, struct tcpopt *to)
296 {
297 	const int numblocks = to->to_nsackblocks;
298 	struct raw_sackblock *blocks = to->to_sackblocks;
299 	struct scoreboard *scb = &tp->scb;
300 	int startblock, i;
301 
302 	if (tcp_sack_ndsack_blocks(blocks, numblocks, tp->snd_una) > 0)
303 		startblock = 1;
304 	else
305 		startblock = 0;
306 
307 	to->to_flags |= TOF_SACK_REDUNDANT;
308 	for (i = startblock; i < numblocks; i++) {
309 		struct raw_sackblock *newsackblock = &blocks[i];
310 		boolean_t update;
311 		int error;
312 
313 		/* Guard against ACK reordering */
314 		if (SEQ_LT(newsackblock->rblk_start, tp->snd_una))
315 			continue;
316 
317 		/* Don't accept bad SACK blocks */
318 		if (SEQ_GT(newsackblock->rblk_end, tp->snd_max)) {
319 			tcpstat.tcps_rcvbadsackopt++;
320 			break;		/* skip all other blocks */
321 		}
322 		tcpstat.tcps_sacksbupdate++;
323 
324 		error = insert_block(scb, newsackblock, &update);
325 		if (update)
326 			to->to_flags &= ~TOF_SACK_REDUNDANT;
327 		if (error)
328 			break;
329 	}
330 }
331 
332 void
333 tcp_sack_update_scoreboard(struct tcpcb *tp, struct tcpopt *to)
334 {
335 	struct scoreboard *scb = &tp->scb;
336 	int rexmt_high_update = 0;
337 
338 	tcp_sack_ack_blocks(scb, tp->snd_una);
339 	tcp_sack_add_blocks(tp, to);
340 	tcp_sack_update_lostseq(scb, tp->snd_una, tp->t_maxseg,
341 	    tp->t_rxtthresh);
342 	if (SEQ_LT(tp->rexmt_high, tp->snd_una)) {
343 		tp->rexmt_high = tp->snd_una;
344 		rexmt_high_update = 1;
345 	}
346 	if (tp->sack_flags & TSACK_F_SACKRESCUED) {
347 		if (SEQ_LT(tp->rexmt_rescue, tp->snd_una)) {
348 			tp->sack_flags &= ~TSACK_F_SACKRESCUED;
349 		} else if (tcp_aggressive_rescuesack && rexmt_high_update &&
350 		    SEQ_LT(tp->rexmt_rescue, tp->rexmt_high)) {
351 			/* Drag RescueRxt along with HighRxt */
352 			tp->rexmt_rescue = tp->rexmt_high;
353 		}
354 	}
355 }
356 
357 /*
358  * Insert SACK block into sender's scoreboard.
359  */
360 static int
361 insert_block(struct scoreboard *scb, const struct raw_sackblock *raw_sb,
362     boolean_t *update)
363 {
364 	struct sackblock *sb, *workingblock;
365 	boolean_t overlap_front;
366 
367 	*update = TRUE;
368 	if (TAILQ_EMPTY(&scb->sackblocks)) {
369 		struct sackblock *newblock;
370 
371 		KASSERT(scb->nblocks == 0, ("emply scb w/ blocks"));
372 
373 		newblock = alloc_sackblock(scb, raw_sb);
374 		if (newblock == NULL)
375 			return ENOMEM;
376 		TAILQ_INSERT_HEAD(&scb->sackblocks, newblock, sblk_list);
377 		scb->nblocks = 1;
378 		return 0;
379 	}
380 
381 	KASSERT(scb->nblocks > 0, ("insert_block() called w/ no blocks"));
382 	KASSERT(scb->nblocks <= MAXSAVEDBLOCKS,
383 	    ("too many SACK blocks %d", scb->nblocks));
384 
385 	overlap_front = sack_block_lookup(scb, raw_sb->rblk_start, &sb);
386 
387 	if (sb == NULL) {
388 		workingblock = alloc_sackblock_limit(scb, raw_sb);
389 		if (workingblock == NULL)
390 			return ENOMEM;
391 		TAILQ_INSERT_HEAD(&scb->sackblocks, workingblock, sblk_list);
392 		++scb->nblocks;
393 	} else {
394 		if (overlap_front || sb->sblk_end == raw_sb->rblk_start) {
395 			/* Extend old block */
396 			workingblock = sb;
397 			if (SEQ_GT(raw_sb->rblk_end, sb->sblk_end))
398 				sb->sblk_end = raw_sb->rblk_end;
399 			else
400 				*update = FALSE;
401 			tcpstat.tcps_sacksbreused++;
402 		} else {
403 			workingblock = alloc_sackblock_limit(scb, raw_sb);
404 			if (workingblock == NULL)
405 				return ENOMEM;
406 			TAILQ_INSERT_AFTER(&scb->sackblocks, sb, workingblock,
407 			    sblk_list);
408 			++scb->nblocks;
409 		}
410 	}
411 
412 	/* Consolidate right-hand side. */
413 	sb = TAILQ_NEXT(workingblock, sblk_list);
414 	while (sb != NULL &&
415 	    SEQ_GEQ(workingblock->sblk_end, sb->sblk_end)) {
416 		struct sackblock *nextblock;
417 
418 		nextblock = TAILQ_NEXT(sb, sblk_list);
419 		if (scb->lastfound == sb)
420 			scb->lastfound = NULL;
421 		/* Remove completely overlapped block */
422 		TAILQ_REMOVE(&scb->sackblocks, sb, sblk_list);
423 		free_sackblock(scb, sb);
424 		--scb->nblocks;
425 		KASSERT(scb->nblocks > 0,
426 		    ("removed overlapped block: %d blocks left", scb->nblocks));
427 		sb = nextblock;
428 	}
429 	if (sb != NULL &&
430 	    SEQ_GEQ(workingblock->sblk_end, sb->sblk_start)) {
431 		/* Extend new block to cover partially overlapped old block. */
432 		workingblock->sblk_end = sb->sblk_end;
433 		if (scb->lastfound == sb)
434 			scb->lastfound = NULL;
435 		TAILQ_REMOVE(&scb->sackblocks, sb, sblk_list);
436 		free_sackblock(scb, sb);
437 		--scb->nblocks;
438 		KASSERT(scb->nblocks > 0,
439 		    ("removed partial right: %d blocks left", scb->nblocks));
440 	}
441 	return 0;
442 }
443 
444 #ifdef DEBUG_SACK_BLOCKS
445 static void
446 tcp_sack_dump_blocks(struct scoreboard *scb)
447 {
448 	struct sackblock *sb;
449 
450 	kprintf("%d blocks:", scb->nblocks);
451 	TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list)
452 		kprintf(" [%u, %u)", sb->sblk_start, sb->sblk_end);
453 	kprintf("\n");
454 }
455 #else
456 static __inline void
457 tcp_sack_dump_blocks(struct scoreboard *scb)
458 {
459 }
460 #endif
461 
462 /*
463  * Optimization to quickly determine which packets are lost.
464  */
465 void
466 tcp_sack_update_lostseq(struct scoreboard *scb, tcp_seq snd_una, u_int maxseg,
467     int rxtthresh)
468 {
469 	struct sackblock *sb;
470 	int nsackblocks = 0;
471 	int bytes_sacked = 0;
472 	int rxtthresh_bytes;
473 
474 	/*
475 	 * XXX
476 	 * The RFC3517bis recommends to reduce the byte threshold.
477 	 * However, it will cause extra spurious retransmit if
478 	 * segments are reordered.  Before certain DupThresh adaptive
479 	 * algorithm is implemented, we don't reduce the byte
480 	 * threshold (tcp_rfc3517bis_rxt is off by default).
481 	 */
482 	if (tcp_do_rfc3517bis && tcp_rfc3517bis_rxt)
483 		rxtthresh_bytes = (rxtthresh - 1) * maxseg;
484 	else
485 		rxtthresh_bytes = rxtthresh * maxseg;
486 
487 	sb = TAILQ_LAST(&scb->sackblocks, sackblock_list);
488 	while (sb != NULL) {
489 		++nsackblocks;
490 		bytes_sacked += sb->sblk_end - sb->sblk_start;
491 		if (nsackblocks == rxtthresh ||
492 		    bytes_sacked >= rxtthresh_bytes) {
493 			scb->lostseq = sb->sblk_start;
494 			return;
495 		}
496 		sb = TAILQ_PREV(sb, sackblock_list, sblk_list);
497 	}
498 	scb->lostseq = snd_una;
499 }
500 
501 /*
502  * Return whether the given sequence number is considered lost.
503  */
504 boolean_t
505 tcp_sack_islost(struct scoreboard *scb, tcp_seq seqnum)
506 {
507 	return SEQ_LT(seqnum, scb->lostseq);
508 }
509 
510 /*
511  * True if at least "amount" has been SACKed.  Used by Early Retransmit.
512  */
513 boolean_t
514 tcp_sack_has_sacked(struct scoreboard *scb, u_int amount)
515 {
516 	struct sackblock *sb;
517 	int bytes_sacked = 0;
518 
519 	TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list) {
520 		bytes_sacked += sb->sblk_end - sb->sblk_start;
521 		if (bytes_sacked >= amount)
522 			return TRUE;
523 	}
524 	return FALSE;
525 }
526 
527 /*
528  * Number of bytes SACKed below seq.
529  */
530 int
531 tcp_sack_bytes_below(struct scoreboard *scb, tcp_seq seq)
532 {
533 	struct sackblock *sb;
534 	int bytes_sacked = 0;
535 
536 	sb = TAILQ_FIRST(&scb->sackblocks);
537 	while (sb && SEQ_GT(seq, sb->sblk_start)) {
538 		bytes_sacked += seq_min(seq, sb->sblk_end) - sb->sblk_start;
539 		sb = TAILQ_NEXT(sb, sblk_list);
540 	}
541 	return bytes_sacked;
542 }
543 
544 /*
545  * Return estimate of the number of bytes outstanding in the network.
546  */
547 uint32_t
548 tcp_sack_compute_pipe(struct tcpcb *tp)
549 {
550 	struct scoreboard *scb = &tp->scb;
551 	struct sackblock *sb;
552 	int nlost, nretransmitted;
553 	tcp_seq end;
554 
555 	nlost = tp->snd_max - scb->lostseq;
556 	nretransmitted = tp->rexmt_high - tp->snd_una;
557 
558 	TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list) {
559 		if (SEQ_LT(sb->sblk_start, tp->rexmt_high)) {
560 			end = seq_min(sb->sblk_end, tp->rexmt_high);
561 			nretransmitted -= end - sb->sblk_start;
562 		}
563 		if (SEQ_GEQ(sb->sblk_start, scb->lostseq))
564 			nlost -= sb->sblk_end - sb->sblk_start;
565 	}
566 
567 	return (nlost + nretransmitted);
568 }
569 
570 /*
571  * Return the sequence number and length of the next segment to transmit
572  * when in Fast Recovery.
573  */
574 boolean_t
575 tcp_sack_nextseg(struct tcpcb *tp, tcp_seq *nextrexmt, uint32_t *plen,
576     boolean_t *rescue)
577 {
578 	struct scoreboard *scb = &tp->scb;
579 	struct socket *so = tp->t_inpcb->inp_socket;
580 	struct sackblock *sb;
581 	const struct sackblock *lastblock =
582 	    TAILQ_LAST(&scb->sackblocks, sackblock_list);
583 	tcp_seq torexmt;
584 	long len, off;
585 
586 	/* skip SACKed data */
587 	tcp_sack_skip_sacked(scb, &tp->rexmt_high);
588 
589 	/* Look for lost data. */
590 	torexmt = tp->rexmt_high;
591 	*rescue = FALSE;
592 	if (lastblock != NULL) {
593 		if (SEQ_LT(torexmt, lastblock->sblk_end) &&
594 		    tcp_sack_islost(scb, torexmt)) {
595 sendunsacked:
596 			*nextrexmt = torexmt;
597 			/* If the left-hand edge has been SACKed, pull it in. */
598 			if (sack_block_lookup(scb, torexmt + tp->t_maxseg, &sb))
599 				*plen = sb->sblk_start - torexmt;
600 			else
601 				*plen = tp->t_maxseg;
602 			return TRUE;
603 		}
604 	}
605 
606 	/* See if unsent data available within send window. */
607 	off = tp->snd_max - tp->snd_una;
608 	len = (long) ulmin(so->so_snd.ssb_cc, tp->snd_wnd) - off;
609 	if (len > 0) {
610 		*nextrexmt = tp->snd_max;	/* Send new data. */
611 		*plen = tp->t_maxseg;
612 		return TRUE;
613 	}
614 
615 	/* We're less certain this data has been lost. */
616 	if (lastblock != NULL && SEQ_LT(torexmt, lastblock->sblk_end))
617 		goto sendunsacked;
618 
619 	/* Rescue retransmission */
620 	if (tcp_do_rescuesack || tcp_do_rfc3517bis) {
621 		tcpstat.tcps_sackrescue_try++;
622 		if (tp->sack_flags & TSACK_F_SACKRESCUED) {
623 			if (!tcp_aggressive_rescuesack)
624 				return FALSE;
625 
626 			/*
627 			 * Aggressive variant of the rescue retransmission.
628 			 *
629 			 * The idea of the rescue retransmission is to sustain
630 			 * the ACK clock thus to avoid timeout retransmission.
631 			 *
632 			 * Under some situations, the conservative approach
633 			 * suggested in the draft
634  			 * http://tools.ietf.org/html/
635 			 * draft-nishida-tcpm-rescue-retransmission-00
636 			 * could not sustain ACK clock, since it only allows
637 			 * one rescue retransmission before a cumulative ACK
638 			 * covers the segement transmitted by rescue
639 			 * retransmission.
640 			 *
641 			 * We try to locate the next unSACKed segment which
642 			 * follows the previously sent rescue segment.  If
643 			 * there is no such segment, we loop back to the first
644 			 * unacknowledged segment.
645 			 */
646 
647 			/*
648 			 * Skip SACKed data, but here we follow
649 			 * the last transmitted rescue segment.
650 			 */
651 			torexmt = tp->rexmt_rescue;
652 			tcp_sack_skip_sacked(scb, &torexmt);
653 			if (torexmt == tp->snd_max) {
654 				/* Nothing left to retransmit; restart */
655 				torexmt = tp->snd_una;
656 			}
657 		}
658 		*rescue = TRUE;
659 		goto sendunsacked;
660 	} else if (tcp_do_smartsack && lastblock == NULL) {
661 		tcpstat.tcps_sackrescue_try++;
662 		*rescue = TRUE;
663 		goto sendunsacked;
664 	}
665 
666 	return FALSE;
667 }
668 
669 /*
670  * Return the next sequence number higher than "*prexmt" that has
671  * not been SACKed.
672  */
673 void
674 tcp_sack_skip_sacked(struct scoreboard *scb, tcp_seq *prexmt)
675 {
676 	struct sackblock *sb;
677 
678 	/* skip SACKed data */
679 	if (sack_block_lookup(scb, *prexmt, &sb))
680 		*prexmt = sb->sblk_end;
681 }
682 
683 #ifdef later
684 void
685 tcp_sack_save_scoreboard(struct scoreboard *scb)
686 {
687 	struct scoreboard *scb = &tp->scb;
688 
689 	scb->sackblocks_prev = scb->sackblocks;
690 	TAILQ_INIT(&scb->sackblocks);
691 }
692 
693 void
694 tcp_sack_revert_scoreboard(struct scoreboard *scb, tcp_seq snd_una,
695 			   u_int maxseg)
696 {
697 	struct sackblock *sb;
698 
699 	scb->sackblocks = scb->sackblocks_prev;
700 	scb->nblocks = 0;
701 	TAILQ_FOREACH(sb, &scb->sackblocks, sblk_list)
702 		++scb->nblocks;
703 	tcp_sack_ack_blocks(scb, snd_una);
704 	scb->lastfound = NULL;
705 }
706 #endif
707 
708 #ifdef DEBUG_SACK_HISTORY
709 static void
710 tcp_sack_dump_history(char *msg, struct tcpcb *tp)
711 {
712 	int i;
713 	static int ndumped;
714 
715 	/* only need a couple of these to debug most problems */
716 	if (++ndumped > 900)
717 		return;
718 
719 	kprintf("%s:\tnsackhistory %d: ", msg, tp->nsackhistory);
720 	for (i = 0; i < tp->nsackhistory; ++i)
721 		kprintf("[%u, %u) ", tp->sackhistory[i].rblk_start,
722 		    tp->sackhistory[i].rblk_end);
723 	kprintf("\n");
724 }
725 #else
726 static __inline void
727 tcp_sack_dump_history(char *msg, struct tcpcb *tp)
728 {
729 }
730 #endif
731 
732 /*
733  * Remove old SACK blocks from the SACK history that have already been ACKed.
734  */
735 static void
736 tcp_sack_ack_history(struct tcpcb *tp)
737 {
738 	int i, nblocks, openslot;
739 
740 	tcp_sack_dump_history("before tcp_sack_ack_history", tp);
741 	nblocks = tp->nsackhistory;
742 	for (i = openslot = 0; i < nblocks; ++i) {
743 		if (SEQ_LEQ(tp->sackhistory[i].rblk_end, tp->rcv_nxt)) {
744 			--tp->nsackhistory;
745 			continue;
746 		}
747 		if (SEQ_LT(tp->sackhistory[i].rblk_start, tp->rcv_nxt))
748 			tp->sackhistory[i].rblk_start = tp->rcv_nxt;
749 		if (i == openslot)
750 			++openslot;
751 		else
752 			tp->sackhistory[openslot++] = tp->sackhistory[i];
753 	}
754 	tcp_sack_dump_history("after tcp_sack_ack_history", tp);
755 	KASSERT(openslot == tp->nsackhistory,
756 	    ("tcp_sack_ack_history miscounted: %d != %d",
757 	    openslot, tp->nsackhistory));
758 }
759 
760 /*
761  * Add or merge newblock into reported history.
762  * Also remove or update SACK blocks that will be acked.
763  */
764 static void
765 tcp_sack_update_reported_history(struct tcpcb *tp, tcp_seq start, tcp_seq end)
766 {
767 	struct raw_sackblock copy[MAX_SACK_REPORT_BLOCKS];
768 	int i, cindex;
769 
770 	tcp_sack_dump_history("before tcp_sack_update_reported_history", tp);
771 	/*
772 	 * Six cases:
773 	 *	0) no overlap
774 	 *	1) newblock == oldblock
775 	 *	2) oldblock contains newblock
776 	 *	3) newblock contains oldblock
777 	 *	4) tail of oldblock overlaps or abuts start of newblock
778 	 *	5) tail of newblock overlaps or abuts head of oldblock
779 	 */
780 	for (i = cindex = 0; i < tp->nsackhistory; ++i) {
781 		struct raw_sackblock *oldblock = &tp->sackhistory[i];
782 		tcp_seq old_start = oldblock->rblk_start;
783 		tcp_seq old_end = oldblock->rblk_end;
784 
785 		if (SEQ_LT(end, old_start) || SEQ_GT(start, old_end)) {
786 			/* Case 0:  no overlap.  Copy old block. */
787 			copy[cindex++] = *oldblock;
788 			continue;
789 		}
790 
791 		if (SEQ_GEQ(start, old_start) && SEQ_LEQ(end, old_end)) {
792 			/* Cases 1 & 2.  Move block to front of history. */
793 			int j;
794 
795 			start = old_start;
796 			end = old_end;
797 			/* no need to check rest of blocks */
798 			for (j = i + 1; j < tp->nsackhistory; ++j)
799 				copy[cindex++] = tp->sackhistory[j];
800 			break;
801 		}
802 
803 		if (SEQ_GEQ(old_end, start) && SEQ_LT(old_start, start)) {
804 			/* Case 4:  extend start of new block. */
805 			start = old_start;
806 		} else if (SEQ_GEQ(end, old_start) && SEQ_GT(old_end, end)) {
807 			/* Case 5: extend end of new block */
808 			end = old_end;
809 		} else {
810 			/* Case 3.  Delete old block by not copying it. */
811 			KASSERT(SEQ_LEQ(start, old_start) &&
812 				SEQ_GEQ(end, old_end),
813 			    ("bad logic: old [%u, %u), new [%u, %u)",
814 			     old_start, old_end, start, end));
815 		}
816 	}
817 
818 	/* insert new block */
819 	tp->sackhistory[0].rblk_start = start;
820 	tp->sackhistory[0].rblk_end = end;
821 	cindex = min(cindex, MAX_SACK_REPORT_BLOCKS - 1);
822 	for (i = 0; i < cindex; ++i)
823 		tp->sackhistory[i + 1] = copy[i];
824 	tp->nsackhistory = cindex + 1;
825 	tcp_sack_dump_history("after tcp_sack_update_reported_history", tp);
826 }
827 
828 /*
829  * Fill in SACK report to return to data sender.
830  */
831 void
832 tcp_sack_fill_report(struct tcpcb *tp, u_char *opt, u_int *plen)
833 {
834 	u_int optlen = *plen;
835 	uint32_t *lp = (uint32_t *)(opt + optlen);
836 	uint32_t *olp;
837 	tcp_seq hstart = tp->rcv_nxt, hend;
838 	int nblocks;
839 
840 	KASSERT(TCP_MAXOLEN - optlen >=
841 	    TCPOLEN_SACK_ALIGNED + TCPOLEN_SACK_BLOCK,
842 	    ("no room for SACK header and one block: optlen %d", optlen));
843 
844 	if (tp->sack_flags & TSACK_F_DUPSEG)
845 		tcpstat.tcps_snddsackopt++;
846 	else
847 		tcpstat.tcps_sndsackopt++;
848 
849 	olp = lp++;
850 	optlen += TCPOLEN_SACK_ALIGNED;
851 
852 	tcp_sack_ack_history(tp);
853 	if (tp->reportblk.rblk_start != tp->reportblk.rblk_end) {
854 		*lp++ = htonl(tp->reportblk.rblk_start);
855 		*lp++ = htonl(tp->reportblk.rblk_end);
856 		optlen += TCPOLEN_SACK_BLOCK;
857 		hstart = tp->reportblk.rblk_start;
858 		hend = tp->reportblk.rblk_end;
859 		if (tp->sack_flags & TSACK_F_ENCLOSESEG) {
860 			KASSERT(TCP_MAXOLEN - optlen >= TCPOLEN_SACK_BLOCK,
861 			    ("no room for enclosing SACK block: oplen %d",
862 			    optlen));
863 			*lp++ = htonl(tp->encloseblk.rblk_start);
864 			*lp++ = htonl(tp->encloseblk.rblk_end);
865 			optlen += TCPOLEN_SACK_BLOCK;
866 			hstart = tp->encloseblk.rblk_start;
867 			hend = tp->encloseblk.rblk_end;
868 		}
869 		if (SEQ_GT(hstart, tp->rcv_nxt))
870 			tcp_sack_update_reported_history(tp, hstart, hend);
871 	}
872 	if (tcp_do_smartsack && (tp->sack_flags & TSACK_F_SACKLEFT)) {
873 		/* Fill in from left!  Walk re-assembly queue. */
874 		struct tseg_qent *q;
875 
876 		q = TAILQ_FIRST(&tp->t_segq);
877 		while (q != NULL &&
878 		    TCP_MAXOLEN - optlen >= TCPOLEN_SACK_BLOCK) {
879 			*lp++ = htonl(q->tqe_th->th_seq);
880 			*lp++ = htonl(TCP_SACK_BLKEND(
881 			    q->tqe_th->th_seq + q->tqe_len,
882 			    q->tqe_th->th_flags));
883 			optlen += TCPOLEN_SACK_BLOCK;
884 			q = TAILQ_NEXT(q, tqe_q);
885 		}
886 	} else {
887 		int n = 0;
888 
889 		/* Fill in SACK blocks from right side. */
890 		while (n < tp->nsackhistory &&
891 		    TCP_MAXOLEN - optlen >= TCPOLEN_SACK_BLOCK) {
892 			if (tp->sackhistory[n].rblk_start != hstart) {
893 				*lp++ = htonl(tp->sackhistory[n].rblk_start);
894 				*lp++ = htonl(tp->sackhistory[n].rblk_end);
895 				optlen += TCPOLEN_SACK_BLOCK;
896 			}
897 			++n;
898 		}
899 	}
900 	tp->reportblk.rblk_start = tp->reportblk.rblk_end;
901 	tp->sack_flags &=
902 	    ~(TSACK_F_DUPSEG | TSACK_F_ENCLOSESEG | TSACK_F_SACKLEFT);
903 	nblocks = (lp - olp - 1) / 2;
904 	*olp = htonl(TCPOPT_SACK_ALIGNED |
905 		     (TCPOLEN_SACK + nblocks * TCPOLEN_SACK_BLOCK));
906 	*plen = optlen;
907 }
908