1 /* $NetBSD: tcp_sack.c,v 1.23 2007/03/12 18:18:36 ad Exp $ */ 2 3 /* 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Kentaro A. Kurahone. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the NetBSD 21 * Foundation, Inc. and its contributors. 22 * 4. Neither the name of The NetBSD Foundation nor the names of its 23 * contributors may be used to endorse or promote products derived 24 * from this software without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 * POSSIBILITY OF SUCH DAMAGE. 37 */ 38 39 /* 40 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 41 * The Regents of the University of California. All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 4. Neither the name of the University nor the names of its contributors 52 * may be used to endorse or promote products derived from this software 53 * without specific prior written permission. 54 * 55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 65 * SUCH DAMAGE. 66 * 67 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 68 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $ 69 */ 70 71 /* 72 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 73 * 74 * NRL grants permission for redistribution and use in source and binary 75 * forms, with or without modification, of the software and documentation 76 * created at NRL provided that the following conditions are met: 77 * 78 * 1. Redistributions of source code must retain the above copyright 79 * notice, this list of conditions and the following disclaimer. 80 * 2. Redistributions in binary form must reproduce the above copyright 81 * notice, this list of conditions and the following disclaimer in the 82 * documentation and/or other materials provided with the distribution. 83 * 3. All advertising materials mentioning features or use of this software 84 * must display the following acknowledgements: 85 * This product includes software developed by the University of 86 * California, Berkeley and its contributors. 87 * This product includes software developed at the Information 88 * Technology Division, US Naval Research Laboratory. 89 * 4. Neither the name of the NRL nor the names of its contributors 90 * may be used to endorse or promote products derived from this software 91 * without specific prior written permission. 92 * 93 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 94 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 96 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 97 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 98 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 99 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 100 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 101 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 102 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 103 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 104 * 105 * The views and conclusions contained in the software and documentation 106 * are those of the authors and should not be interpreted as representing 107 * official policies, either expressed or implied, of the US Naval 108 * Research Laboratory (NRL). 109 */ 110 111 #include <sys/cdefs.h> 112 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.23 2007/03/12 18:18:36 ad Exp $"); 113 114 #include "opt_inet.h" 115 #include "opt_ipsec.h" 116 #include "opt_inet_csum.h" 117 #include "opt_tcp_debug.h" 118 #include "opt_ddb.h" 119 120 #include <sys/param.h> 121 #include <sys/systm.h> 122 #include <sys/malloc.h> 123 #include <sys/mbuf.h> 124 #include <sys/protosw.h> 125 #include <sys/socket.h> 126 #include <sys/socketvar.h> 127 #include <sys/errno.h> 128 #include <sys/syslog.h> 129 #include <sys/pool.h> 130 #include <sys/domain.h> 131 #include <sys/kernel.h> 132 133 #include <net/if.h> 134 #include <net/route.h> 135 #include <net/if_types.h> 136 137 #include <netinet/in.h> 138 #include <netinet/in_systm.h> 139 #include <netinet/ip.h> 140 #include <netinet/in_pcb.h> 141 #include <netinet/in_var.h> 142 #include <netinet/ip_var.h> 143 144 #ifdef INET6 145 #ifndef INET 146 #include <netinet/in.h> 147 #endif 148 #include <netinet/ip6.h> 149 #include <netinet6/ip6_var.h> 150 #include <netinet6/in6_pcb.h> 151 #include <netinet6/ip6_var.h> 152 #include <netinet6/in6_var.h> 153 #include <netinet/icmp6.h> 154 #include <netinet6/nd6.h> 155 #endif 156 157 #ifndef INET6 158 /* always need ip6.h for IP6_EXTHDR_GET */ 159 #include <netinet/ip6.h> 160 #endif 161 162 #include <netinet/tcp.h> 163 #include <netinet/tcp_fsm.h> 164 #include <netinet/tcp_seq.h> 165 #include <netinet/tcp_timer.h> 166 #include <netinet/tcp_var.h> 167 #include <netinet/tcpip.h> 168 #include <netinet/tcp_debug.h> 169 170 #include <machine/stdarg.h> 171 172 /* SACK block pool. */ 173 static POOL_INIT(sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl", 174 NULL, IPL_SOFTNET); 175 176 static struct sackhole * 177 sack_allochole(struct tcpcb *tp) 178 { 179 struct sackhole *hole; 180 181 if (tp->snd_numholes >= tcp_sack_tp_maxholes || 182 tcp_sack_globalholes >= tcp_sack_globalmaxholes) { 183 return NULL; 184 } 185 hole = pool_get(&sackhole_pool, PR_NOWAIT); 186 if (hole == NULL) { 187 return NULL; 188 } 189 tp->snd_numholes++; 190 tcp_sack_globalholes++; 191 192 return hole; 193 } 194 195 static struct sackhole * 196 sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end, 197 struct sackhole *prev) 198 { 199 struct sackhole *hole; 200 201 hole = sack_allochole(tp); 202 if (hole == NULL) { 203 return NULL; 204 } 205 hole->start = hole->rxmit = start; 206 hole->end = end; 207 if (prev != NULL) { 208 TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q); 209 } else { 210 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q); 211 } 212 return hole; 213 } 214 215 static struct sackhole * 216 sack_removehole(struct tcpcb *tp, struct sackhole *hole) 217 { 218 struct sackhole *next; 219 220 next = TAILQ_NEXT(hole, sackhole_q); 221 tp->snd_numholes--; 222 tcp_sack_globalholes--; 223 TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q); 224 pool_put(&sackhole_pool, hole); 225 226 return next; 227 } 228 229 void 230 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len) 231 { 232 if (TCP_SACK_ENABLED(tp)) { 233 tp->rcv_dsack_block.left = seq; 234 tp->rcv_dsack_block.right = seq + len; 235 tp->rcv_sack_flags |= TCPSACK_HAVED; 236 } 237 } 238 239 void 240 tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp, 241 int optlen) 242 { 243 struct sackblk 244 t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)]; 245 struct sackblk *sack = NULL; 246 struct sackhole *cur = NULL; 247 struct sackhole *tmp = NULL; 248 const char *lp = cp + 2; 249 int i, j, num_sack_blks; 250 tcp_seq left, right, acked; 251 252 /* 253 * If we aren't processing SACK responses, this is not an ACK 254 * or the peer sends us a sack option with invalid length, don't 255 * update the scoreboard. 256 */ 257 if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) || 258 (optlen % 8 != 2 || optlen < 10)) { 259 return; 260 } 261 262 /* 263 * If we don't want any SACK holes to be allocated, just return. 264 */ 265 if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) { 266 return; 267 } 268 269 /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */ 270 if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max)) 271 return; 272 273 /* 274 * Extract SACK blocks. 275 * 276 * Note that t_sack_block is sorted so that we only need to do 277 * one pass over the sequence number space. (SACK "fast-path") 278 */ 279 num_sack_blks = optlen / 8; 280 acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una; 281 for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) { 282 memcpy(&left, lp, sizeof(uint32_t)); 283 memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t)); 284 left = ntohl(left); 285 right = ntohl(right); 286 287 if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) || 288 SEQ_GEQ(left, right)) { 289 /* SACK entry that's old, or invalid. */ 290 i--; 291 num_sack_blks--; 292 continue; 293 } 294 295 /* Insertion sort. */ 296 for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left); 297 j--) { 298 t_sack_block[j].left = t_sack_block[j - 1].left; 299 t_sack_block[j].right = t_sack_block[j - 1].right; 300 } 301 t_sack_block[j].left = left; 302 t_sack_block[j].right = right; 303 } 304 305 /* Update the scoreboard. */ 306 cur = TAILQ_FIRST(&tp->snd_holes); 307 for (i = 0; i < num_sack_blks; i++) { 308 sack = &t_sack_block[i]; 309 /* 310 * FACK TCP. Update snd_fack so we can enter Fast 311 * Recovery early. 312 */ 313 if (SEQ_GEQ(sack->right, tp->snd_fack)) 314 tp->snd_fack = sack->right; 315 316 if (TAILQ_EMPTY(&tp->snd_holes)) { 317 /* First hole. */ 318 cur = sack_inserthole(tp, th->th_ack, sack->left, NULL); 319 if (cur == NULL) { 320 /* ENOBUFS, bail out*/ 321 return; 322 } 323 tp->rcv_lastsack = sack->right; 324 continue; /* With next sack block */ 325 } 326 327 /* Go through the list of holes. */ 328 while (cur) { 329 if (SEQ_LEQ(sack->right, cur->start)) 330 /* SACKs data before the current hole */ 331 break; /* No use going through more holes */ 332 333 if (SEQ_GEQ(sack->left, cur->end)) { 334 /* SACKs data beyond the current hole */ 335 cur = TAILQ_NEXT(cur, sackhole_q); 336 continue; 337 } 338 339 if (SEQ_LEQ(sack->left, cur->start)) { 340 /* Data acks at least the beginning of hole */ 341 if (SEQ_GEQ(sack->right, cur->end)) { 342 /* Acks entire hole, so delete hole */ 343 cur = sack_removehole(tp, cur); 344 break; 345 } 346 347 /* Otherwise, move start of hole forward */ 348 cur->start = sack->right; 349 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 350 break; 351 } 352 353 if (SEQ_GEQ(sack->right, cur->end)) { 354 /* Move end of hole backward. */ 355 cur->end = sack->left; 356 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 357 cur = TAILQ_NEXT(cur, sackhole_q); 358 break; 359 } 360 361 if (SEQ_LT(cur->start, sack->left) && 362 SEQ_GT(cur->end, sack->right)) { 363 /* 364 * ACKs some data in middle of a hole; need to 365 * split current hole 366 */ 367 tmp = sack_inserthole(tp, sack->right, cur->end, 368 cur); 369 if (tmp == NULL) { 370 return; 371 } 372 tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start); 373 cur->end = sack->left; 374 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 375 cur = tmp; 376 break; 377 } 378 } 379 380 /* At this point, we have reached the tail of the list. */ 381 if (SEQ_LT(tp->rcv_lastsack, sack->left)) { 382 /* 383 * Need to append new hole at end. 384 */ 385 cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left, 386 NULL); 387 if (cur == NULL) { 388 return; 389 } 390 } 391 if (SEQ_LT(tp->rcv_lastsack, sack->right)) { 392 tp->rcv_lastsack = sack->right; 393 } 394 } 395 } 396 397 void 398 tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th) 399 { 400 /* Max because this could be an older ack that just arrived. */ 401 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 402 th->th_ack : tp->snd_una; 403 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 404 405 while (cur) { 406 if (SEQ_LEQ(cur->end, lastack)) { 407 cur = sack_removehole(tp, cur); 408 } else if (SEQ_LT(cur->start, lastack)) { 409 cur->start = lastack; 410 if (SEQ_LT(cur->rxmit, cur->start)) 411 cur->rxmit = cur->start; 412 break; 413 } else 414 break; 415 } 416 } 417 418 void 419 tcp_free_sackholes(struct tcpcb *tp) 420 { 421 struct sackhole *sack; 422 423 /* Free up the SACK hole list. */ 424 while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) { 425 sack_removehole(tp, sack); 426 } 427 KASSERT(tp->snd_numholes == 0); 428 } 429 430 /* 431 * Implements the SACK response to a new ack, checking for partial acks 432 * in fast recovery. 433 */ 434 void 435 tcp_sack_newack(struct tcpcb *tp, const struct tcphdr *th) 436 { 437 if (tp->t_partialacks < 0) { 438 /* 439 * Not in fast recovery. Reset the duplicate ack 440 * counter. 441 */ 442 tp->t_dupacks = 0; 443 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { 444 /* 445 * Partial ack handling within a sack recovery episode. 446 * Keeping this very simple for now. When a partial ack 447 * is received, force snd_cwnd to a value that will allow 448 * the sender to transmit no more than 2 segments. 449 * If necessary, a fancier scheme can be adopted at a 450 * later point, but for now, the goal is to prevent the 451 * sender from bursting a large amount of data in the midst 452 * of sack recovery. 453 */ 454 int num_segs = 1; 455 int sack_bytes_rxmt = 0; 456 457 tp->t_partialacks++; 458 TCP_TIMER_DISARM(tp, TCPT_REXMT); 459 tp->t_rtttime = 0; 460 461 /* 462 * send one or 2 segments based on how much new data was acked 463 */ 464 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) 465 num_segs = 2; 466 (void)tcp_sack_output(tp, &sack_bytes_rxmt); 467 tp->snd_cwnd = sack_bytes_rxmt + 468 (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_segsz; 469 tp->t_flags |= TF_ACKNOW; 470 (void) tcp_output(tp); 471 } else { 472 /* 473 * Complete ack, inflate the congestion window to 474 * ssthresh and exit fast recovery. 475 * 476 * Window inflation should have left us with approx. 477 * snd_ssthresh outstanding data. But in case we 478 * would be inclined to send a burst, better to do 479 * it via the slow start mechanism. 480 */ 481 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 482 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 483 + tp->t_segsz; 484 else 485 tp->snd_cwnd = tp->snd_ssthresh; 486 tp->t_partialacks = -1; 487 tp->t_dupacks = 0; 488 if (SEQ_GT(th->th_ack, tp->snd_fack)) 489 tp->snd_fack = th->th_ack; 490 } 491 } 492 493 /* 494 * Returns pointer to a sackhole if there are any pending retransmissions; 495 * NULL otherwise. 496 */ 497 struct sackhole * 498 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) 499 { 500 struct sackhole *cur = NULL; 501 502 if (!TCP_SACK_ENABLED(tp)) 503 return (NULL); 504 505 *sack_bytes_rexmt = 0; 506 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { 507 if (SEQ_LT(cur->rxmit, cur->end)) { 508 if (SEQ_LT(cur->rxmit, tp->snd_una)) { 509 /* old SACK hole */ 510 continue; 511 } 512 *sack_bytes_rexmt += (cur->rxmit - cur->start); 513 break; 514 } 515 *sack_bytes_rexmt += (cur->rxmit - cur->start); 516 } 517 518 return (cur); 519 } 520 521 /* 522 * After a timeout, the SACK list may be rebuilt. This SACK information 523 * should be used to avoid retransmitting SACKed data. This function 524 * traverses the SACK list to see if snd_nxt should be moved forward. 525 */ 526 void 527 tcp_sack_adjust(struct tcpcb *tp) 528 { 529 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 530 struct sackhole *n = NULL; 531 532 if (TAILQ_EMPTY(&tp->snd_holes)) 533 return; /* No holes */ 534 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) 535 return; /* We're already beyond any SACKed blocks */ 536 537 /* 538 * Two cases for which we want to advance snd_nxt: 539 * i) snd_nxt lies between end of one hole and beginning of another 540 * ii) snd_nxt lies between end of last hole and rcv_lastsack 541 */ 542 while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) { 543 if (SEQ_LT(tp->snd_nxt, cur->end)) 544 return; 545 if (SEQ_GEQ(tp->snd_nxt, n->start)) 546 cur = n; 547 else { 548 tp->snd_nxt = n->start; 549 return; 550 } 551 } 552 if (SEQ_LT(tp->snd_nxt, cur->end)) 553 return; 554 tp->snd_nxt = tp->rcv_lastsack; 555 556 return; 557 } 558 559 int 560 tcp_sack_numblks(const struct tcpcb *tp) 561 { 562 int numblks; 563 564 if (!TCP_SACK_ENABLED(tp)) { 565 return 0; 566 } 567 568 numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) + 569 tp->t_segqlen; 570 571 if (numblks == 0) { 572 return 0; 573 } 574 575 if (numblks > TCP_SACK_MAX) { 576 numblks = TCP_SACK_MAX; 577 } 578 579 return numblks; 580 } 581 582 #if defined(DDB) 583 void sack_dump(const struct tcpcb *); 584 585 void 586 sack_dump(const struct tcpcb *tp) 587 { 588 const struct sackhole *cur; 589 590 printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n", 591 tp->snd_una, tp->snd_max); 592 printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n", 593 tp->rcv_lastsack, tp->snd_fack); 594 printf("numholes=%d\n", tp->snd_numholes); 595 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { 596 printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n", 597 cur->start, cur->end, cur->rxmit); 598 } 599 } 600 #endif /* defined(DDB) */ 601