1 /* $NetBSD: tcp_sack.c,v 1.24 2008/04/28 20:24:09 martin Exp $ */ 2 3 /* 4 * Copyright (c) 2005 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Kentaro A. Kurahone. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 34 * The Regents of the University of California. All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 61 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $ 62 */ 63 64 /* 65 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 66 * 67 * NRL grants permission for redistribution and use in source and binary 68 * forms, with or without modification, of the software and documentation 69 * created at NRL provided that the following conditions are met: 70 * 71 * 1. Redistributions of source code must retain the above copyright 72 * notice, this list of conditions and the following disclaimer. 73 * 2. Redistributions in binary form must reproduce the above copyright 74 * notice, this list of conditions and the following disclaimer in the 75 * documentation and/or other materials provided with the distribution. 76 * 3. All advertising materials mentioning features or use of this software 77 * must display the following acknowledgements: 78 * This product includes software developed by the University of 79 * California, Berkeley and its contributors. 80 * This product includes software developed at the Information 81 * Technology Division, US Naval Research Laboratory. 82 * 4. Neither the name of the NRL nor the names of its contributors 83 * may be used to endorse or promote products derived from this software 84 * without specific prior written permission. 85 * 86 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 87 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 88 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 89 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 90 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 91 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 92 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 93 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 94 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 95 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 96 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 97 * 98 * The views and conclusions contained in the software and documentation 99 * are those of the authors and should not be interpreted as representing 100 * official policies, either expressed or implied, of the US Naval 101 * Research Laboratory (NRL). 102 */ 103 104 #include <sys/cdefs.h> 105 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.24 2008/04/28 20:24:09 martin Exp $"); 106 107 #include "opt_inet.h" 108 #include "opt_ipsec.h" 109 #include "opt_inet_csum.h" 110 #include "opt_tcp_debug.h" 111 #include "opt_ddb.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/malloc.h> 116 #include <sys/mbuf.h> 117 #include <sys/protosw.h> 118 #include <sys/socket.h> 119 #include <sys/socketvar.h> 120 #include <sys/errno.h> 121 #include <sys/syslog.h> 122 #include <sys/pool.h> 123 #include <sys/domain.h> 124 #include <sys/kernel.h> 125 126 #include <net/if.h> 127 #include <net/route.h> 128 #include <net/if_types.h> 129 130 #include <netinet/in.h> 131 #include <netinet/in_systm.h> 132 #include <netinet/ip.h> 133 #include <netinet/in_pcb.h> 134 #include <netinet/in_var.h> 135 #include <netinet/ip_var.h> 136 137 #ifdef INET6 138 #ifndef INET 139 #include <netinet/in.h> 140 #endif 141 #include <netinet/ip6.h> 142 #include <netinet6/ip6_var.h> 143 #include <netinet6/in6_pcb.h> 144 #include <netinet6/ip6_var.h> 145 #include <netinet6/in6_var.h> 146 #include <netinet/icmp6.h> 147 #include <netinet6/nd6.h> 148 #endif 149 150 #ifndef INET6 151 /* always need ip6.h for IP6_EXTHDR_GET */ 152 #include <netinet/ip6.h> 153 #endif 154 155 #include <netinet/tcp.h> 156 #include <netinet/tcp_fsm.h> 157 #include <netinet/tcp_seq.h> 158 #include <netinet/tcp_timer.h> 159 #include <netinet/tcp_var.h> 160 #include <netinet/tcpip.h> 161 #include <netinet/tcp_debug.h> 162 163 #include <machine/stdarg.h> 164 165 /* SACK block pool. */ 166 static POOL_INIT(sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl", 167 NULL, IPL_SOFTNET); 168 169 static struct sackhole * 170 sack_allochole(struct tcpcb *tp) 171 { 172 struct sackhole *hole; 173 174 if (tp->snd_numholes >= tcp_sack_tp_maxholes || 175 tcp_sack_globalholes >= tcp_sack_globalmaxholes) { 176 return NULL; 177 } 178 hole = pool_get(&sackhole_pool, PR_NOWAIT); 179 if (hole == NULL) { 180 return NULL; 181 } 182 tp->snd_numholes++; 183 tcp_sack_globalholes++; 184 185 return hole; 186 } 187 188 static struct sackhole * 189 sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end, 190 struct sackhole *prev) 191 { 192 struct sackhole *hole; 193 194 hole = sack_allochole(tp); 195 if (hole == NULL) { 196 return NULL; 197 } 198 hole->start = hole->rxmit = start; 199 hole->end = end; 200 if (prev != NULL) { 201 TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q); 202 } else { 203 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q); 204 } 205 return hole; 206 } 207 208 static struct sackhole * 209 sack_removehole(struct tcpcb *tp, struct sackhole *hole) 210 { 211 struct sackhole *next; 212 213 next = TAILQ_NEXT(hole, sackhole_q); 214 tp->snd_numholes--; 215 tcp_sack_globalholes--; 216 TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q); 217 pool_put(&sackhole_pool, hole); 218 219 return next; 220 } 221 222 void 223 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len) 224 { 225 if (TCP_SACK_ENABLED(tp)) { 226 tp->rcv_dsack_block.left = seq; 227 tp->rcv_dsack_block.right = seq + len; 228 tp->rcv_sack_flags |= TCPSACK_HAVED; 229 } 230 } 231 232 void 233 tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp, 234 int optlen) 235 { 236 struct sackblk 237 t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)]; 238 struct sackblk *sack = NULL; 239 struct sackhole *cur = NULL; 240 struct sackhole *tmp = NULL; 241 const char *lp = cp + 2; 242 int i, j, num_sack_blks; 243 tcp_seq left, right, acked; 244 245 /* 246 * If we aren't processing SACK responses, this is not an ACK 247 * or the peer sends us a sack option with invalid length, don't 248 * update the scoreboard. 249 */ 250 if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) || 251 (optlen % 8 != 2 || optlen < 10)) { 252 return; 253 } 254 255 /* 256 * If we don't want any SACK holes to be allocated, just return. 257 */ 258 if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) { 259 return; 260 } 261 262 /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */ 263 if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max)) 264 return; 265 266 /* 267 * Extract SACK blocks. 268 * 269 * Note that t_sack_block is sorted so that we only need to do 270 * one pass over the sequence number space. (SACK "fast-path") 271 */ 272 num_sack_blks = optlen / 8; 273 acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una; 274 for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) { 275 memcpy(&left, lp, sizeof(uint32_t)); 276 memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t)); 277 left = ntohl(left); 278 right = ntohl(right); 279 280 if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) || 281 SEQ_GEQ(left, right)) { 282 /* SACK entry that's old, or invalid. */ 283 i--; 284 num_sack_blks--; 285 continue; 286 } 287 288 /* Insertion sort. */ 289 for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left); 290 j--) { 291 t_sack_block[j].left = t_sack_block[j - 1].left; 292 t_sack_block[j].right = t_sack_block[j - 1].right; 293 } 294 t_sack_block[j].left = left; 295 t_sack_block[j].right = right; 296 } 297 298 /* Update the scoreboard. */ 299 cur = TAILQ_FIRST(&tp->snd_holes); 300 for (i = 0; i < num_sack_blks; i++) { 301 sack = &t_sack_block[i]; 302 /* 303 * FACK TCP. Update snd_fack so we can enter Fast 304 * Recovery early. 305 */ 306 if (SEQ_GEQ(sack->right, tp->snd_fack)) 307 tp->snd_fack = sack->right; 308 309 if (TAILQ_EMPTY(&tp->snd_holes)) { 310 /* First hole. */ 311 cur = sack_inserthole(tp, th->th_ack, sack->left, NULL); 312 if (cur == NULL) { 313 /* ENOBUFS, bail out*/ 314 return; 315 } 316 tp->rcv_lastsack = sack->right; 317 continue; /* With next sack block */ 318 } 319 320 /* Go through the list of holes. */ 321 while (cur) { 322 if (SEQ_LEQ(sack->right, cur->start)) 323 /* SACKs data before the current hole */ 324 break; /* No use going through more holes */ 325 326 if (SEQ_GEQ(sack->left, cur->end)) { 327 /* SACKs data beyond the current hole */ 328 cur = TAILQ_NEXT(cur, sackhole_q); 329 continue; 330 } 331 332 if (SEQ_LEQ(sack->left, cur->start)) { 333 /* Data acks at least the beginning of hole */ 334 if (SEQ_GEQ(sack->right, cur->end)) { 335 /* Acks entire hole, so delete hole */ 336 cur = sack_removehole(tp, cur); 337 break; 338 } 339 340 /* Otherwise, move start of hole forward */ 341 cur->start = sack->right; 342 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); 343 break; 344 } 345 346 if (SEQ_GEQ(sack->right, cur->end)) { 347 /* Move end of hole backward. */ 348 cur->end = sack->left; 349 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 350 cur = TAILQ_NEXT(cur, sackhole_q); 351 break; 352 } 353 354 if (SEQ_LT(cur->start, sack->left) && 355 SEQ_GT(cur->end, sack->right)) { 356 /* 357 * ACKs some data in middle of a hole; need to 358 * split current hole 359 */ 360 tmp = sack_inserthole(tp, sack->right, cur->end, 361 cur); 362 if (tmp == NULL) { 363 return; 364 } 365 tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start); 366 cur->end = sack->left; 367 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); 368 cur = tmp; 369 break; 370 } 371 } 372 373 /* At this point, we have reached the tail of the list. */ 374 if (SEQ_LT(tp->rcv_lastsack, sack->left)) { 375 /* 376 * Need to append new hole at end. 377 */ 378 cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left, 379 NULL); 380 if (cur == NULL) { 381 return; 382 } 383 } 384 if (SEQ_LT(tp->rcv_lastsack, sack->right)) { 385 tp->rcv_lastsack = sack->right; 386 } 387 } 388 } 389 390 void 391 tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th) 392 { 393 /* Max because this could be an older ack that just arrived. */ 394 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? 395 th->th_ack : tp->snd_una; 396 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 397 398 while (cur) { 399 if (SEQ_LEQ(cur->end, lastack)) { 400 cur = sack_removehole(tp, cur); 401 } else if (SEQ_LT(cur->start, lastack)) { 402 cur->start = lastack; 403 if (SEQ_LT(cur->rxmit, cur->start)) 404 cur->rxmit = cur->start; 405 break; 406 } else 407 break; 408 } 409 } 410 411 void 412 tcp_free_sackholes(struct tcpcb *tp) 413 { 414 struct sackhole *sack; 415 416 /* Free up the SACK hole list. */ 417 while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) { 418 sack_removehole(tp, sack); 419 } 420 KASSERT(tp->snd_numholes == 0); 421 } 422 423 /* 424 * Implements the SACK response to a new ack, checking for partial acks 425 * in fast recovery. 426 */ 427 void 428 tcp_sack_newack(struct tcpcb *tp, const struct tcphdr *th) 429 { 430 if (tp->t_partialacks < 0) { 431 /* 432 * Not in fast recovery. Reset the duplicate ack 433 * counter. 434 */ 435 tp->t_dupacks = 0; 436 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { 437 /* 438 * Partial ack handling within a sack recovery episode. 439 * Keeping this very simple for now. When a partial ack 440 * is received, force snd_cwnd to a value that will allow 441 * the sender to transmit no more than 2 segments. 442 * If necessary, a fancier scheme can be adopted at a 443 * later point, but for now, the goal is to prevent the 444 * sender from bursting a large amount of data in the midst 445 * of sack recovery. 446 */ 447 int num_segs = 1; 448 int sack_bytes_rxmt = 0; 449 450 tp->t_partialacks++; 451 TCP_TIMER_DISARM(tp, TCPT_REXMT); 452 tp->t_rtttime = 0; 453 454 /* 455 * send one or 2 segments based on how much new data was acked 456 */ 457 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) 458 num_segs = 2; 459 (void)tcp_sack_output(tp, &sack_bytes_rxmt); 460 tp->snd_cwnd = sack_bytes_rxmt + 461 (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_segsz; 462 tp->t_flags |= TF_ACKNOW; 463 (void) tcp_output(tp); 464 } else { 465 /* 466 * Complete ack, inflate the congestion window to 467 * ssthresh and exit fast recovery. 468 * 469 * Window inflation should have left us with approx. 470 * snd_ssthresh outstanding data. But in case we 471 * would be inclined to send a burst, better to do 472 * it via the slow start mechanism. 473 */ 474 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) 475 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) 476 + tp->t_segsz; 477 else 478 tp->snd_cwnd = tp->snd_ssthresh; 479 tp->t_partialacks = -1; 480 tp->t_dupacks = 0; 481 if (SEQ_GT(th->th_ack, tp->snd_fack)) 482 tp->snd_fack = th->th_ack; 483 } 484 } 485 486 /* 487 * Returns pointer to a sackhole if there are any pending retransmissions; 488 * NULL otherwise. 489 */ 490 struct sackhole * 491 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) 492 { 493 struct sackhole *cur = NULL; 494 495 if (!TCP_SACK_ENABLED(tp)) 496 return (NULL); 497 498 *sack_bytes_rexmt = 0; 499 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { 500 if (SEQ_LT(cur->rxmit, cur->end)) { 501 if (SEQ_LT(cur->rxmit, tp->snd_una)) { 502 /* old SACK hole */ 503 continue; 504 } 505 *sack_bytes_rexmt += (cur->rxmit - cur->start); 506 break; 507 } 508 *sack_bytes_rexmt += (cur->rxmit - cur->start); 509 } 510 511 return (cur); 512 } 513 514 /* 515 * After a timeout, the SACK list may be rebuilt. This SACK information 516 * should be used to avoid retransmitting SACKed data. This function 517 * traverses the SACK list to see if snd_nxt should be moved forward. 518 */ 519 void 520 tcp_sack_adjust(struct tcpcb *tp) 521 { 522 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); 523 struct sackhole *n = NULL; 524 525 if (TAILQ_EMPTY(&tp->snd_holes)) 526 return; /* No holes */ 527 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) 528 return; /* We're already beyond any SACKed blocks */ 529 530 /* 531 * Two cases for which we want to advance snd_nxt: 532 * i) snd_nxt lies between end of one hole and beginning of another 533 * ii) snd_nxt lies between end of last hole and rcv_lastsack 534 */ 535 while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) { 536 if (SEQ_LT(tp->snd_nxt, cur->end)) 537 return; 538 if (SEQ_GEQ(tp->snd_nxt, n->start)) 539 cur = n; 540 else { 541 tp->snd_nxt = n->start; 542 return; 543 } 544 } 545 if (SEQ_LT(tp->snd_nxt, cur->end)) 546 return; 547 tp->snd_nxt = tp->rcv_lastsack; 548 549 return; 550 } 551 552 int 553 tcp_sack_numblks(const struct tcpcb *tp) 554 { 555 int numblks; 556 557 if (!TCP_SACK_ENABLED(tp)) { 558 return 0; 559 } 560 561 numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) + 562 tp->t_segqlen; 563 564 if (numblks == 0) { 565 return 0; 566 } 567 568 if (numblks > TCP_SACK_MAX) { 569 numblks = TCP_SACK_MAX; 570 } 571 572 return numblks; 573 } 574 575 #if defined(DDB) 576 void sack_dump(const struct tcpcb *); 577 578 void 579 sack_dump(const struct tcpcb *tp) 580 { 581 const struct sackhole *cur; 582 583 printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n", 584 tp->snd_una, tp->snd_max); 585 printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n", 586 tp->rcv_lastsack, tp->snd_fack); 587 printf("numholes=%d\n", tp->snd_numholes); 588 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { 589 printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n", 590 cur->start, cur->end, cur->rxmit); 591 } 592 } 593 #endif /* defined(DDB) */ 594