1 /* $NetBSD: tcp_congctl.c,v 1.20 2015/08/24 22:21:26 pooka Exp $ */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation 9 * Facility, NASA Ames Research Center. 10 * This code is derived from software contributed to The NetBSD Foundation 11 * by Charles M. Hannum. 12 * This code is derived from software contributed to The NetBSD Foundation 13 * by Rui Paulo. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 25 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 26 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 27 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 28 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 30 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 31 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 32 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 33 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 * POSSIBILITY OF SUCH DAMAGE. 35 */ 36 37 /* 38 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
39 * All rights reserved. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the project nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 */ 65 66 /* 67 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 68 * 69 * NRL grants permission for redistribution and use in source and binary 70 * forms, with or without modification, of the software and documentation 71 * created at NRL provided that the following conditions are met: 72 * 73 * 1. Redistributions of source code must retain the above copyright 74 * notice, this list of conditions and the following disclaimer. 75 * 2. 
Redistributions in binary form must reproduce the above copyright 76 * notice, this list of conditions and the following disclaimer in the 77 * documentation and/or other materials provided with the distribution. 78 * 3. All advertising materials mentioning features or use of this software 79 * must display the following acknowledgements: 80 * This product includes software developed by the University of 81 * California, Berkeley and its contributors. 82 * This product includes software developed at the Information 83 * Technology Division, US Naval Research Laboratory. 84 * 4. Neither the name of the NRL nor the names of its contributors 85 * may be used to endorse or promote products derived from this software 86 * without specific prior written permission. 87 * 88 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 89 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 90 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 91 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 92 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 93 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 94 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 95 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 96 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 97 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 98 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 99 * 100 * The views and conclusions contained in the software and documentation 101 * are those of the authors and should not be interpreted as representing 102 * official policies, either expressed or implied, of the US Naval 103 * Research Laboratory (NRL). 104 */ 105 106 /* 107 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 108 * The Regents of the University of California. All rights reserved. 
109 * 110 * Redistribution and use in source and binary forms, with or without 111 * modification, are permitted provided that the following conditions 112 * are met: 113 * 1. Redistributions of source code must retain the above copyright 114 * notice, this list of conditions and the following disclaimer. 115 * 2. Redistributions in binary form must reproduce the above copyright 116 * notice, this list of conditions and the following disclaimer in the 117 * documentation and/or other materials provided with the distribution. 118 * 3. Neither the name of the University nor the names of its contributors 119 * may be used to endorse or promote products derived from this software 120 * without specific prior written permission. 121 * 122 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 123 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 124 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 125 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 126 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 127 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 128 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 129 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 130 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 131 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 132 * SUCH DAMAGE. 
133 * 134 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 135 */ 136 137 #include <sys/cdefs.h> 138 __KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.20 2015/08/24 22:21:26 pooka Exp $"); 139 140 #ifdef _KERNEL_OPT 141 #include "opt_inet.h" 142 #include "opt_tcp_debug.h" 143 #include "opt_tcp_congctl.h" 144 #endif 145 146 #include <sys/param.h> 147 #include <sys/systm.h> 148 #include <sys/malloc.h> 149 #include <sys/mbuf.h> 150 #include <sys/protosw.h> 151 #include <sys/socket.h> 152 #include <sys/socketvar.h> 153 #include <sys/errno.h> 154 #include <sys/syslog.h> 155 #include <sys/pool.h> 156 #include <sys/domain.h> 157 #include <sys/kernel.h> 158 #include <sys/mutex.h> 159 160 #include <net/if.h> 161 #include <net/route.h> 162 163 #include <netinet/in.h> 164 #include <netinet/in_systm.h> 165 #include <netinet/ip.h> 166 #include <netinet/in_pcb.h> 167 #include <netinet/in_var.h> 168 #include <netinet/ip_var.h> 169 170 #ifdef INET6 171 #ifndef INET 172 #include <netinet/in.h> 173 #endif 174 #include <netinet/ip6.h> 175 #include <netinet6/ip6_var.h> 176 #include <netinet6/in6_pcb.h> 177 #include <netinet6/ip6_var.h> 178 #include <netinet6/in6_var.h> 179 #include <netinet/icmp6.h> 180 #include <netinet6/nd6.h> 181 #endif 182 183 #include <netinet/tcp.h> 184 #include <netinet/tcp_fsm.h> 185 #include <netinet/tcp_seq.h> 186 #include <netinet/tcp_timer.h> 187 #include <netinet/tcp_var.h> 188 #include <netinet/tcpip.h> 189 #include <netinet/tcp_congctl.h> 190 #ifdef TCP_DEBUG 191 #include <netinet/tcp_debug.h> 192 #endif 193 194 /* 195 * TODO: 196 * consider separating the actual implementations in another file. 
197 */ 198 199 static void tcp_common_congestion_exp(struct tcpcb *, int, int); 200 201 static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *); 202 static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *); 203 static void tcp_reno_slow_retransmit(struct tcpcb *); 204 static void tcp_reno_fast_retransmit_newack(struct tcpcb *, 205 const struct tcphdr *); 206 static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *); 207 static void tcp_reno_congestion_exp(struct tcpcb *tp); 208 209 static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *); 210 static void tcp_newreno_fast_retransmit_newack(struct tcpcb *, 211 const struct tcphdr *); 212 static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *); 213 214 static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *); 215 static void tcp_cubic_slow_retransmit(struct tcpcb *tp); 216 static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *); 217 static void tcp_cubic_congestion_exp(struct tcpcb *); 218 219 static void tcp_congctl_fillnames(void); 220 221 extern int tcprexmtthresh; 222 223 MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures"); 224 225 /* currently selected global congestion control */ 226 char tcp_congctl_global_name[TCPCC_MAXLEN]; 227 228 /* available global congestion control algorithms */ 229 char tcp_congctl_avail[10 * TCPCC_MAXLEN]; 230 231 /* 232 * Used to list the available congestion control algorithms. 233 */ 234 TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd = 235 TAILQ_HEAD_INITIALIZER(tcp_congctlhd); 236 237 static struct tcp_congctlent * tcp_congctl_global; 238 239 static kmutex_t tcp_congctl_mtx; 240 241 void 242 tcp_congctl_init(void) 243 { 244 int r __diagused; 245 246 mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE); 247 248 /* Base algorithms. 
*/ 249 r = tcp_congctl_register("reno", &tcp_reno_ctl); 250 KASSERT(r == 0); 251 r = tcp_congctl_register("newreno", &tcp_newreno_ctl); 252 KASSERT(r == 0); 253 r = tcp_congctl_register("cubic", &tcp_cubic_ctl); 254 KASSERT(r == 0); 255 256 /* NewReno is the default. */ 257 #ifndef TCP_CONGCTL_DEFAULT 258 #define TCP_CONGCTL_DEFAULT "newreno" 259 #endif 260 261 r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT); 262 KASSERT(r == 0); 263 } 264 265 /* 266 * Register a congestion algorithm and select it if we have none. 267 */ 268 int 269 tcp_congctl_register(const char *name, const struct tcp_congctl *tcc) 270 { 271 struct tcp_congctlent *ntcc, *tccp; 272 273 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) 274 if (!strcmp(name, tccp->congctl_name)) { 275 /* name already registered */ 276 return EEXIST; 277 } 278 279 ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO); 280 281 strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1); 282 ntcc->congctl_ctl = tcc; 283 284 TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent); 285 tcp_congctl_fillnames(); 286 287 if (TAILQ_FIRST(&tcp_congctlhd) == ntcc) 288 tcp_congctl_select(NULL, name); 289 290 return 0; 291 } 292 293 int 294 tcp_congctl_unregister(const char *name) 295 { 296 struct tcp_congctlent *tccp, *rtccp; 297 unsigned int size; 298 299 rtccp = NULL; 300 size = 0; 301 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 302 if (!strcmp(name, tccp->congctl_name)) 303 rtccp = tccp; 304 size++; 305 } 306 307 if (!rtccp) 308 return ENOENT; 309 310 if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt) 311 return EBUSY; 312 313 TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent); 314 free(rtccp, M_TCPCONGCTL); 315 tcp_congctl_fillnames(); 316 317 return 0; 318 } 319 320 /* 321 * Select a congestion algorithm by name. 
322 */ 323 int 324 tcp_congctl_select(struct tcpcb *tp, const char *name) 325 { 326 struct tcp_congctlent *tccp, *old_tccp, *new_tccp; 327 bool old_found, new_found; 328 329 KASSERT(name); 330 331 old_found = (tp == NULL || tp->t_congctl == NULL); 332 old_tccp = NULL; 333 new_found = false; 334 new_tccp = NULL; 335 336 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 337 if (!old_found && tccp->congctl_ctl == tp->t_congctl) { 338 old_tccp = tccp; 339 old_found = true; 340 } 341 342 if (!new_found && !strcmp(name, tccp->congctl_name)) { 343 new_tccp = tccp; 344 new_found = true; 345 } 346 347 if (new_found && old_found) { 348 if (tp) { 349 mutex_enter(&tcp_congctl_mtx); 350 if (old_tccp) 351 old_tccp->congctl_refcnt--; 352 tp->t_congctl = new_tccp->congctl_ctl; 353 new_tccp->congctl_refcnt++; 354 mutex_exit(&tcp_congctl_mtx); 355 } else { 356 tcp_congctl_global = new_tccp; 357 strlcpy(tcp_congctl_global_name, 358 new_tccp->congctl_name, 359 sizeof(tcp_congctl_global_name) - 1); 360 } 361 return 0; 362 } 363 } 364 365 return EINVAL; 366 } 367 368 void 369 tcp_congctl_release(struct tcpcb *tp) 370 { 371 struct tcp_congctlent *tccp; 372 373 KASSERT(tp->t_congctl); 374 375 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 376 if (tccp->congctl_ctl == tp->t_congctl) { 377 tccp->congctl_refcnt--; 378 return; 379 } 380 } 381 } 382 383 /* 384 * Returns the name of a congestion algorithm. 
385 */ 386 const char * 387 tcp_congctl_bystruct(const struct tcp_congctl *tcc) 388 { 389 struct tcp_congctlent *tccp; 390 391 KASSERT(tcc); 392 393 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) 394 if (tccp->congctl_ctl == tcc) 395 return tccp->congctl_name; 396 397 return NULL; 398 } 399 400 static void 401 tcp_congctl_fillnames(void) 402 { 403 struct tcp_congctlent *tccp; 404 const char *delim = " "; 405 406 tcp_congctl_avail[0] = '\0'; 407 TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) { 408 strlcat(tcp_congctl_avail, tccp->congctl_name, 409 sizeof(tcp_congctl_avail) - 1); 410 if (TAILQ_NEXT(tccp, congctl_ent)) 411 strlcat(tcp_congctl_avail, delim, 412 sizeof(tcp_congctl_avail) - 1); 413 } 414 415 } 416 417 /* ------------------------------------------------------------------------ */ 418 419 /* 420 * Common stuff 421 */ 422 423 /* Window reduction (1-beta) for [New]Reno: 0.5 */ 424 #define RENO_BETAA 1 425 #define RENO_BETAB 2 426 /* Window reduction (1-beta) for Cubic: 0.8 */ 427 #define CUBIC_BETAA 4 428 #define CUBIC_BETAB 5 429 /* Draft Rhee Section 4.1 */ 430 #define CUBIC_CA 4 431 #define CUBIC_CB 10 432 433 static void 434 tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab) 435 { 436 u_int win; 437 438 /* 439 * Reduce the congestion window and the slow start threshold. 440 */ 441 win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz; 442 if (win < 2) 443 win = 2; 444 445 tp->snd_ssthresh = win * tp->t_segsz; 446 tp->snd_recover = tp->snd_max; 447 tp->snd_cwnd = tp->snd_ssthresh; 448 449 /* 450 * When using TCP ECN, notify the peer that 451 * we reduced the cwnd. 452 */ 453 if (TCP_ECN_ALLOWED(tp)) 454 tp->t_flags |= TF_ECN_SND_CWR; 455 } 456 457 458 /* ------------------------------------------------------------------------ */ 459 460 /* 461 * TCP/Reno congestion control. 
 */

/* Reno multiplicative decrease: halve the window (beta = 0.5). */
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{

	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}

/*
 * Retransmit the lost segment and set up fast recovery state.
 * Shared by Reno, NewReno and CUBIC (after each has applied its own
 * window reduction).  Always returns 0 (fast retransmit was done).
 */
static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * Dup acks mean that packets have left the
	 * network (they're now cached at the receiver)
	 * so bump cwnd by the amount in the receiver
	 * to keep a constant cwnd packets in the
	 * network.
	 *
	 * If we are using TCP/SACK, then enter
	 * Fast Recovery if the receiver SACKs
	 * data that is tcprexmtthresh * MSS
	 * bytes past the last ACKed segment,
	 * irrespective of the number of DupAcks.
	 */

	tcp_seq onxt = tp->snd_nxt;

	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	/* Invalidate the RTT sample in progress; it spans the loss. */
	tp->t_rtttime = 0;
	if (TCP_SACK_ENABLED(tp)) {
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		/* One segment of cwnd so tcp_output() sends one rexmit. */
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	/*
	 * Non-SACK: temporarily clamp cwnd to one segment and point
	 * snd_nxt at the hole so tcp_output() retransmits exactly the
	 * missing segment, then inflate cwnd by the dupack count and
	 * restore snd_nxt.  The statement order here is load-bearing.
	 */
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;

	return 0;
}

/*
 * Reno entry to fast retransmit: cut the window, then retransmit.
 * Returns 0 (fast retransmit performed).
 */
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	/*
	 * We know we're losing at the current
	 * window size so do congestion avoidance
	 * (set ssthresh to half the current window
	 * and pull our congestion window back to
	 * the new ssthresh).
	 */

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}

/*
 * Response to a retransmission timeout: restart from slow start with
 * a one-segment loss window and halved ssthresh.
 */
static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
	u_int win;

	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack. This makes the window
	 * size increase exponentially with time. If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately. To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshhold size.
	 * For a threshhold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss. We don't allow the threshhold
	 * to go below this.)
	 */

	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
	if (win < 2)
		win = 2;
	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->snd_ssthresh = win * tp->t_segsz;
	/* t_partialacks < 0 means "not in fast recovery". */
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

/*
 * New ACK while (possibly) in fast recovery.  Reno exits recovery on
 * any new ACK; there is no partial-ACK handling (cf. NewReno).
 */
static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
    const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else {
		/*
		 * Clamp the congestion window to the crossover point and
		 * exit fast recovery.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

/*
 * Window growth on a new ACK: slow start below ssthresh, congestion
 * avoidance above it, optionally byte-counted per RFC 3465 (ABC).
 */
static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			/* Congestion avoidance: one segsz per cwnd acked. */
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * use 2*SMSS or 1*SMSS for the "L" param,
			 * depending on sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = min(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	/* Never grow cwnd beyond the maximum scaled window. */
	tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
}

/* Reno algorithm vector. */
const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * TCP/NewReno Congestion control.
 */

/*
 * NewReno fast retransmit: suppress false fast retransmits that can
 * follow a timeout (RFC 3782 "bugfix"), otherwise behave like Reno.
 * Returns 1 if fast retransmit was suppressed, 0 if performed.
 */
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/*
		 * False fast retransmit after timeout.
		 * Do not enter fast recovery
		 */
		tp->t_dupacks = 0;
		return 1;
	}
	/*
	 * Fast retransmit is same as reno.
	 */
	return tcp_reno_fast_retransmit(tp, th);
}

/*
 * Implement the NewReno response to a new ack, checking for partial acks in
 * fast recovery.
 */
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
		/*
		 * This is a partial ack.  Retransmit the first unacknowledged
		 * segment and deflate the congestion window by the amount of
		 * acknowledged data.  Do not exit fast recovery.
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		int sack_num_segs = 1, sack_bytes_rxmt = 0;

		/*
		 * snd_una has not yet been updated and the socket's send
		 * buffer has not yet drained off the ACK'd data, so we
		 * have to leave snd_una as it was to get the correct data
		 * offset in tcp_output().
		 */
		tp->t_partialacks++;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		tp->snd_nxt = th->th_ack;

		if (TCP_SACK_ENABLED(tp)) {
			/*
			 * Partial ack handling within a sack recovery episode.
			 * Keeping this very simple for now. When a partial ack
			 * is received, force snd_cwnd to a value that will
			 * allow the sender to transmit no more than 2 segments.
			 * If necessary, a fancier scheme can be adopted at a
			 * later point, but for now, the goal is to prevent the
			 * sender from bursting a large amount of data in the
			 * midst of sack recovery.
			 */

			/*
			 * send one or 2 segments based on how much
			 * new data was acked
			 */
			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
				sack_num_segs = 2;
			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
			tp->snd_cwnd = sack_bytes_rxmt +
			    (tp->snd_nxt - tp->sack_newdata) +
			    sack_num_segs * tp->t_segsz;
			tp->t_flags |= TF_ACKNOW;
			(void) tcp_output(tp);
		} else {
			/*
			 * Set snd_cwnd to one segment beyond ACK'd offset
			 * snd_una is not yet updated when we're called
			 */
			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
			(void) tcp_output(tp);
			/* Restore the saved window, then deflate it below. */
			tp->snd_cwnd = ocwnd;
			if (SEQ_GT(onxt, tp->snd_nxt))
				tp->snd_nxt = onxt;
			/*
			 * Partial window deflation.  Relies on fact that
			 * tp->snd_una not updated yet.
			 */
			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
			    tp->t_segsz);
		}
	} else {
		/*
		 * Complete ack.  Inflate the congestion window to ssthresh
		 * and exit fast recovery.
		 *
		 * Window inflation should have left us with approx.
		 * snd_ssthresh outstanding data.  But in case we
		 * would be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
			    + tp->t_segsz;
		else
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

/*
 * NewReno window growth: identical to Reno except that the window is
 * frozen while fast recovery is in progress.
 */
static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * If we are still in fast recovery (meaning we are using
	 * NewReno and we have only received partial acks), do not
	 * inflate the window yet.
	 */
	if (tp->t_partialacks < 0)
		tcp_reno_newack(tp, th);
}


/* NewReno algorithm vector; shares slow retransmit/cong_exp with Reno. */
const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
 */

/* Cubic prototypes */
static void	tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t	tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t	tcp_cubic_cbrt(uint32_t);
static ulong	tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);

/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
 * Set congestion timer to now
 */
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	/* Record the last-congestion time in milliseconds of uptime. */
	getmicrouptime(&now_timeval);
	tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
	    now_timeval.tv_usec / 1000;
}

/*
 * miliseconds from last congestion
 */
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
	    tp->snd_cubic_ctime;
}

/*
 * Approximate cubic root
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	int i, rounds = CBRT_ROUNDS;
	uint64_t x = v / 3;

	/* We fail to calculate correct for small numbers */
	if (v == 0)
		return 0;
	else if (v < 4)
		return 1;

	/*
	 * largest x that 2*x^3+3*x fits 64bit
	 * Avoid overflow for a time cost
	 */
	if (x > 2097151)
		rounds += 10;

	/*
	 * Newton-Raphson iteration for x^3 = v; the second form trades
	 * precision per round for overflow safety on large x (hence the
	 * extra rounds above).
	 */
	for (i = 0; i < rounds; i++)
		if (rounds == CBRT_ROUNDS)
			x = (v + 2 * x * x * x) / (3 * x * x);
		else
			/* Avoid overflow */
			x = v / (3 * x * x) + 2 * x / 3;

	return (uint32_t)x;
}

/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
	uint32_t K;
	long tK3;

	/* Section 3.1 Eq. 2 */
	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
	    CUBIC_CB / CUBIC_CA);
	/* (t-K)^3 - not clear why is the measure unit mattering */
	tK3 = (long)(ms_elapsed + rtt) - (long)K;
	tK3 = tK3 * tK3 * tK3;

	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}

/*
 * CUBIC response to congestion: remember when and at what window the
 * loss happened (WMax, with fast convergence), then shrink cwnd by
 * the CUBIC beta via the common helper.
 */
static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	/* Never let WMax fall below one segment. */
	tp->snd_cubic_wmax = max(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}

/*
 * CUBIC fast retransmit: skip false fast retransmits after a timeout
 * (as NewReno does), otherwise mark WMax and fall back to the common
 * retransmit path.  Returns 1 if suppressed, 0 if performed.
 */
static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/* See newreno */
		tp->t_dupacks = 0;
		return 1;
	}

	/*
	 * mark WMax
	 */
	tcp_cubic_congestion_exp(tp);

	/* Do fast retransmit */
	return tcp_reno_do_fast_retransmit(tp, th);
}

/*
 * CUBIC window growth on new ACK: in congestion avoidance with a
 * usable RTT, grow toward W(t) from Draft Rhee Eq. 1, but never below
 * the "TCP friendly" window; otherwise defer to NewReno.
 */
static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;
	u_long w_tcp;

	/* Congestion avoidance and not in fast recovery and usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is 1/32 units of slow ticks
	     * converting it in ms would be equal to
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/* Compute W_tcp(t) */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/* Not in TCP friendly mode */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* friendly TCP mode */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = max(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use New Reno */
		tcp_newreno_newack(tp, th);
	}
}

/*
 * CUBIC response to a retransmission timeout: mark the congestion
 * point, then restart from a one-segment loss window.
 */
static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

/* CUBIC algorithm vector; reuses NewReno's partial-ACK handling. */
const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};