/*	$NetBSD: tcp_congctl.c,v 1.24 2018/03/29 07:46:43 maxv Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Rui Paulo.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.24 2018/03/29 07:46:43 maxv Exp $");

#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * TODO:
 * consider separating the actual implementations into another file.
 */
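
/*
 * Overview (sketch, see tcp_input.c and tcp_timer.c for the exact call
 * sites): each algorithm below exports a struct tcp_congctl of five
 * hooks, dispatched roughly as follows:
 *
 *	fast_retransmit		duplicate-ACK threshold (or SACK equivalent)
 *	slow_retransmit		retransmission timeout
 *	fast_retransmit_newack	a new ACK arrives while in fast recovery
 *	newack			a new ACK arrives; grows the window
 *	cong_exp		explicit congestion signal, e.g. an ECN echo
 */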

static void tcp_common_congestion_exp(struct tcpcb *, int, int);

static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);

static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);

static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);

static void tcp_congctl_fillnames(void);

extern int tcprexmtthresh;

MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");

/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];

/* available global congestion control algorithms */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];

/*
 * Used to list the available congestion control algorithms.
 */
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
    TAILQ_HEAD_INITIALIZER(tcp_congctlhd);

static struct tcp_congctlent * tcp_congctl_global;

static kmutex_t tcp_congctl_mtx;

void
tcp_congctl_init(void)
{
	int r __diagused;

	mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);

	/* Base algorithms. */
	r = tcp_congctl_register("reno", &tcp_reno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
	KASSERT(r == 0);

	/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif

	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
	KASSERT(r == 0);
}
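
/*
 * A third-party algorithm (hypothetical sketch, not part of this file)
 * plugs in the same way the base algorithms above do: fill in a struct
 * tcp_congctl with the five hooks and register it under a unique name:
 *
 *	extern const struct tcp_congctl tcp_vegas_ctl;	// hypothetical
 *
 *	if (tcp_congctl_register("vegas", &tcp_vegas_ctl) == EEXIST)
 *		printf("vegas: name already taken\n");
 *	...
 *	if (tcp_congctl_unregister("vegas") == EBUSY)
 *		;	// still selected globally or referenced by a pcb
 */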

/*
 * Register a congestion algorithm and select it if we have none.
 */
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *ntcc, *tccp;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (!strcmp(name, tccp->congctl_name)) {
			/* name already registered */
			return EEXIST;
		}

	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);

	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
	ntcc->congctl_ctl = tcc;

	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
	tcp_congctl_fillnames();

	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
		tcp_congctl_select(NULL, name);

	return 0;
}

int
tcp_congctl_unregister(const char *name)
{
	struct tcp_congctlent *tccp, *rtccp;
	unsigned int size;

	rtccp = NULL;
	size = 0;
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!strcmp(name, tccp->congctl_name))
			rtccp = tccp;
		size++;
	}

	if (!rtccp)
		return ENOENT;

	if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
		return EBUSY;

	TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
	free(rtccp, M_TCPCONGCTL);
	tcp_congctl_fillnames();

	return 0;
}

/*
 * Select a congestion algorithm by name.
 */
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
	struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
	bool old_found, new_found;

	KASSERT(name);

	old_found = (tp == NULL || tp->t_congctl == NULL);
	old_tccp = NULL;
	new_found = false;
	new_tccp = NULL;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
			old_tccp = tccp;
			old_found = true;
		}

		if (!new_found && !strcmp(name, tccp->congctl_name)) {
			new_tccp = tccp;
			new_found = true;
		}

		if (new_found && old_found) {
			if (tp) {
				mutex_enter(&tcp_congctl_mtx);
				if (old_tccp)
					old_tccp->congctl_refcnt--;
				tp->t_congctl = new_tccp->congctl_ctl;
				new_tccp->congctl_refcnt++;
				mutex_exit(&tcp_congctl_mtx);
			} else {
				tcp_congctl_global = new_tccp;
				strlcpy(tcp_congctl_global_name,
				    new_tccp->congctl_name,
				    sizeof(tcp_congctl_global_name) - 1);
			}
			return 0;
		}
	}

	return EINVAL;
}

void
tcp_congctl_release(struct tcpcb *tp)
{
	struct tcp_congctlent *tccp;

	KASSERT(tp->t_congctl);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (tccp->congctl_ctl == tp->t_congctl) {
			tccp->congctl_refcnt--;
			return;
		}
	}
}
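
/*
 * Typical per-connection lifecycle (sketch): a new connection picks up
 * the global default, may be switched later, and must drop its
 * reference when the pcb goes away:
 *
 *	tcp_congctl_select(tp, tcp_congctl_global_name);  // at attach
 *	tcp_congctl_select(tp, "cubic");                  // optional switch
 *	tcp_congctl_release(tp);                          // at detach
 *
 * The global name and the list built by tcp_congctl_fillnames() back
 * the net.inet.tcp.congctl.{selected,available} sysctl nodes.
 */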

/*
 * Returns the name of a congestion algorithm.
 */
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *tccp;

	KASSERT(tcc);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (tccp->congctl_ctl == tcc)
			return tccp->congctl_name;

	return NULL;
}

static void
tcp_congctl_fillnames(void)
{
	struct tcp_congctlent *tccp;
	const char *delim = " ";

	tcp_congctl_avail[0] = '\0';
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		strlcat(tcp_congctl_avail, tccp->congctl_name,
		    sizeof(tcp_congctl_avail) - 1);
		if (TAILQ_NEXT(tccp, congctl_ent))
			strlcat(tcp_congctl_avail, delim,
			    sizeof(tcp_congctl_avail) - 1);
	}
}

/* ------------------------------------------------------------------------ */

/*
 * Common stuff
 */

/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10

static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
	u_int win;

	/*
	 * Reduce the congestion window and the slow start threshold.
	 */
	win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
	if (win < 2)
		win = 2;

	tp->snd_ssthresh = win * tp->t_segsz;
	tp->snd_recover = tp->snd_max;
	tp->snd_cwnd = tp->snd_ssthresh;

	/*
	 * When using TCP ECN, notify the peer that
	 * we reduced the cwnd.
	 */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

/* ------------------------------------------------------------------------ */

/*
 * TCP/Reno congestion control.
 */
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{

	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}

static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * Dup acks mean that packets have left the
	 * network (they're now cached at the receiver)
	 * so bump cwnd by the amount in the receiver
	 * to keep a constant cwnd packets in the
	 * network.
	 *
	 * If we are using TCP/SACK, then enter
	 * Fast Recovery if the receiver SACKs
	 * data that is tcprexmtthresh * MSS
	 * bytes past the last ACKed segment,
	 * irrespective of the number of DupAcks.
	 */

	tcp_seq onxt = tp->snd_nxt;

	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;
	if (TCP_SACK_ENABLED(tp)) {
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;

	return 0;
}
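
/*
 * Worked example of the common reduction in tcp_common_congestion_exp()
 * used by the handlers below (numbers assumed): with t_segsz = 1000
 * bytes and min(snd_wnd, snd_cwnd) = 20000 bytes, Reno (betaa/betab =
 * 1/2) yields win = 20000 * 1 / 2 / 1000 = 10 segments, so ssthresh =
 * cwnd = 10000 bytes; CUBIC (4/5) yields win = 16 segments, a gentler
 * 20% cut.  The integer division also rounds win down to a whole
 * number of segments.
 */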

static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	/*
	 * We know we're losing at the current
	 * window size, so do congestion avoidance:
	 * set ssthresh to half the current window
	 * and pull our congestion window back to
	 * the new ssthresh.
	 */

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
	u_int win;

	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (The minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */

	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
	if (win < 2)
		win = 2;
	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->snd_ssthresh = win * tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
    const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else {
		/*
		 * Clamp the congestion window to the crossover point and
		 * exit fast recovery.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * Use 2*SMSS or 1*SMSS for the "L" param,
			 * depending on the sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = min(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */
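		/*
		 * Numeric sketch (assumed values): with t_segsz = 1460
		 * and cw = 14600 (10 segments) at or above ssthresh,
		 * each ACK adds 1460*1460/14600 = 146 bytes, so cwnd
		 * grows by about one segment per window of ACKs, i.e.
		 * per RTT; below ssthresh every ACK adds a full
		 * segment, roughly doubling cwnd each RTT.
		 */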

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
}

const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * TCP/NewReno Congestion control.
 */
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/*
		 * False fast retransmit after timeout.
		 * Do not enter fast recovery.
		 */
		tp->t_dupacks = 0;
		return 1;
	}
	/*
	 * Fast retransmit is the same as in Reno.
	 */
	return tcp_reno_fast_retransmit(tp, th);
}

/*
 * Implement the NewReno response to a new ack, checking for partial acks in
 * fast recovery.
 */
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
		/*
		 * This is a partial ack.  Retransmit the first unacknowledged
		 * segment and deflate the congestion window by the amount of
		 * acknowledged data.  Do not exit fast recovery.
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		int sack_num_segs = 1, sack_bytes_rxmt = 0;

		/*
		 * snd_una has not yet been updated and the socket's send
		 * buffer has not yet drained off the ACK'd data, so we
		 * have to leave snd_una as it was to get the correct data
		 * offset in tcp_output().
		 */
		tp->t_partialacks++;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;

		if (TCP_SACK_ENABLED(tp)) {
			/*
			 * Partial ack handling within a sack recovery episode.
			 * Keeping this very simple for now.  When a partial ack
			 * is received, force snd_cwnd to a value that will
			 * allow the sender to transmit no more than 2 segments.
			 * If necessary, a fancier scheme can be adopted at a
			 * later point, but for now, the goal is to prevent the
			 * sender from bursting a large amount of data in the
			 * midst of sack recovery.
			 */

			/*
			 * Send one or two segments based on how much
			 * new data was acked.
			 */
			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
				sack_num_segs = 2;
			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
			tp->snd_cwnd = sack_bytes_rxmt +
			    (tp->snd_nxt - tp->sack_newdata) +
			    sack_num_segs * tp->t_segsz;
			tp->t_flags |= TF_ACKNOW;
			(void) tcp_output(tp);
		} else {
			tp->snd_nxt = th->th_ack;
			/*
			 * Set snd_cwnd to one segment beyond the ACK'd
			 * offset.  snd_una is not yet updated when we're
			 * called.
			 */
			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
			(void) tcp_output(tp);
			tp->snd_cwnd = ocwnd;
			if (SEQ_GT(onxt, tp->snd_nxt))
				tp->snd_nxt = onxt;
			/*
			 * Partial window deflation.  Relies on the fact
			 * that tp->snd_una is not updated yet.
			 */
			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
			    tp->t_segsz);
		}
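		/*
		 * Deflation sketch (assumed numbers): if this partial
		 * ack covers 3000 bytes (th->th_ack - snd_una) and
		 * t_segsz is 1000, cwnd is restored to ocwnd and then
		 * reduced by 3000 - 1000 = 2000 bytes: the acked data
		 * has left the network, minus one segment of headroom
		 * for the retransmission just sent.
		 */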
	} else {
		/*
		 * Complete ack.  Inflate the congestion window to ssthresh
		 * and exit fast recovery.
		 *
		 * Window inflation should have left us with approx.
		 * snd_ssthresh outstanding data.  But in case we
		 * would be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
			    + tp->t_segsz;
		else
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * If we are still in fast recovery (meaning we are using
	 * NewReno and we have only received partial acks), do not
	 * inflate the window yet.
	 */
	if (tp->t_partialacks < 0)
		tcp_reno_newack(tp, th);
}

const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
 */

/* Cubic prototypes */
static void tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t tcp_cubic_cbrt(uint32_t);
static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);

/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
 * Set congestion timer to now
 */
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
	    now_timeval.tv_usec / 1000;
}

/*
 * Milliseconds since the last congestion event
 */
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
	    tp->snd_cubic_ctime;
}

/*
 * Approximate cubic root
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	int i, rounds = CBRT_ROUNDS;
	uint64_t x = v / 3;

	/* We fail to calculate correctly for small numbers */
	if (v == 0)
		return 0;
	else if (v < 4)
		return 1;

	/*
	 * 2097151 is the largest x for which 2*x^3+3*x fits in 64 bits.
	 * Above that, avoid the overflow at a time cost.
	 */
	if (x > 2097151)
		rounds += 10;

	for (i = 0; i < rounds; i++)
		if (rounds == CBRT_ROUNDS)
			/* Newton-Raphson step for f(x) = x^3 - v */
			x = (v + 2 * x * x * x) / (3 * x * x);
		else
			/* Rearranged to avoid overflow */
			x = v / (3 * x * x) + 2 * x / 3;

	return (uint32_t)x;
}
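
/*
 * Draft Rhee Section 3.1, Eq. 1: the congestion window follows the
 * cubic W(t) = C*(t - K)^3 + Wmax, where Wmax is the window at the
 * last congestion event and K = cbrt(Wmax*beta/C) is the time needed
 * to climb back to Wmax.  Here C = CUBIC_CA/CUBIC_CB = 0.4 and
 * beta = 1/CUBIC_BETAB = 0.2.  Worked sketch (assumed numbers):
 * Wmax = 50 gives K = cbrt(50*0.2/0.4) = cbrt(25) ~= 2.9, so the
 * window grows concavely until ~2.9 time units after the loss,
 * plateaus near Wmax, then probes convexly beyond it.
 */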

/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
	uint32_t K;
	long tK3;

	/* Section 3.1 Eq. 2 */
	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
	    CUBIC_CB / CUBIC_CA);
	/* (t-K)^3 - it is not clear why the unit of measure matters */
	tK3 = (long)(ms_elapsed + rtt) - (long)K;
	tK3 = tK3 * tK3 * tK3;

	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}

static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	tp->snd_cubic_wmax = max(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}

static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/* See newreno */
		tp->t_dupacks = 0;
		return 1;
	}

	/*
	 * Mark WMax
	 */
	tcp_cubic_congestion_exp(tp);

	/* Do fast retransmit */
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;
	u_long w_tcp;

	/* Congestion avoidance and not in fast recovery and usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is in 1/32 units of slow ticks; converting it
	     * to ms is equivalent to
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/* Compute W_tcp(t) */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/* Not in TCP friendly mode */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* TCP friendly mode */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = max(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use NewReno */
		tcp_newreno_newack(tp, th);
	}
}

static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};