/*	$NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Rui Paulo.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 *      This product includes software developed at the Information
 *      Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.21 2016/04/26 08:44:44 ozaki-r Exp $");

#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>

#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#endif

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * TODO:
 *   consider moving the actual implementations into a separate file.
 */

static void tcp_common_congestion_exp(struct tcpcb *, int, int);

static int  tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int  tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);

static int  tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
	const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);

static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);

static void tcp_congctl_fillnames(void);

extern int tcprexmtthresh;

MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");

/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];

/* available global congestion control algorithms */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];

/*
 * Used to list the available congestion control algorithms.
 */
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
    TAILQ_HEAD_INITIALIZER(tcp_congctlhd);

static struct tcp_congctlent * tcp_congctl_global;

static kmutex_t tcp_congctl_mtx;

void
tcp_congctl_init(void)
{
	int r __diagused;

	mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);

	/* Base algorithms. */
	r = tcp_congctl_register("reno", &tcp_reno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
	KASSERT(r == 0);

	/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif

	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
	KASSERT(r == 0);
}

/*
 * Register a congestion algorithm and select it if we have none.
 */
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *ntcc, *tccp;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (!strcmp(name, tccp->congctl_name)) {
			/* name already registered */
			return EEXIST;
		}

	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);

	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
	ntcc->congctl_ctl = tcc;

	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
	tcp_congctl_fillnames();

	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
		tcp_congctl_select(NULL, name);

	return 0;
}
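
/*
 * Usage sketch (editor's example, not part of the original file): a
 * hypothetical "vegas" algorithm would plug into this framework as
 * follows.  tcp_vegas_ctl and its callbacks are assumed names used
 * purely for illustration.
 */
#if 0
static const struct tcp_congctl tcp_vegas_ctl = {
	.fast_retransmit = tcp_vegas_fast_retransmit,
	.slow_retransmit = tcp_vegas_slow_retransmit,
	.fast_retransmit_newack = tcp_vegas_fast_retransmit_newack,
	.newack = tcp_vegas_newack,
	.cong_exp = tcp_vegas_congestion_exp,
};

void
tcp_vegas_attach(void)
{
	/* Register under a unique name; EEXIST means it already exists. */
	if (tcp_congctl_register("vegas", &tcp_vegas_ctl) == 0)
		/* Optionally make it the global default (tp == NULL). */
		(void)tcp_congctl_select(NULL, "vegas");
}
#endif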

int
tcp_congctl_unregister(const char *name)
{
	struct tcp_congctlent *tccp, *rtccp;
	unsigned int size;

	rtccp = NULL;
	size = 0;
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!strcmp(name, tccp->congctl_name))
			rtccp = tccp;
		size++;
	}

	if (!rtccp)
		return ENOENT;

	if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
		return EBUSY;

	TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
	free(rtccp, M_TCPCONGCTL);
	tcp_congctl_fillnames();

	return 0;
}

/*
 * Select a congestion algorithm by name.
 */
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
	struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
	bool old_found, new_found;

	KASSERT(name);

	old_found = (tp == NULL || tp->t_congctl == NULL);
	old_tccp = NULL;
	new_found = false;
	new_tccp = NULL;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
			old_tccp = tccp;
			old_found = true;
		}

		if (!new_found && !strcmp(name, tccp->congctl_name)) {
			new_tccp = tccp;
			new_found = true;
		}

		if (new_found && old_found) {
			if (tp) {
				mutex_enter(&tcp_congctl_mtx);
				if (old_tccp)
					old_tccp->congctl_refcnt--;
				tp->t_congctl = new_tccp->congctl_ctl;
				new_tccp->congctl_refcnt++;
				mutex_exit(&tcp_congctl_mtx);
			} else {
				tcp_congctl_global = new_tccp;
				strlcpy(tcp_congctl_global_name,
				    new_tccp->congctl_name,
				    sizeof(tcp_congctl_global_name) - 1);
			}
			return 0;
		}
	}

	return EINVAL;
}

void
tcp_congctl_release(struct tcpcb *tp)
{
	struct tcp_congctlent *tccp;

	KASSERT(tp->t_congctl);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (tccp->congctl_ctl == tp->t_congctl) {
			tccp->congctl_refcnt--;
			return;
		}
	}
}

/*
 * Returns the name of a congestion algorithm.
 */
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *tccp;

	KASSERT(tcc);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (tccp->congctl_ctl == tcc)
			return tccp->congctl_name;

	return NULL;
}

static void
tcp_congctl_fillnames(void)
{
	struct tcp_congctlent *tccp;
	const char *delim = " ";

	tcp_congctl_avail[0] = '\0';
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		strlcat(tcp_congctl_avail, tccp->congctl_name,
		    sizeof(tcp_congctl_avail) - 1);
		if (TAILQ_NEXT(tccp, congctl_ent))
			strlcat(tcp_congctl_avail, delim,
			    sizeof(tcp_congctl_avail) - 1);
	}
}

/* ------------------------------------------------------------------------ */

/*
 * Common stuff
 */

/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10

static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
	u_int win;

	/*
	 * Reduce the congestion window and the slow start threshold.
	 */
	win = min(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
	if (win < 2)
		win = 2;

	tp->snd_ssthresh = win * tp->t_segsz;
	tp->snd_recover = tp->snd_max;
	tp->snd_cwnd = tp->snd_ssthresh;

	/*
	 * When using TCP ECN, notify the peer that
	 * we reduced the cwnd.
	 */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}
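
/*
 * Worked example (editor's note, not from the original file): with
 * min(snd_wnd, snd_cwnd) = 65536 bytes and t_segsz = 1460, Reno
 * (betaa/betab = 1/2) gives win = 65536 / 2 / 1460 = 22 segments,
 * so snd_ssthresh and snd_cwnd become 22 * 1460 = 32120 bytes.
 * CUBIC (betaa/betab = 4/5) gives win = 35 segments, i.e. 51100 bytes.
 */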


/* ------------------------------------------------------------------------ */

/*
 * TCP/Reno congestion control.
 */
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{

	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}

static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * Dup acks mean that packets have left the
	 * network (they're now cached at the receiver)
	 * so bump cwnd by the amount in the receiver
	 * to keep a constant cwnd packets in the
	 * network.
	 *
	 * If we are using TCP/SACK, then enter
	 * Fast Recovery if the receiver SACKs
	 * data that is tcprexmtthresh * MSS
	 * bytes past the last ACKed segment,
	 * irrespective of the number of DupAcks.
	 */

	tcp_seq onxt = tp->snd_nxt;

	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;
	if (TCP_SACK_ENABLED(tp)) {
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;

	return 0;
}
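
/*
 * Editor's note (example, not from the original file): after the
 * retransmit above, with snd_ssthresh = 32120, t_segsz = 1460 and
 * t_dupacks = 3, the window is set to 32120 + 3 * 1460 = 36500 bytes,
 * i.e. ssthresh inflated by one segment for each duplicate ACK
 * already seen.
 */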

static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	/*
	 * We know we're losing at the current
	 * window size so do congestion avoidance
	 * (set ssthresh to half the current window
	 * and pull our congestion window back to
	 * the new ssthresh).
	 */

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
	u_int win;

	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */

	win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
	if (win < 2)
		win = 2;
	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->snd_ssthresh = win * tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}
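
/*
 * Editor's illustration (not from the original file): starting from
 * the loss window of 1 segment with ssthresh = 8 segments, the window
 * grows 1, 2, 4, 8 segments over successive RTTs (slow start,
 * exponential), then roughly one additional segment per RTT once past
 * ssthresh (congestion avoidance, linear).
 */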

static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
    const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else {
		/*
		 * Clamp the congestion window to the crossover point and
		 * exit fast recovery.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * use 2*SMSS or 1*SMSS for the "L" param,
			 * depending on sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = min(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale);
}
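
/*
 * Worked example of the two branches above (editor's note, not from
 * the original file), with t_segsz = 1460 and cw = 29200 bytes
 * (20 segments) in congestion avoidance:
 *
 *  - With ABC, acks accumulate in t_bytes_acked and the window grows
 *    by one segment only once a full window's worth of data
 *    (29200 bytes) has been acked, i.e. about one segment per RTT.
 *  - Without ABC, every ack opens the window by
 *    incr = 1460 * 1460 / 29200 = 73 bytes; 20 acks per window again
 *    sum to roughly one segment per RTT.
 */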

const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * TCP/NewReno Congestion control.
 */
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/*
		 * False fast retransmit after timeout.
		 * Do not enter fast recovery.
		 */
		tp->t_dupacks = 0;
		return 1;
	}
	/*
	 * Fast retransmit is the same as in Reno.
	 */
	return tcp_reno_fast_retransmit(tp, th);
}

/*
 * Implement the NewReno response to a new ack, checking for partial acks in
 * fast recovery.
 */
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
		/*
		 * This is a partial ack.  Retransmit the first unacknowledged
		 * segment and deflate the congestion window by the amount of
		 * acknowledged data.  Do not exit fast recovery.
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		int sack_num_segs = 1, sack_bytes_rxmt = 0;

		/*
		 * snd_una has not yet been updated and the socket's send
		 * buffer has not yet drained off the ACK'd data, so we
		 * have to leave snd_una as it was to get the correct data
		 * offset in tcp_output().
		 */
		tp->t_partialacks++;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;
		tp->snd_nxt = th->th_ack;

		if (TCP_SACK_ENABLED(tp)) {
			/*
			 * Partial ack handling within a sack recovery episode.
			 * Keeping this very simple for now. When a partial ack
			 * is received, force snd_cwnd to a value that will
			 * allow the sender to transmit no more than 2 segments.
			 * If necessary, a fancier scheme can be adopted at a
			 * later point, but for now, the goal is to prevent the
			 * sender from bursting a large amount of data in the
			 * midst of sack recovery.
			 */

			/*
			 * Send one or two segments based on how much
			 * new data was acked.
			 */
			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
				sack_num_segs = 2;
			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
			tp->snd_cwnd = sack_bytes_rxmt +
			    (tp->snd_nxt - tp->sack_newdata) +
			    sack_num_segs * tp->t_segsz;
			tp->t_flags |= TF_ACKNOW;
			(void) tcp_output(tp);
		} else {
			/*
			 * Set snd_cwnd to one segment beyond the ACK'd offset;
			 * snd_una is not yet updated when we're called.
			 */
			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
			(void) tcp_output(tp);
			tp->snd_cwnd = ocwnd;
			if (SEQ_GT(onxt, tp->snd_nxt))
				tp->snd_nxt = onxt;
			/*
			 * Partial window deflation.  Relies on the fact that
			 * tp->snd_una is not updated yet.
			 */
			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
			    tp->t_segsz);
		}
	} else {
		/*
		 * Complete ack.  Inflate the congestion window to ssthresh
		 * and exit fast recovery.
		 *
		 * Window inflation should have left us with approx.
		 * snd_ssthresh outstanding data.  But in case we
		 * would be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
			    + tp->t_segsz;
		else
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * If we are still in fast recovery (meaning we are using
	 * NewReno and we have only received partial acks), do not
	 * inflate the window yet.
	 */
	if (tp->t_partialacks < 0)
		tcp_reno_newack(tp, th);
}


const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
 */

/* Cubic prototypes */
static void	tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t	tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t	tcp_cubic_cbrt(uint32_t);
static ulong	tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);

/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
 * Set congestion timer to now
 */
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
	    now_timeval.tv_usec / 1000;
}

/*
 * Milliseconds since the last congestion event
 */
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
	    tp->snd_cubic_ctime;
}

/*
 * Approximate cubic root
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	int i, rounds = CBRT_ROUNDS;
	uint64_t x = v / 3;

	/* The iteration does not converge correctly for small numbers. */
	if (v == 0)
		return 0;
	else if (v < 4)
		return 1;

	/*
	 * Largest x such that 2*x^3 + 3*x still fits in 64 bits.
	 * Avoid overflow at a cost in time.
	 */
	if (x > 2097151)
		rounds += 10;

	for (i = 0; i < rounds; i++)
		if (rounds == CBRT_ROUNDS)
			x = (v + 2 * x * x * x) / (3 * x * x);
		else
			/* Avoid overflow */
			x = v / (3 * x * x) + 2 * x / 3;

	return (uint32_t)x;
}
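
/*
 * Editor's note: the first form above is Newton's method for
 * f(x) = x^3 - v, whose iteration is
 *
 *	x' = x - f(x)/f'(x) = (v + 2*x^3) / (3*x^2)
 *
 * The second form computes the same quotient as v/(3*x^2) + 2*x/3 so
 * that 2*x^3 is never materialized, trading a little precision for
 * overflow safety when x is large.
 */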

/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
	uint32_t K;
	long tK3;

	/* Section 3.1 Eq. 2 */
	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
	    CUBIC_CB / CUBIC_CA);
	/* (t-K)^3 - it is not clear why the unit of measure matters */
	tK3 = (long)(ms_elapsed + rtt) - (long)K;
	tK3 = tK3 * tK3 * tK3;

	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}
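
/*
 * Editor's note: the two equations above are, in the draft's notation,
 *
 *	W(t) = C * (t - K)^3 + Wmax		(Eq. 1)
 *	K    = cbrt(Wmax * beta / C)		(Eq. 2)
 *
 * with C = CUBIC_CA/CUBIC_CB = 0.4 and the decrease factor
 * beta = 1 - CUBIC_BETAA/CUBIC_BETAB = 0.2, so the cbrt() argument
 * Wmax / CUBIC_BETAB * CUBIC_CB / CUBIC_CA equals Wmax * beta / C.
 */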

static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	tp->snd_cubic_wmax = max(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}

static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/* See newreno */
		tp->t_dupacks = 0;
		return 1;
	}

	/*
	 * mark WMax
	 */
	tcp_cubic_congestion_exp(tp);

	/* Do fast retransmit */
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;
	u_long w_tcp;

	/* Congestion avoidance, not in fast recovery, and a usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is in 1/32 units of slow ticks;
	     * converting it to ms is equal to
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/*
		 * Compute W_tcp(t), the window standard TCP would have
		 * reached by now.  Per the draft (and the CUBIC paper),
		 * W_tcp(t) = Wmax*(1-beta) + 3*beta/(2-beta) * t/RTT;
		 * with beta = 1/CUBIC_BETAB = 0.2 the slope 3*beta/(2-beta)
		 * is exactly 1/3, hence the final term.
		 */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/* Not in TCP-friendly mode */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* TCP-friendly mode */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = max(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use New Reno */
		tcp_newreno_newack(tp, th);
	}
}

static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};