xref: /netbsd-src/sys/netinet/tcp_congctl.c (revision e4ebea9efd33d7fbff602d6288b15240e56427d2)
1*e4ebea9eSandvar /*	$NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $	*/
2f3330397Srpaulo 
3f3330397Srpaulo /*-
4f3330397Srpaulo  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
5f3330397Srpaulo  * All rights reserved.
6f3330397Srpaulo  *
7f3330397Srpaulo  * This code is derived from software contributed to The NetBSD Foundation
8f3330397Srpaulo  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
9f3330397Srpaulo  * Facility, NASA Ames Research Center.
10f3330397Srpaulo  * This code is derived from software contributed to The NetBSD Foundation
11f3330397Srpaulo  * by Charles M. Hannum.
12f3330397Srpaulo  * This code is derived from software contributed to The NetBSD Foundation
13f3330397Srpaulo  * by Rui Paulo.
14f3330397Srpaulo  *
15f3330397Srpaulo  * Redistribution and use in source and binary forms, with or without
16f3330397Srpaulo  * modification, are permitted provided that the following conditions
17f3330397Srpaulo  * are met:
18f3330397Srpaulo  * 1. Redistributions of source code must retain the above copyright
19f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer.
20f3330397Srpaulo  * 2. Redistributions in binary form must reproduce the above copyright
21f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer in the
22f3330397Srpaulo  *    documentation and/or other materials provided with the distribution.
23f3330397Srpaulo  *
24f3330397Srpaulo  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
25f3330397Srpaulo  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
26f3330397Srpaulo  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27f3330397Srpaulo  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
28f3330397Srpaulo  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29f3330397Srpaulo  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30f3330397Srpaulo  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31f3330397Srpaulo  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32f3330397Srpaulo  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33f3330397Srpaulo  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34f3330397Srpaulo  * POSSIBILITY OF SUCH DAMAGE.
35f3330397Srpaulo  */
36f3330397Srpaulo 
37f3330397Srpaulo /*
38f3330397Srpaulo  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
39f3330397Srpaulo  * All rights reserved.
40f3330397Srpaulo  *
41f3330397Srpaulo  * Redistribution and use in source and binary forms, with or without
42f3330397Srpaulo  * modification, are permitted provided that the following conditions
43f3330397Srpaulo  * are met:
44f3330397Srpaulo  * 1. Redistributions of source code must retain the above copyright
45f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer.
46f3330397Srpaulo  * 2. Redistributions in binary form must reproduce the above copyright
47f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer in the
48f3330397Srpaulo  *    documentation and/or other materials provided with the distribution.
49f3330397Srpaulo  * 3. Neither the name of the project nor the names of its contributors
50f3330397Srpaulo  *    may be used to endorse or promote products derived from this software
51f3330397Srpaulo  *    without specific prior written permission.
52f3330397Srpaulo  *
53f3330397Srpaulo  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
54f3330397Srpaulo  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55f3330397Srpaulo  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56f3330397Srpaulo  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
57f3330397Srpaulo  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58f3330397Srpaulo  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59f3330397Srpaulo  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60f3330397Srpaulo  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61f3330397Srpaulo  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62f3330397Srpaulo  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63f3330397Srpaulo  * SUCH DAMAGE.
64f3330397Srpaulo  */
65f3330397Srpaulo 
66f3330397Srpaulo /*
67f3330397Srpaulo  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
68f3330397Srpaulo  *
69f3330397Srpaulo  * NRL grants permission for redistribution and use in source and binary
70f3330397Srpaulo  * forms, with or without modification, of the software and documentation
71f3330397Srpaulo  * created at NRL provided that the following conditions are met:
72f3330397Srpaulo  *
73f3330397Srpaulo  * 1. Redistributions of source code must retain the above copyright
74f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer.
75f3330397Srpaulo  * 2. Redistributions in binary form must reproduce the above copyright
76f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer in the
77f3330397Srpaulo  *    documentation and/or other materials provided with the distribution.
78f3330397Srpaulo  * 3. All advertising materials mentioning features or use of this software
79f3330397Srpaulo  *    must display the following acknowledgements:
80f3330397Srpaulo  *      This product includes software developed by the University of
81f3330397Srpaulo  *      California, Berkeley and its contributors.
82f3330397Srpaulo  *      This product includes software developed at the Information
83f3330397Srpaulo  *      Technology Division, US Naval Research Laboratory.
84f3330397Srpaulo  * 4. Neither the name of the NRL nor the names of its contributors
85f3330397Srpaulo  *    may be used to endorse or promote products derived from this software
86f3330397Srpaulo  *    without specific prior written permission.
87f3330397Srpaulo  *
88f3330397Srpaulo  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
89f3330397Srpaulo  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
90f3330397Srpaulo  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
91f3330397Srpaulo  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
92f3330397Srpaulo  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
93f3330397Srpaulo  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
94f3330397Srpaulo  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
95f3330397Srpaulo  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
96f3330397Srpaulo  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
97f3330397Srpaulo  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
98f3330397Srpaulo  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
99f3330397Srpaulo  *
100f3330397Srpaulo  * The views and conclusions contained in the software and documentation
101f3330397Srpaulo  * are those of the authors and should not be interpreted as representing
102f3330397Srpaulo  * official policies, either expressed or implied, of the US Naval
103f3330397Srpaulo  * Research Laboratory (NRL).
104f3330397Srpaulo  */
105f3330397Srpaulo 
106f3330397Srpaulo /*
107f3330397Srpaulo  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
108f3330397Srpaulo  *	The Regents of the University of California.  All rights reserved.
109f3330397Srpaulo  *
110f3330397Srpaulo  * Redistribution and use in source and binary forms, with or without
111f3330397Srpaulo  * modification, are permitted provided that the following conditions
112f3330397Srpaulo  * are met:
113f3330397Srpaulo  * 1. Redistributions of source code must retain the above copyright
114f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer.
115f3330397Srpaulo  * 2. Redistributions in binary form must reproduce the above copyright
116f3330397Srpaulo  *    notice, this list of conditions and the following disclaimer in the
117f3330397Srpaulo  *    documentation and/or other materials provided with the distribution.
118f3330397Srpaulo  * 3. Neither the name of the University nor the names of its contributors
119f3330397Srpaulo  *    may be used to endorse or promote products derived from this software
120f3330397Srpaulo  *    without specific prior written permission.
121f3330397Srpaulo  *
122f3330397Srpaulo  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123f3330397Srpaulo  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124f3330397Srpaulo  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125f3330397Srpaulo  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126f3330397Srpaulo  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127f3330397Srpaulo  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128f3330397Srpaulo  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129f3330397Srpaulo  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130f3330397Srpaulo  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131f3330397Srpaulo  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
132f3330397Srpaulo  * SUCH DAMAGE.
133f3330397Srpaulo  *
134f3330397Srpaulo  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
135f3330397Srpaulo  */
136f3330397Srpaulo 
137f3330397Srpaulo #include <sys/cdefs.h>
138*e4ebea9eSandvar __KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $");
139f3330397Srpaulo 
1401c4a50f1Spooka #ifdef _KERNEL_OPT
141f3330397Srpaulo #include "opt_inet.h"
142f3330397Srpaulo #include "opt_tcp_debug.h"
143f3330397Srpaulo #include "opt_tcp_congctl.h"
1441c4a50f1Spooka #endif
145f3330397Srpaulo 
146f3330397Srpaulo #include <sys/param.h>
147f3330397Srpaulo #include <sys/systm.h>
148f3330397Srpaulo #include <sys/malloc.h>
149f3330397Srpaulo #include <sys/mbuf.h>
150f3330397Srpaulo #include <sys/protosw.h>
151f3330397Srpaulo #include <sys/socket.h>
152f3330397Srpaulo #include <sys/socketvar.h>
153f3330397Srpaulo #include <sys/errno.h>
154f3330397Srpaulo #include <sys/syslog.h>
155f3330397Srpaulo #include <sys/pool.h>
156f3330397Srpaulo #include <sys/domain.h>
157f3330397Srpaulo #include <sys/kernel.h>
15848e23b4aSxtraeme #include <sys/mutex.h>
159f3330397Srpaulo 
160f3330397Srpaulo #include <net/if.h>
161f3330397Srpaulo 
162f3330397Srpaulo #include <netinet/in.h>
163f3330397Srpaulo #include <netinet/in_systm.h>
164f3330397Srpaulo #include <netinet/ip.h>
165f3330397Srpaulo #include <netinet/in_pcb.h>
166f3330397Srpaulo #include <netinet/in_var.h>
167f3330397Srpaulo #include <netinet/ip_var.h>
168f3330397Srpaulo 
169f3330397Srpaulo #ifdef INET6
170f3330397Srpaulo #include <netinet/ip6.h>
171f3330397Srpaulo #include <netinet6/ip6_var.h>
172f3330397Srpaulo #include <netinet6/in6_pcb.h>
173f3330397Srpaulo #include <netinet6/ip6_var.h>
174f3330397Srpaulo #include <netinet6/in6_var.h>
175f3330397Srpaulo #include <netinet/icmp6.h>
176f3330397Srpaulo #endif
177f3330397Srpaulo 
178f3330397Srpaulo #include <netinet/tcp.h>
179f3330397Srpaulo #include <netinet/tcp_fsm.h>
180f3330397Srpaulo #include <netinet/tcp_seq.h>
181f3330397Srpaulo #include <netinet/tcp_timer.h>
182f3330397Srpaulo #include <netinet/tcp_var.h>
183f3330397Srpaulo #include <netinet/tcp_congctl.h>
184f3330397Srpaulo #ifdef TCP_DEBUG
185f3330397Srpaulo #include <netinet/tcp_debug.h>
186f3330397Srpaulo #endif
187f3330397Srpaulo 
188f3330397Srpaulo /*
189f3330397Srpaulo  * TODO:
190f3330397Srpaulo  *   consider separating the actual implementations in another file.
191f3330397Srpaulo  */
192f3330397Srpaulo 
1934d4f2b7dSkefren static void tcp_common_congestion_exp(struct tcpcb *, int, int);
1944d4f2b7dSkefren 
1954d4f2b7dSkefren static int  tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
1967253aad9Syamt static int  tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
197f3330397Srpaulo static void tcp_reno_slow_retransmit(struct tcpcb *);
1987253aad9Syamt static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
1997253aad9Syamt     const struct tcphdr *);
2007253aad9Syamt static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
201a70594d3Srpaulo static void tcp_reno_congestion_exp(struct tcpcb *tp);
202f3330397Srpaulo 
2037253aad9Syamt static int  tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
204f3330397Srpaulo static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
2057253aad9Syamt 	const struct tcphdr *);
2067253aad9Syamt static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
207f3330397Srpaulo 
2084d4f2b7dSkefren static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
2094d4f2b7dSkefren static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
2104d4f2b7dSkefren static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
2114d4f2b7dSkefren static void tcp_cubic_congestion_exp(struct tcpcb *);
212f3330397Srpaulo 
213f3330397Srpaulo static void tcp_congctl_fillnames(void);
214f3330397Srpaulo 
215f3330397Srpaulo extern int tcprexmtthresh;
216f3330397Srpaulo 
217f3330397Srpaulo MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
218f3330397Srpaulo 
219a34217b8Smatt /* currently selected global congestion control */
220a34217b8Smatt char tcp_congctl_global_name[TCPCC_MAXLEN];
221a34217b8Smatt 
222a34217b8Smatt /* available global congestion control algorithms */
223a34217b8Smatt char tcp_congctl_avail[10 * TCPCC_MAXLEN];
224a34217b8Smatt 
225f3330397Srpaulo /*
226f3330397Srpaulo  * Used to list the available congestion control algorithms.
227f3330397Srpaulo  */
228a34217b8Smatt TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
229a34217b8Smatt     TAILQ_HEAD_INITIALIZER(tcp_congctlhd);
230a34217b8Smatt 
231a34217b8Smatt static struct tcp_congctlent * tcp_congctl_global;
232f3330397Srpaulo 
23348e23b4aSxtraeme static kmutex_t tcp_congctl_mtx;
234f3330397Srpaulo 
235f3330397Srpaulo void
tcp_congctl_init(void)236f3330397Srpaulo tcp_congctl_init(void)
237f3330397Srpaulo {
2387c79fd6cSmartin 	int r __diagused;
239f3330397Srpaulo 
24048e23b4aSxtraeme 	mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);
241f3330397Srpaulo 
242f3330397Srpaulo 	/* Base algorithms. */
243f3330397Srpaulo 	r = tcp_congctl_register("reno", &tcp_reno_ctl);
244f3330397Srpaulo 	KASSERT(r == 0);
245f3330397Srpaulo 	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
246f3330397Srpaulo 	KASSERT(r == 0);
2474d4f2b7dSkefren 	r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
2484d4f2b7dSkefren 	KASSERT(r == 0);
249f3330397Srpaulo 
250f3330397Srpaulo 	/* NewReno is the default. */
251f3330397Srpaulo #ifndef TCP_CONGCTL_DEFAULT
252f3330397Srpaulo #define TCP_CONGCTL_DEFAULT "newreno"
253f3330397Srpaulo #endif
254f3330397Srpaulo 
255f3330397Srpaulo 	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
256f3330397Srpaulo 	KASSERT(r == 0);
257f3330397Srpaulo }
258f3330397Srpaulo 
259f3330397Srpaulo /*
260f3330397Srpaulo  * Register a congestion algorithm and select it if we have none.
261f3330397Srpaulo  */
262f3330397Srpaulo int
tcp_congctl_register(const char * name,const struct tcp_congctl * tcc)263a34217b8Smatt tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
264f3330397Srpaulo {
265f3330397Srpaulo 	struct tcp_congctlent *ntcc, *tccp;
266f3330397Srpaulo 
267f3330397Srpaulo 	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
268f3330397Srpaulo 		if (!strcmp(name, tccp->congctl_name)) {
269f3330397Srpaulo 			/* name already registered */
270f3330397Srpaulo 			return EEXIST;
271f3330397Srpaulo 		}
272f3330397Srpaulo 
273a34217b8Smatt 	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);
274f3330397Srpaulo 
275f3330397Srpaulo 	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
276f3330397Srpaulo 	ntcc->congctl_ctl = tcc;
277f3330397Srpaulo 
278f3330397Srpaulo 	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
279f3330397Srpaulo 	tcp_congctl_fillnames();
280f3330397Srpaulo 
281f3330397Srpaulo 	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
282f3330397Srpaulo 		tcp_congctl_select(NULL, name);
283f3330397Srpaulo 
284f3330397Srpaulo 	return 0;
285f3330397Srpaulo }
286f3330397Srpaulo 
287f3330397Srpaulo int
tcp_congctl_unregister(const char * name)288f3330397Srpaulo tcp_congctl_unregister(const char *name)
289f3330397Srpaulo {
290f3330397Srpaulo 	struct tcp_congctlent *tccp, *rtccp;
291f3330397Srpaulo 	unsigned int size;
292f3330397Srpaulo 
293f3330397Srpaulo 	rtccp = NULL;
294f3330397Srpaulo 	size = 0;
295f3330397Srpaulo 	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
296f3330397Srpaulo 		if (!strcmp(name, tccp->congctl_name))
297f3330397Srpaulo 			rtccp = tccp;
298f3330397Srpaulo 		size++;
299f3330397Srpaulo 	}
300f3330397Srpaulo 
301f3330397Srpaulo 	if (!rtccp)
302f3330397Srpaulo 		return ENOENT;
303f3330397Srpaulo 
304a34217b8Smatt 	if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
305f3330397Srpaulo 		return EBUSY;
306f3330397Srpaulo 
307f3330397Srpaulo 	TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
308f3330397Srpaulo 	free(rtccp, M_TCPCONGCTL);
309f3330397Srpaulo 	tcp_congctl_fillnames();
310f3330397Srpaulo 
311f3330397Srpaulo 	return 0;
312f3330397Srpaulo }
313f3330397Srpaulo 
314f3330397Srpaulo /*
315f3330397Srpaulo  * Select a congestion algorithm by name.
316f3330397Srpaulo  */
317f3330397Srpaulo int
tcp_congctl_select(struct tcpcb * tp,const char * name)318f3330397Srpaulo tcp_congctl_select(struct tcpcb *tp, const char *name)
319f3330397Srpaulo {
320a34217b8Smatt 	struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
321a34217b8Smatt 	bool old_found, new_found;
322f3330397Srpaulo 
323f3330397Srpaulo 	KASSERT(name);
324f3330397Srpaulo 
325a34217b8Smatt 	old_found = (tp == NULL || tp->t_congctl == NULL);
326a34217b8Smatt 	old_tccp = NULL;
327a34217b8Smatt 	new_found = false;
328a34217b8Smatt 	new_tccp = NULL;
329a34217b8Smatt 
330a34217b8Smatt 	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
331a34217b8Smatt 		if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
332a34217b8Smatt 			old_tccp = tccp;
333a34217b8Smatt 			old_found = true;
334a34217b8Smatt 		}
335a34217b8Smatt 
336a34217b8Smatt 		if (!new_found && !strcmp(name, tccp->congctl_name)) {
337a34217b8Smatt 			new_tccp = tccp;
338a34217b8Smatt 			new_found = true;
339a34217b8Smatt 		}
340a34217b8Smatt 
341a34217b8Smatt 		if (new_found && old_found) {
342f3330397Srpaulo 			if (tp) {
34348e23b4aSxtraeme 				mutex_enter(&tcp_congctl_mtx);
344a34217b8Smatt 				if (old_tccp)
345a34217b8Smatt 					old_tccp->congctl_refcnt--;
346a34217b8Smatt 				tp->t_congctl = new_tccp->congctl_ctl;
347a34217b8Smatt 				new_tccp->congctl_refcnt++;
34848e23b4aSxtraeme 				mutex_exit(&tcp_congctl_mtx);
349f3330397Srpaulo 			} else {
350a34217b8Smatt 				tcp_congctl_global = new_tccp;
351f3330397Srpaulo 				strlcpy(tcp_congctl_global_name,
352a34217b8Smatt 				    new_tccp->congctl_name,
353f3330397Srpaulo 				    sizeof(tcp_congctl_global_name) - 1);
354f3330397Srpaulo 			}
355f3330397Srpaulo 			return 0;
356f3330397Srpaulo 		}
357a34217b8Smatt 	}
358f3330397Srpaulo 
359f3330397Srpaulo 	return EINVAL;
360f3330397Srpaulo }
361f3330397Srpaulo 
362a34217b8Smatt void
tcp_congctl_release(struct tcpcb * tp)363a34217b8Smatt tcp_congctl_release(struct tcpcb *tp)
364a34217b8Smatt {
365a34217b8Smatt 	struct tcp_congctlent *tccp;
366a34217b8Smatt 
367a34217b8Smatt 	KASSERT(tp->t_congctl);
368a34217b8Smatt 
369a34217b8Smatt 	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
370a34217b8Smatt 		if (tccp->congctl_ctl == tp->t_congctl) {
371a34217b8Smatt 			tccp->congctl_refcnt--;
372a34217b8Smatt 			return;
373a34217b8Smatt 		}
374a34217b8Smatt 	}
375a34217b8Smatt }
376a34217b8Smatt 
377f3330397Srpaulo /*
378f3330397Srpaulo  * Returns the name of a congestion algorithm.
379f3330397Srpaulo  */
380f3330397Srpaulo const char *
tcp_congctl_bystruct(const struct tcp_congctl * tcc)381f3330397Srpaulo tcp_congctl_bystruct(const struct tcp_congctl *tcc)
382f3330397Srpaulo {
383f3330397Srpaulo 	struct tcp_congctlent *tccp;
384f3330397Srpaulo 
385f3330397Srpaulo 	KASSERT(tcc);
386f3330397Srpaulo 
387f3330397Srpaulo 	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
388f3330397Srpaulo 		if (tccp->congctl_ctl == tcc)
389f3330397Srpaulo 			return tccp->congctl_name;
390f3330397Srpaulo 
391f3330397Srpaulo 	return NULL;
392f3330397Srpaulo }
393f3330397Srpaulo 
394f3330397Srpaulo static void
tcp_congctl_fillnames(void)395f3330397Srpaulo tcp_congctl_fillnames(void)
396f3330397Srpaulo {
397f3330397Srpaulo 	struct tcp_congctlent *tccp;
398f3330397Srpaulo 	const char *delim = " ";
399f3330397Srpaulo 
400f3330397Srpaulo 	tcp_congctl_avail[0] = '\0';
401f3330397Srpaulo 	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
402f3330397Srpaulo 		strlcat(tcp_congctl_avail, tccp->congctl_name,
403f3330397Srpaulo 		    sizeof(tcp_congctl_avail) - 1);
404f3330397Srpaulo 		if (TAILQ_NEXT(tccp, congctl_ent))
405f3330397Srpaulo 			strlcat(tcp_congctl_avail, delim,
406f3330397Srpaulo 			    sizeof(tcp_congctl_avail) - 1);
407f3330397Srpaulo 	}
408f3330397Srpaulo 
409f3330397Srpaulo }
410f3330397Srpaulo 
411f3330397Srpaulo /* ------------------------------------------------------------------------ */
412f3330397Srpaulo 
413a70594d3Srpaulo /*
4144d4f2b7dSkefren  * Common stuff
415a70594d3Srpaulo  */
4164d4f2b7dSkefren 
4174d4f2b7dSkefren /* Window reduction (1-beta) for [New]Reno: 0.5 */
4184d4f2b7dSkefren #define RENO_BETAA 1
4194d4f2b7dSkefren #define RENO_BETAB 2
4204d4f2b7dSkefren /* Window reduction (1-beta) for Cubic: 0.8 */
4214d4f2b7dSkefren #define CUBIC_BETAA 4
4224d4f2b7dSkefren #define CUBIC_BETAB 5
4234d4f2b7dSkefren /* Draft Rhee Section 4.1 */
4244d4f2b7dSkefren #define CUBIC_CA 4
4254d4f2b7dSkefren #define CUBIC_CB 10
4264d4f2b7dSkefren 
427a70594d3Srpaulo static void
tcp_common_congestion_exp(struct tcpcb * tp,int betaa,int betab)4284d4f2b7dSkefren tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
429f3330397Srpaulo {
43053950d86Smsaitoh 	u_long win;
431f3330397Srpaulo 
432f3330397Srpaulo 	/*
4334d4f2b7dSkefren 	 * Reduce the congestion window and the slow start threshold.
434f3330397Srpaulo 	 */
43553950d86Smsaitoh 	win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
436f3330397Srpaulo 	if (win < 2)
437f3330397Srpaulo 		win = 2;
438f3330397Srpaulo 
439f3330397Srpaulo 	tp->snd_ssthresh = win * tp->t_segsz;
440f3330397Srpaulo 	tp->snd_recover = tp->snd_max;
441f3330397Srpaulo 	tp->snd_cwnd = tp->snd_ssthresh;
442f3330397Srpaulo 
4431c1f230eSrpaulo 	/*
4441c1f230eSrpaulo 	 * When using TCP ECN, notify the peer that
4451c1f230eSrpaulo 	 * we reduced the cwnd.
4461c1f230eSrpaulo 	 */
447f3330397Srpaulo 	if (TCP_ECN_ALLOWED(tp))
448f3330397Srpaulo 		tp->t_flags |= TF_ECN_SND_CWR;
449f3330397Srpaulo }
450f3330397Srpaulo 
451f3330397Srpaulo 
4524d4f2b7dSkefren /* ------------------------------------------------------------------------ */
4534d4f2b7dSkefren 
4544d4f2b7dSkefren /*
4554d4f2b7dSkefren  * TCP/Reno congestion control.
4564d4f2b7dSkefren  */
4574d4f2b7dSkefren static void
tcp_reno_congestion_exp(struct tcpcb * tp)4584d4f2b7dSkefren tcp_reno_congestion_exp(struct tcpcb *tp)
4594d4f2b7dSkefren {
4604d4f2b7dSkefren 
4614d4f2b7dSkefren 	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
4624d4f2b7dSkefren }
463a70594d3Srpaulo 
464f3330397Srpaulo static int
tcp_reno_do_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)4654d4f2b7dSkefren tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
466f3330397Srpaulo {
4671c1f230eSrpaulo 	/*
4681c1f230eSrpaulo 	 * Dup acks mean that packets have left the
4691c1f230eSrpaulo 	 * network (they're now cached at the receiver)
4701c1f230eSrpaulo 	 * so bump cwnd by the amount in the receiver
4711c1f230eSrpaulo 	 * to keep a constant cwnd packets in the
4721c1f230eSrpaulo 	 * network.
4731c1f230eSrpaulo 	 *
4741c1f230eSrpaulo 	 * If we are using TCP/SACK, then enter
4751c1f230eSrpaulo 	 * Fast Recovery if the receiver SACKs
4761c1f230eSrpaulo 	 * data that is tcprexmtthresh * MSS
4771c1f230eSrpaulo 	 * bytes past the last ACKed segment,
4781c1f230eSrpaulo 	 * irrespective of the number of DupAcks.
4791c1f230eSrpaulo 	 */
4801c1f230eSrpaulo 
4814d4f2b7dSkefren 	tcp_seq onxt = tp->snd_nxt;
482f3330397Srpaulo 
483f3330397Srpaulo 	tp->t_partialacks = 0;
484f3330397Srpaulo 	TCP_TIMER_DISARM(tp, TCPT_REXMT);
485f3330397Srpaulo 	tp->t_rtttime = 0;
486f3330397Srpaulo 	if (TCP_SACK_ENABLED(tp)) {
487f3330397Srpaulo 		tp->t_dupacks = tcprexmtthresh;
488f3330397Srpaulo 		tp->sack_newdata = tp->snd_nxt;
489f3330397Srpaulo 		tp->snd_cwnd = tp->t_segsz;
490f3330397Srpaulo 		(void) tcp_output(tp);
491f3330397Srpaulo 		return 0;
492f3330397Srpaulo 	}
493f3330397Srpaulo 	tp->snd_nxt = th->th_ack;
494f3330397Srpaulo 	tp->snd_cwnd = tp->t_segsz;
495f3330397Srpaulo 	(void) tcp_output(tp);
496f3330397Srpaulo 	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
497f3330397Srpaulo 	if (SEQ_GT(onxt, tp->snd_nxt))
498f3330397Srpaulo 		tp->snd_nxt = onxt;
499f3330397Srpaulo 
500f3330397Srpaulo 	return 0;
501f3330397Srpaulo }
502f3330397Srpaulo 
/*
 * Reno fast retransmit.
 *
 * We know we are losing at the current window size, so first do
 * congestion avoidance (ssthresh becomes half the current window and
 * the congestion window is pulled back to the new ssthresh), then
 * retransmit.  Returns the result of tcp_reno_do_fast_retransmit().
 */
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	int rv;

	tcp_reno_congestion_exp(tp);
	rv = tcp_reno_do_fast_retransmit(tp, th);

	return rv;
}
5184d4f2b7dSkefren 
519f3330397Srpaulo static void
tcp_reno_slow_retransmit(struct tcpcb * tp)520f3330397Srpaulo tcp_reno_slow_retransmit(struct tcpcb *tp)
521f3330397Srpaulo {
52253950d86Smsaitoh 	u_long win;
523f3330397Srpaulo 
524f3330397Srpaulo 	/*
525f3330397Srpaulo 	 * Close the congestion window down to one segment
526f3330397Srpaulo 	 * (we'll open it by one segment for each ack we get).
527f3330397Srpaulo 	 * Since we probably have a window's worth of unacked
528f3330397Srpaulo 	 * data accumulated, this "slow start" keeps us from
529f3330397Srpaulo 	 * dumping all that data as back-to-back packets (which
530f3330397Srpaulo 	 * might overwhelm an intermediate gateway).
531f3330397Srpaulo 	 *
532f3330397Srpaulo 	 * There are two phases to the opening: Initially we
533f3330397Srpaulo 	 * open by one mss on each ack.  This makes the window
534f3330397Srpaulo 	 * size increase exponentially with time.  If the
535f3330397Srpaulo 	 * window is larger than the path can handle, this
536f3330397Srpaulo 	 * exponential growth results in dropped packet(s)
537f3330397Srpaulo 	 * almost immediately.  To get more time between
538f3330397Srpaulo 	 * drops but still "push" the network to take advantage
539f3330397Srpaulo 	 * of improving conditions, we switch from exponential
54040be87aeSandvar 	 * to linear window opening at some threshold size.
54140be87aeSandvar 	 * For a threshold, we use half the current window
542f3330397Srpaulo 	 * size, truncated to a multiple of the mss.
543f3330397Srpaulo 	 *
544f3330397Srpaulo 	 * (the minimum cwnd that will give us exponential
54540be87aeSandvar 	 * growth is 2 mss.  We don't allow the threshold
546f3330397Srpaulo 	 * to go below this.)
547f3330397Srpaulo 	 */
548f3330397Srpaulo 
54953950d86Smsaitoh 	win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
550f3330397Srpaulo 	if (win < 2)
551f3330397Srpaulo 		win = 2;
552f3330397Srpaulo 	/* Loss Window MUST be one segment. */
553f3330397Srpaulo 	tp->snd_cwnd = tp->t_segsz;
554f3330397Srpaulo 	tp->snd_ssthresh = win * tp->t_segsz;
555f3330397Srpaulo 	tp->t_partialacks = -1;
556f3330397Srpaulo 	tp->t_dupacks = 0;
55781463c93Syamt 	tp->t_bytes_acked = 0;
5584d4f2b7dSkefren 
5594d4f2b7dSkefren 	if (TCP_ECN_ALLOWED(tp))
5604d4f2b7dSkefren 		tp->t_flags |= TF_ECN_SND_CWR;
561f3330397Srpaulo }
562f3330397Srpaulo 
563f3330397Srpaulo static void
tcp_reno_fast_retransmit_newack(struct tcpcb * tp,const struct tcphdr * th)5647253aad9Syamt tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
565168cd830Schristos     const struct tcphdr *th)
566f3330397Srpaulo {
567f3330397Srpaulo 	if (tp->t_partialacks < 0) {
568f3330397Srpaulo 		/*
569f3330397Srpaulo 		 * We were not in fast recovery.  Reset the duplicate ack
570f3330397Srpaulo 		 * counter.
571f3330397Srpaulo 		 */
572f3330397Srpaulo 		tp->t_dupacks = 0;
573f3330397Srpaulo 	} else {
574f3330397Srpaulo 		/*
575f3330397Srpaulo 		 * Clamp the congestion window to the crossover point and
576f3330397Srpaulo 		 * exit fast recovery.
577f3330397Srpaulo 		 */
578f3330397Srpaulo 		if (tp->snd_cwnd > tp->snd_ssthresh)
579f3330397Srpaulo 			tp->snd_cwnd = tp->snd_ssthresh;
580f3330397Srpaulo 		tp->t_partialacks = -1;
581f3330397Srpaulo 		tp->t_dupacks = 0;
58281463c93Syamt 		tp->t_bytes_acked = 0;
5834d4f2b7dSkefren 		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
5844d4f2b7dSkefren 			tp->snd_fack = th->th_ack;
585f3330397Srpaulo 	}
586f3330397Srpaulo }
587f3330397Srpaulo 
588f3330397Srpaulo static void
tcp_reno_newack(struct tcpcb * tp,const struct tcphdr * th)5897253aad9Syamt tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
590f3330397Srpaulo {
591f3330397Srpaulo 	/*
592f3330397Srpaulo 	 * When new data is acked, open the congestion window.
59381463c93Syamt 	 */
59481463c93Syamt 
59581463c93Syamt 	u_int cw = tp->snd_cwnd;
59681463c93Syamt 	u_int incr = tp->t_segsz;
59781463c93Syamt 
59881463c93Syamt 	if (tcp_do_abc) {
59981463c93Syamt 
60081463c93Syamt 		/*
60181463c93Syamt 		 * RFC 3465 Appropriate Byte Counting (ABC)
60281463c93Syamt 		 */
60381463c93Syamt 
60481463c93Syamt 		int acked = th->th_ack - tp->snd_una;
60581463c93Syamt 
60681463c93Syamt 		if (cw >= tp->snd_ssthresh) {
60781463c93Syamt 			tp->t_bytes_acked += acked;
60881463c93Syamt 			if (tp->t_bytes_acked >= cw) {
60981463c93Syamt 				/* Time to increase the window. */
61081463c93Syamt 				tp->t_bytes_acked -= cw;
61181463c93Syamt 			} else {
61281463c93Syamt 				/* No need to increase yet. */
61381463c93Syamt 				incr = 0;
61481463c93Syamt 			}
61581463c93Syamt 		} else {
61681463c93Syamt 			/*
61781463c93Syamt 			 * use 2*SMSS or 1*SMSS for the "L" param,
61881463c93Syamt 			 * depending on sysctl setting.
61981463c93Syamt 			 *
62081463c93Syamt 			 * (See RFC 3465 2.3 Choosing the Limit)
62181463c93Syamt 			 */
62281463c93Syamt 			u_int abc_lim;
62381463c93Syamt 
624df8e5bddSyamt 			abc_lim = (tcp_abc_aggressive == 0 ||
625df8e5bddSyamt 			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
626d1579b2dSriastradh 			incr = uimin(acked, abc_lim);
62781463c93Syamt 		}
62881463c93Syamt 	} else {
62981463c93Syamt 
63081463c93Syamt 		/*
631f3330397Srpaulo 		 * If the window gives us less than ssthresh packets
632f3330397Srpaulo 		 * in flight, open exponentially (segsz per packet).
633f3330397Srpaulo 		 * Otherwise open linearly: segsz per window
6342f7740a3Syamt 		 * (segsz^2 / cwnd per packet).
635f3330397Srpaulo 		 */
636e1b1f65fSrpaulo 
63781463c93Syamt 		if (cw >= tp->snd_ssthresh) {
638f3330397Srpaulo 			incr = incr * incr / cw;
63981463c93Syamt 		}
64081463c93Syamt 	}
641e1b1f65fSrpaulo 
642d1579b2dSriastradh 	tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
643f3330397Srpaulo }
644f3330397Srpaulo 
645a34217b8Smatt const struct tcp_congctl tcp_reno_ctl = {
646f3330397Srpaulo 	.fast_retransmit = tcp_reno_fast_retransmit,
647f3330397Srpaulo 	.slow_retransmit = tcp_reno_slow_retransmit,
648f3330397Srpaulo 	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
649f3330397Srpaulo 	.newack = tcp_reno_newack,
650a70594d3Srpaulo 	.cong_exp = tcp_reno_congestion_exp,
651f3330397Srpaulo };
652f3330397Srpaulo 
653f3330397Srpaulo /*
654f3330397Srpaulo  * TCP/NewReno Congestion control.
655f3330397Srpaulo  */
656f3330397Srpaulo static int
tcp_newreno_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)6577253aad9Syamt tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
658f3330397Srpaulo {
65918a0ef4aSyamt 
660f3330397Srpaulo 	if (SEQ_LT(th->th_ack, tp->snd_high)) {
661f3330397Srpaulo 		/*
662f3330397Srpaulo 		 * False fast retransmit after timeout.
663f3330397Srpaulo 		 * Do not enter fast recovery
664f3330397Srpaulo 		 */
665f3330397Srpaulo 		tp->t_dupacks = 0;
666f3330397Srpaulo 		return 1;
66718a0ef4aSyamt 	}
668f3330397Srpaulo 	/*
669f3330397Srpaulo 	 * Fast retransmit is same as reno.
670f3330397Srpaulo 	 */
671f3330397Srpaulo 	return tcp_reno_fast_retransmit(tp, th);
672f3330397Srpaulo }
673f3330397Srpaulo 
674f3330397Srpaulo /*
675f3330397Srpaulo  * Implement the NewReno response to a new ack, checking for partial acks in
676f3330397Srpaulo  * fast recovery.
677f3330397Srpaulo  */
678f3330397Srpaulo static void
tcp_newreno_fast_retransmit_newack(struct tcpcb * tp,const struct tcphdr * th)6797253aad9Syamt tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
680f3330397Srpaulo {
681f3330397Srpaulo 	if (tp->t_partialacks < 0) {
682f3330397Srpaulo 		/*
683f3330397Srpaulo 		 * We were not in fast recovery.  Reset the duplicate ack
684f3330397Srpaulo 		 * counter.
685f3330397Srpaulo 		 */
686f3330397Srpaulo 		tp->t_dupacks = 0;
687f3330397Srpaulo 	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
688f3330397Srpaulo 		/*
689f3330397Srpaulo 		 * This is a partial ack.  Retransmit the first unacknowledged
690f3330397Srpaulo 		 * segment and deflate the congestion window by the amount of
691f3330397Srpaulo 		 * acknowledged data.  Do not exit fast recovery.
692f3330397Srpaulo 		 */
693f3330397Srpaulo 		tcp_seq onxt = tp->snd_nxt;
694f3330397Srpaulo 		u_long ocwnd = tp->snd_cwnd;
6954d4f2b7dSkefren 		int sack_num_segs = 1, sack_bytes_rxmt = 0;
696f3330397Srpaulo 
697f3330397Srpaulo 		/*
698f3330397Srpaulo 		 * snd_una has not yet been updated and the socket's send
699f3330397Srpaulo 		 * buffer has not yet drained off the ACK'd data, so we
700f3330397Srpaulo 		 * have to leave snd_una as it was to get the correct data
701f3330397Srpaulo 		 * offset in tcp_output().
702f3330397Srpaulo 		 */
7034d4f2b7dSkefren 		tp->t_partialacks++;
704f3330397Srpaulo 		TCP_TIMER_DISARM(tp, TCPT_REXMT);
705f3330397Srpaulo 		tp->t_rtttime = 0;
7064d4f2b7dSkefren 
7074d4f2b7dSkefren 		if (TCP_SACK_ENABLED(tp)) {
708f3330397Srpaulo 			/*
7094d4f2b7dSkefren 			 * Partial ack handling within a sack recovery episode.
7104d4f2b7dSkefren 			 * Keeping this very simple for now. When a partial ack
7114d4f2b7dSkefren 			 * is received, force snd_cwnd to a value that will
7124d4f2b7dSkefren 			 * allow the sender to transmit no more than 2 segments.
7134d4f2b7dSkefren 			 * If necessary, a fancier scheme can be adopted at a
7144d4f2b7dSkefren 			 * later point, but for now, the goal is to prevent the
7154d4f2b7dSkefren 			 * sender from bursting a large amount of data in the
7164d4f2b7dSkefren 			 * midst of sack recovery.
7174d4f2b7dSkefren 		 	 */
7184d4f2b7dSkefren 
7194d4f2b7dSkefren 			/*
7204d4f2b7dSkefren 			 * send one or 2 segments based on how much
7214d4f2b7dSkefren 			 * new data was acked
7224d4f2b7dSkefren 			 */
7234d4f2b7dSkefren 			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
7244d4f2b7dSkefren 				sack_num_segs = 2;
7254d4f2b7dSkefren 			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
7264d4f2b7dSkefren 			tp->snd_cwnd = sack_bytes_rxmt +
7274d4f2b7dSkefren 			    (tp->snd_nxt - tp->sack_newdata) +
7284d4f2b7dSkefren 			    sack_num_segs * tp->t_segsz;
7294d4f2b7dSkefren 			tp->t_flags |= TF_ACKNOW;
7304d4f2b7dSkefren 			(void) tcp_output(tp);
7314d4f2b7dSkefren 		} else {
7324c7fdffbSskrll 			tp->snd_nxt = th->th_ack;
7334d4f2b7dSkefren 			/*
7344d4f2b7dSkefren 			 * Set snd_cwnd to one segment beyond ACK'd offset
7354d4f2b7dSkefren 			 * snd_una is not yet updated when we're called
736f3330397Srpaulo 			 */
737f3330397Srpaulo 			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
738f3330397Srpaulo 			(void) tcp_output(tp);
739f3330397Srpaulo 			tp->snd_cwnd = ocwnd;
740f3330397Srpaulo 			if (SEQ_GT(onxt, tp->snd_nxt))
741f3330397Srpaulo 				tp->snd_nxt = onxt;
742f3330397Srpaulo 			/*
7434d4f2b7dSkefren 			 * Partial window deflation.  Relies on fact that
7444d4f2b7dSkefren 			 * tp->snd_una not updated yet.
745f3330397Srpaulo 		 	 */
7464d4f2b7dSkefren 			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
7474d4f2b7dSkefren 			    tp->t_segsz);
7484d4f2b7dSkefren 		}
749f3330397Srpaulo 	} else {
750f3330397Srpaulo 		/*
751f3330397Srpaulo 		 * Complete ack.  Inflate the congestion window to ssthresh
752f3330397Srpaulo 		 * and exit fast recovery.
753f3330397Srpaulo 		 *
754f3330397Srpaulo 		 * Window inflation should have left us with approx.
755f3330397Srpaulo 		 * snd_ssthresh outstanding data.  But in case we
756f3330397Srpaulo 		 * would be inclined to send a burst, better to do
757f3330397Srpaulo 		 * it via the slow start mechanism.
758f3330397Srpaulo 		 */
759f3330397Srpaulo 		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
760f3330397Srpaulo 			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
761f3330397Srpaulo 			    + tp->t_segsz;
762f3330397Srpaulo 		else
763f3330397Srpaulo 			tp->snd_cwnd = tp->snd_ssthresh;
764f3330397Srpaulo 		tp->t_partialacks = -1;
765f3330397Srpaulo 		tp->t_dupacks = 0;
76681463c93Syamt 		tp->t_bytes_acked = 0;
7674d4f2b7dSkefren 		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
7684d4f2b7dSkefren 			tp->snd_fack = th->th_ack;
769f3330397Srpaulo 	}
770f3330397Srpaulo }
771f3330397Srpaulo 
772f3330397Srpaulo static void
tcp_newreno_newack(struct tcpcb * tp,const struct tcphdr * th)7737253aad9Syamt tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
774f3330397Srpaulo {
775f3330397Srpaulo 	/*
776e1b1f65fSrpaulo 	 * If we are still in fast recovery (meaning we are using
777e1b1f65fSrpaulo 	 * NewReno and we have only received partial acks), do not
778e1b1f65fSrpaulo 	 * inflate the window yet.
779f3330397Srpaulo 	 */
780e1b1f65fSrpaulo 	if (tp->t_partialacks < 0)
781e1b1f65fSrpaulo 		tcp_reno_newack(tp, th);
782f3330397Srpaulo }
783f3330397Srpaulo 
784f3330397Srpaulo 
785a34217b8Smatt const struct tcp_congctl tcp_newreno_ctl = {
786f3330397Srpaulo 	.fast_retransmit = tcp_newreno_fast_retransmit,
787f3330397Srpaulo 	.slow_retransmit = tcp_reno_slow_retransmit,
788f3330397Srpaulo 	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
789f3330397Srpaulo 	.newack = tcp_newreno_newack,
790a70594d3Srpaulo 	.cong_exp = tcp_reno_congestion_exp,
791f3330397Srpaulo };
792f3330397Srpaulo 
7934d4f2b7dSkefren /*
7944d4f2b7dSkefren  * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
7954d4f2b7dSkefren  */
796f3330397Srpaulo 
7974d4f2b7dSkefren /* Cubic prototypes */
7984d4f2b7dSkefren static void	tcp_cubic_update_ctime(struct tcpcb *tp);
7994d4f2b7dSkefren static uint32_t	tcp_cubic_diff_ctime(struct tcpcb *);
8004d4f2b7dSkefren static uint32_t	tcp_cubic_cbrt(uint32_t);
801be55f323Skefren static ulong	tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);
8024d4f2b7dSkefren 
8034d4f2b7dSkefren /* Cubic TIME functions - XXX I don't like using timevals and microuptime */
8044d4f2b7dSkefren /*
8054d4f2b7dSkefren  * Set congestion timer to now
8064d4f2b7dSkefren  */
8074d4f2b7dSkefren static void
tcp_cubic_update_ctime(struct tcpcb * tp)8084d4f2b7dSkefren tcp_cubic_update_ctime(struct tcpcb *tp)
8094d4f2b7dSkefren {
8104d4f2b7dSkefren 	struct timeval now_timeval;
8114d4f2b7dSkefren 
8124d4f2b7dSkefren 	getmicrouptime(&now_timeval);
8134d4f2b7dSkefren 	tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
8144d4f2b7dSkefren 	    now_timeval.tv_usec / 1000;
8154d4f2b7dSkefren }
8164d4f2b7dSkefren 
8174d4f2b7dSkefren /*
818*e4ebea9eSandvar  * milliseconds from last congestion
8194d4f2b7dSkefren  */
8204d4f2b7dSkefren static uint32_t
tcp_cubic_diff_ctime(struct tcpcb * tp)8214d4f2b7dSkefren tcp_cubic_diff_ctime(struct tcpcb *tp)
8224d4f2b7dSkefren {
8234d4f2b7dSkefren 	struct timeval now_timeval;
8244d4f2b7dSkefren 
8254d4f2b7dSkefren 	getmicrouptime(&now_timeval);
8264d4f2b7dSkefren 	return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
8274d4f2b7dSkefren 	    tp->snd_cubic_ctime;
8284d4f2b7dSkefren }
8294d4f2b7dSkefren 
/*
 * Approximate integer cube root of v.
 *
 * A fixed number of Newton iterations is run from the starting guess
 * v / 3.  0 maps to 0 and 1..3 map to 1, because the iteration does not
 * produce correct results for such small inputs.
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	uint64_t est;
	int left;

	if (v == 0)
		return 0;
	if (v < 4)
		return 1;

	est = v / 3;

	if (est <= 2097151) {
		/*
		 * 2097151 is the largest guess for which 2*x^3 (plus the
		 * small terms) still fits in 64 bits, so the combined
		 * fraction can be used directly.
		 */
		for (left = CBRT_ROUNDS; left > 0; left--)
			est = (v + 2 * est * est * est) / (3 * est * est);
	} else {
		/*
		 * Larger guess: split the fraction so that x^3 is never
		 * formed, and spend ten extra rounds.
		 */
		for (left = CBRT_ROUNDS + 10; left > 0; left--)
			est = v / (3 * est * est) + 2 * est / 3;
	}

	return (uint32_t)est;
}
8624d4f2b7dSkefren 
863be55f323Skefren /* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
864be55f323Skefren static ulong
tcp_cubic_getW(struct tcpcb * tp,uint32_t ms_elapsed,uint32_t rtt)865be55f323Skefren tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
8664d4f2b7dSkefren {
867be55f323Skefren 	uint32_t K;
868be55f323Skefren 	long tK3;
8694d4f2b7dSkefren 
870be55f323Skefren 	/* Section 3.1 Eq. 2 */
871be55f323Skefren 	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
8724d4f2b7dSkefren 	    CUBIC_CB / CUBIC_CA);
873be55f323Skefren 	/*  (t-K)^3 - not clear why is the measure unit mattering */
874be55f323Skefren 	tK3 = (long)(ms_elapsed + rtt) - (long)K;
875be55f323Skefren 	tK3 = tK3 * tK3 * tK3;
8764d4f2b7dSkefren 
877be55f323Skefren 	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
8784d4f2b7dSkefren }
8794d4f2b7dSkefren 
8804d4f2b7dSkefren static void
tcp_cubic_congestion_exp(struct tcpcb * tp)8814d4f2b7dSkefren tcp_cubic_congestion_exp(struct tcpcb *tp)
8824d4f2b7dSkefren {
8834d4f2b7dSkefren 
884be55f323Skefren 	/*
885be55f323Skefren 	 * Congestion - Set WMax and shrink cwnd
886be55f323Skefren 	 */
8874d4f2b7dSkefren 	tcp_cubic_update_ctime(tp);
8884d4f2b7dSkefren 
8894d4f2b7dSkefren 	/* Section 3.6 - Fast Convergence */
8904d4f2b7dSkefren 	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
8914d4f2b7dSkefren 		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
8924d4f2b7dSkefren 		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
8934d4f2b7dSkefren 		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
8944d4f2b7dSkefren 	} else {
8954d4f2b7dSkefren 		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
8964d4f2b7dSkefren 		tp->snd_cubic_wmax = tp->snd_cwnd;
8974d4f2b7dSkefren 	}
898be55f323Skefren 
899d1579b2dSriastradh 	tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);
900be55f323Skefren 
901be55f323Skefren 	/* Shrink CWND */
9024d4f2b7dSkefren 	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
9034d4f2b7dSkefren }
9044d4f2b7dSkefren 
9054d4f2b7dSkefren static int
tcp_cubic_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)9064d4f2b7dSkefren tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
9074d4f2b7dSkefren {
9084d4f2b7dSkefren 
9094d4f2b7dSkefren 	if (SEQ_LT(th->th_ack, tp->snd_high)) {
9104d4f2b7dSkefren 		/* See newreno */
9114d4f2b7dSkefren 		tp->t_dupacks = 0;
9124d4f2b7dSkefren 		return 1;
9134d4f2b7dSkefren 	}
9144d4f2b7dSkefren 
9154d4f2b7dSkefren 	/*
916be55f323Skefren 	 * mark WMax
9174d4f2b7dSkefren 	 */
9184d4f2b7dSkefren 	tcp_cubic_congestion_exp(tp);
919be55f323Skefren 
920be55f323Skefren 	/* Do fast retransmit */
9214d4f2b7dSkefren 	return tcp_reno_do_fast_retransmit(tp, th);
9224d4f2b7dSkefren }
9234d4f2b7dSkefren 
9244d4f2b7dSkefren static void
tcp_cubic_newack(struct tcpcb * tp,const struct tcphdr * th)9254d4f2b7dSkefren tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
9264d4f2b7dSkefren {
9274d4f2b7dSkefren 	uint32_t ms_elapsed, rtt;
9284d4f2b7dSkefren 	u_long w_tcp;
9294d4f2b7dSkefren 
930be55f323Skefren 	/* Congestion avoidance and not in fast recovery and usable rtt */
931be55f323Skefren 	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
932be55f323Skefren 	    /*
933be55f323Skefren 	     * t_srtt is 1/32 units of slow ticks
934be55f323Skefren 	     * converting it in ms would be equal to
935be55f323Skefren 	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
936be55f323Skefren 	     */
937be55f323Skefren 	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
9384d4f2b7dSkefren 		ms_elapsed = tcp_cubic_diff_ctime(tp);
9394d4f2b7dSkefren 
940be55f323Skefren 		/* Compute W_tcp(t) */
941be55f323Skefren 		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
9424d4f2b7dSkefren 		    ms_elapsed / rtt / 3;
9434d4f2b7dSkefren 
9444d4f2b7dSkefren 		if (tp->snd_cwnd > w_tcp) {
945be55f323Skefren 			/* Not in TCP friendly mode */
946be55f323Skefren 			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
947be55f323Skefren 			    tp->snd_cwnd) / tp->snd_cwnd;
9484d4f2b7dSkefren 		} else {
9494d4f2b7dSkefren 			/* friendly TCP mode */
9504d4f2b7dSkefren 			tp->snd_cwnd = w_tcp;
9514d4f2b7dSkefren 		}
9524d4f2b7dSkefren 
9534d4f2b7dSkefren 		/* Make sure we are within limits */
954d1579b2dSriastradh 		tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
955d1579b2dSriastradh 		tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
9564d4f2b7dSkefren 	} else {
9574d4f2b7dSkefren 		/* Use New Reno */
9584d4f2b7dSkefren 		tcp_newreno_newack(tp, th);
9594d4f2b7dSkefren 	}
9604d4f2b7dSkefren }
9614d4f2b7dSkefren 
9624d4f2b7dSkefren static void
tcp_cubic_slow_retransmit(struct tcpcb * tp)9634d4f2b7dSkefren tcp_cubic_slow_retransmit(struct tcpcb *tp)
9644d4f2b7dSkefren {
9654d4f2b7dSkefren 
966be55f323Skefren 	/* Timeout - Mark new congestion */
967be55f323Skefren 	tcp_cubic_congestion_exp(tp);
9684d4f2b7dSkefren 
969be55f323Skefren 	/* Loss Window MUST be one segment. */
970be55f323Skefren 	tp->snd_cwnd = tp->t_segsz;
971be55f323Skefren 	tp->t_partialacks = -1;
972be55f323Skefren 	tp->t_dupacks = 0;
973be55f323Skefren 	tp->t_bytes_acked = 0;
974be55f323Skefren 
975be55f323Skefren 	if (TCP_ECN_ALLOWED(tp))
976be55f323Skefren 		tp->t_flags |= TF_ECN_SND_CWR;
9774d4f2b7dSkefren }
9784d4f2b7dSkefren 
9794d4f2b7dSkefren const struct tcp_congctl tcp_cubic_ctl = {
9804d4f2b7dSkefren 	.fast_retransmit = tcp_cubic_fast_retransmit,
9814d4f2b7dSkefren 	.slow_retransmit = tcp_cubic_slow_retransmit,
9824d4f2b7dSkefren 	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
9834d4f2b7dSkefren 	.newack = tcp_cubic_newack,
9844d4f2b7dSkefren 	.cong_exp = tcp_cubic_congestion_exp,
9854d4f2b7dSkefren };
986