1*e4ebea9eSandvar /* $NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $ */
2f3330397Srpaulo
3f3330397Srpaulo /*-
4f3330397Srpaulo * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
5f3330397Srpaulo * All rights reserved.
6f3330397Srpaulo *
7f3330397Srpaulo * This code is derived from software contributed to The NetBSD Foundation
8f3330397Srpaulo * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
9f3330397Srpaulo * Facility, NASA Ames Research Center.
10f3330397Srpaulo * This code is derived from software contributed to The NetBSD Foundation
11f3330397Srpaulo * by Charles M. Hannum.
12f3330397Srpaulo * This code is derived from software contributed to The NetBSD Foundation
13f3330397Srpaulo * by Rui Paulo.
14f3330397Srpaulo *
15f3330397Srpaulo * Redistribution and use in source and binary forms, with or without
16f3330397Srpaulo * modification, are permitted provided that the following conditions
17f3330397Srpaulo * are met:
18f3330397Srpaulo * 1. Redistributions of source code must retain the above copyright
19f3330397Srpaulo * notice, this list of conditions and the following disclaimer.
20f3330397Srpaulo * 2. Redistributions in binary form must reproduce the above copyright
21f3330397Srpaulo * notice, this list of conditions and the following disclaimer in the
22f3330397Srpaulo * documentation and/or other materials provided with the distribution.
23f3330397Srpaulo *
24f3330397Srpaulo * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
25f3330397Srpaulo * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
26f3330397Srpaulo * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
27f3330397Srpaulo * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
28f3330397Srpaulo * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29f3330397Srpaulo * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30f3330397Srpaulo * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31f3330397Srpaulo * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32f3330397Srpaulo * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33f3330397Srpaulo * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34f3330397Srpaulo * POSSIBILITY OF SUCH DAMAGE.
35f3330397Srpaulo */
36f3330397Srpaulo
37f3330397Srpaulo /*
38f3330397Srpaulo * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
39f3330397Srpaulo * All rights reserved.
40f3330397Srpaulo *
41f3330397Srpaulo * Redistribution and use in source and binary forms, with or without
42f3330397Srpaulo * modification, are permitted provided that the following conditions
43f3330397Srpaulo * are met:
44f3330397Srpaulo * 1. Redistributions of source code must retain the above copyright
45f3330397Srpaulo * notice, this list of conditions and the following disclaimer.
46f3330397Srpaulo * 2. Redistributions in binary form must reproduce the above copyright
47f3330397Srpaulo * notice, this list of conditions and the following disclaimer in the
48f3330397Srpaulo * documentation and/or other materials provided with the distribution.
49f3330397Srpaulo * 3. Neither the name of the project nor the names of its contributors
50f3330397Srpaulo * may be used to endorse or promote products derived from this software
51f3330397Srpaulo * without specific prior written permission.
52f3330397Srpaulo *
53f3330397Srpaulo * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
54f3330397Srpaulo * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55f3330397Srpaulo * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56f3330397Srpaulo * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
57f3330397Srpaulo * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58f3330397Srpaulo * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59f3330397Srpaulo * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60f3330397Srpaulo * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61f3330397Srpaulo * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62f3330397Srpaulo * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63f3330397Srpaulo * SUCH DAMAGE.
64f3330397Srpaulo */
65f3330397Srpaulo
66f3330397Srpaulo /*
67f3330397Srpaulo * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
68f3330397Srpaulo *
69f3330397Srpaulo * NRL grants permission for redistribution and use in source and binary
70f3330397Srpaulo * forms, with or without modification, of the software and documentation
71f3330397Srpaulo * created at NRL provided that the following conditions are met:
72f3330397Srpaulo *
73f3330397Srpaulo * 1. Redistributions of source code must retain the above copyright
74f3330397Srpaulo * notice, this list of conditions and the following disclaimer.
75f3330397Srpaulo * 2. Redistributions in binary form must reproduce the above copyright
76f3330397Srpaulo * notice, this list of conditions and the following disclaimer in the
77f3330397Srpaulo * documentation and/or other materials provided with the distribution.
78f3330397Srpaulo * 3. All advertising materials mentioning features or use of this software
79f3330397Srpaulo * must display the following acknowledgements:
80f3330397Srpaulo * This product includes software developed by the University of
81f3330397Srpaulo * California, Berkeley and its contributors.
82f3330397Srpaulo * This product includes software developed at the Information
83f3330397Srpaulo * Technology Division, US Naval Research Laboratory.
84f3330397Srpaulo * 4. Neither the name of the NRL nor the names of its contributors
85f3330397Srpaulo * may be used to endorse or promote products derived from this software
86f3330397Srpaulo * without specific prior written permission.
87f3330397Srpaulo *
88f3330397Srpaulo * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
89f3330397Srpaulo * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
90f3330397Srpaulo * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
91f3330397Srpaulo * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
92f3330397Srpaulo * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
93f3330397Srpaulo * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
94f3330397Srpaulo * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
95f3330397Srpaulo * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
96f3330397Srpaulo * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
97f3330397Srpaulo * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
98f3330397Srpaulo * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
99f3330397Srpaulo *
100f3330397Srpaulo * The views and conclusions contained in the software and documentation
101f3330397Srpaulo * are those of the authors and should not be interpreted as representing
102f3330397Srpaulo * official policies, either expressed or implied, of the US Naval
103f3330397Srpaulo * Research Laboratory (NRL).
104f3330397Srpaulo */
105f3330397Srpaulo
106f3330397Srpaulo /*
107f3330397Srpaulo * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
108f3330397Srpaulo * The Regents of the University of California. All rights reserved.
109f3330397Srpaulo *
110f3330397Srpaulo * Redistribution and use in source and binary forms, with or without
111f3330397Srpaulo * modification, are permitted provided that the following conditions
112f3330397Srpaulo * are met:
113f3330397Srpaulo * 1. Redistributions of source code must retain the above copyright
114f3330397Srpaulo * notice, this list of conditions and the following disclaimer.
115f3330397Srpaulo * 2. Redistributions in binary form must reproduce the above copyright
116f3330397Srpaulo * notice, this list of conditions and the following disclaimer in the
117f3330397Srpaulo * documentation and/or other materials provided with the distribution.
118f3330397Srpaulo * 3. Neither the name of the University nor the names of its contributors
119f3330397Srpaulo * may be used to endorse or promote products derived from this software
120f3330397Srpaulo * without specific prior written permission.
121f3330397Srpaulo *
122f3330397Srpaulo * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
123f3330397Srpaulo * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
124f3330397Srpaulo * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
125f3330397Srpaulo * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
126f3330397Srpaulo * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
127f3330397Srpaulo * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
128f3330397Srpaulo * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
129f3330397Srpaulo * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
130f3330397Srpaulo * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
131f3330397Srpaulo * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
132f3330397Srpaulo * SUCH DAMAGE.
133f3330397Srpaulo *
134f3330397Srpaulo * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
135f3330397Srpaulo */
136f3330397Srpaulo
137f3330397Srpaulo #include <sys/cdefs.h>
138*e4ebea9eSandvar __KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $");
139f3330397Srpaulo
1401c4a50f1Spooka #ifdef _KERNEL_OPT
141f3330397Srpaulo #include "opt_inet.h"
142f3330397Srpaulo #include "opt_tcp_debug.h"
143f3330397Srpaulo #include "opt_tcp_congctl.h"
1441c4a50f1Spooka #endif
145f3330397Srpaulo
146f3330397Srpaulo #include <sys/param.h>
147f3330397Srpaulo #include <sys/systm.h>
148f3330397Srpaulo #include <sys/malloc.h>
149f3330397Srpaulo #include <sys/mbuf.h>
150f3330397Srpaulo #include <sys/protosw.h>
151f3330397Srpaulo #include <sys/socket.h>
152f3330397Srpaulo #include <sys/socketvar.h>
153f3330397Srpaulo #include <sys/errno.h>
154f3330397Srpaulo #include <sys/syslog.h>
155f3330397Srpaulo #include <sys/pool.h>
156f3330397Srpaulo #include <sys/domain.h>
157f3330397Srpaulo #include <sys/kernel.h>
15848e23b4aSxtraeme #include <sys/mutex.h>
159f3330397Srpaulo
160f3330397Srpaulo #include <net/if.h>
161f3330397Srpaulo
162f3330397Srpaulo #include <netinet/in.h>
163f3330397Srpaulo #include <netinet/in_systm.h>
164f3330397Srpaulo #include <netinet/ip.h>
165f3330397Srpaulo #include <netinet/in_pcb.h>
166f3330397Srpaulo #include <netinet/in_var.h>
167f3330397Srpaulo #include <netinet/ip_var.h>
168f3330397Srpaulo
169f3330397Srpaulo #ifdef INET6
170f3330397Srpaulo #include <netinet/ip6.h>
171f3330397Srpaulo #include <netinet6/ip6_var.h>
172f3330397Srpaulo #include <netinet6/in6_pcb.h>
173f3330397Srpaulo #include <netinet6/ip6_var.h>
174f3330397Srpaulo #include <netinet6/in6_var.h>
175f3330397Srpaulo #include <netinet/icmp6.h>
176f3330397Srpaulo #endif
177f3330397Srpaulo
178f3330397Srpaulo #include <netinet/tcp.h>
179f3330397Srpaulo #include <netinet/tcp_fsm.h>
180f3330397Srpaulo #include <netinet/tcp_seq.h>
181f3330397Srpaulo #include <netinet/tcp_timer.h>
182f3330397Srpaulo #include <netinet/tcp_var.h>
183f3330397Srpaulo #include <netinet/tcp_congctl.h>
184f3330397Srpaulo #ifdef TCP_DEBUG
185f3330397Srpaulo #include <netinet/tcp_debug.h>
186f3330397Srpaulo #endif
187f3330397Srpaulo
188f3330397Srpaulo /*
189f3330397Srpaulo * TODO:
190f3330397Srpaulo * consider separating the actual implementations in another file.
191f3330397Srpaulo */
192f3330397Srpaulo
1934d4f2b7dSkefren static void tcp_common_congestion_exp(struct tcpcb *, int, int);
1944d4f2b7dSkefren
1954d4f2b7dSkefren static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
1967253aad9Syamt static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
197f3330397Srpaulo static void tcp_reno_slow_retransmit(struct tcpcb *);
1987253aad9Syamt static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
1997253aad9Syamt const struct tcphdr *);
2007253aad9Syamt static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
201a70594d3Srpaulo static void tcp_reno_congestion_exp(struct tcpcb *tp);
202f3330397Srpaulo
2037253aad9Syamt static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
204f3330397Srpaulo static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
2057253aad9Syamt const struct tcphdr *);
2067253aad9Syamt static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
207f3330397Srpaulo
2084d4f2b7dSkefren static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
2094d4f2b7dSkefren static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
2104d4f2b7dSkefren static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
2114d4f2b7dSkefren static void tcp_cubic_congestion_exp(struct tcpcb *);
212f3330397Srpaulo
213f3330397Srpaulo static void tcp_congctl_fillnames(void);
214f3330397Srpaulo
215f3330397Srpaulo extern int tcprexmtthresh;
216f3330397Srpaulo
217f3330397Srpaulo MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
218f3330397Srpaulo
219a34217b8Smatt /* currently selected global congestion control */
220a34217b8Smatt char tcp_congctl_global_name[TCPCC_MAXLEN];
221a34217b8Smatt
222a34217b8Smatt /* available global congestion control algorithms */
223a34217b8Smatt char tcp_congctl_avail[10 * TCPCC_MAXLEN];
224a34217b8Smatt
225f3330397Srpaulo /*
226f3330397Srpaulo * Used to list the available congestion control algorithms.
227f3330397Srpaulo */
228a34217b8Smatt TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
229a34217b8Smatt TAILQ_HEAD_INITIALIZER(tcp_congctlhd);
230a34217b8Smatt
231a34217b8Smatt static struct tcp_congctlent * tcp_congctl_global;
232f3330397Srpaulo
23348e23b4aSxtraeme static kmutex_t tcp_congctl_mtx;
234f3330397Srpaulo
235f3330397Srpaulo void
tcp_congctl_init(void)236f3330397Srpaulo tcp_congctl_init(void)
237f3330397Srpaulo {
2387c79fd6cSmartin int r __diagused;
239f3330397Srpaulo
24048e23b4aSxtraeme mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);
241f3330397Srpaulo
242f3330397Srpaulo /* Base algorithms. */
243f3330397Srpaulo r = tcp_congctl_register("reno", &tcp_reno_ctl);
244f3330397Srpaulo KASSERT(r == 0);
245f3330397Srpaulo r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
246f3330397Srpaulo KASSERT(r == 0);
2474d4f2b7dSkefren r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
2484d4f2b7dSkefren KASSERT(r == 0);
249f3330397Srpaulo
250f3330397Srpaulo /* NewReno is the default. */
251f3330397Srpaulo #ifndef TCP_CONGCTL_DEFAULT
252f3330397Srpaulo #define TCP_CONGCTL_DEFAULT "newreno"
253f3330397Srpaulo #endif
254f3330397Srpaulo
255f3330397Srpaulo r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
256f3330397Srpaulo KASSERT(r == 0);
257f3330397Srpaulo }
258f3330397Srpaulo
259f3330397Srpaulo /*
260f3330397Srpaulo * Register a congestion algorithm and select it if we have none.
261f3330397Srpaulo */
262f3330397Srpaulo int
tcp_congctl_register(const char * name,const struct tcp_congctl * tcc)263a34217b8Smatt tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
264f3330397Srpaulo {
265f3330397Srpaulo struct tcp_congctlent *ntcc, *tccp;
266f3330397Srpaulo
267f3330397Srpaulo TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
268f3330397Srpaulo if (!strcmp(name, tccp->congctl_name)) {
269f3330397Srpaulo /* name already registered */
270f3330397Srpaulo return EEXIST;
271f3330397Srpaulo }
272f3330397Srpaulo
273a34217b8Smatt ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);
274f3330397Srpaulo
275f3330397Srpaulo strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
276f3330397Srpaulo ntcc->congctl_ctl = tcc;
277f3330397Srpaulo
278f3330397Srpaulo TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
279f3330397Srpaulo tcp_congctl_fillnames();
280f3330397Srpaulo
281f3330397Srpaulo if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
282f3330397Srpaulo tcp_congctl_select(NULL, name);
283f3330397Srpaulo
284f3330397Srpaulo return 0;
285f3330397Srpaulo }
286f3330397Srpaulo
287f3330397Srpaulo int
tcp_congctl_unregister(const char * name)288f3330397Srpaulo tcp_congctl_unregister(const char *name)
289f3330397Srpaulo {
290f3330397Srpaulo struct tcp_congctlent *tccp, *rtccp;
291f3330397Srpaulo unsigned int size;
292f3330397Srpaulo
293f3330397Srpaulo rtccp = NULL;
294f3330397Srpaulo size = 0;
295f3330397Srpaulo TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
296f3330397Srpaulo if (!strcmp(name, tccp->congctl_name))
297f3330397Srpaulo rtccp = tccp;
298f3330397Srpaulo size++;
299f3330397Srpaulo }
300f3330397Srpaulo
301f3330397Srpaulo if (!rtccp)
302f3330397Srpaulo return ENOENT;
303f3330397Srpaulo
304a34217b8Smatt if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
305f3330397Srpaulo return EBUSY;
306f3330397Srpaulo
307f3330397Srpaulo TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
308f3330397Srpaulo free(rtccp, M_TCPCONGCTL);
309f3330397Srpaulo tcp_congctl_fillnames();
310f3330397Srpaulo
311f3330397Srpaulo return 0;
312f3330397Srpaulo }
313f3330397Srpaulo
314f3330397Srpaulo /*
315f3330397Srpaulo * Select a congestion algorithm by name.
316f3330397Srpaulo */
317f3330397Srpaulo int
tcp_congctl_select(struct tcpcb * tp,const char * name)318f3330397Srpaulo tcp_congctl_select(struct tcpcb *tp, const char *name)
319f3330397Srpaulo {
320a34217b8Smatt struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
321a34217b8Smatt bool old_found, new_found;
322f3330397Srpaulo
323f3330397Srpaulo KASSERT(name);
324f3330397Srpaulo
325a34217b8Smatt old_found = (tp == NULL || tp->t_congctl == NULL);
326a34217b8Smatt old_tccp = NULL;
327a34217b8Smatt new_found = false;
328a34217b8Smatt new_tccp = NULL;
329a34217b8Smatt
330a34217b8Smatt TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
331a34217b8Smatt if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
332a34217b8Smatt old_tccp = tccp;
333a34217b8Smatt old_found = true;
334a34217b8Smatt }
335a34217b8Smatt
336a34217b8Smatt if (!new_found && !strcmp(name, tccp->congctl_name)) {
337a34217b8Smatt new_tccp = tccp;
338a34217b8Smatt new_found = true;
339a34217b8Smatt }
340a34217b8Smatt
341a34217b8Smatt if (new_found && old_found) {
342f3330397Srpaulo if (tp) {
34348e23b4aSxtraeme mutex_enter(&tcp_congctl_mtx);
344a34217b8Smatt if (old_tccp)
345a34217b8Smatt old_tccp->congctl_refcnt--;
346a34217b8Smatt tp->t_congctl = new_tccp->congctl_ctl;
347a34217b8Smatt new_tccp->congctl_refcnt++;
34848e23b4aSxtraeme mutex_exit(&tcp_congctl_mtx);
349f3330397Srpaulo } else {
350a34217b8Smatt tcp_congctl_global = new_tccp;
351f3330397Srpaulo strlcpy(tcp_congctl_global_name,
352a34217b8Smatt new_tccp->congctl_name,
353f3330397Srpaulo sizeof(tcp_congctl_global_name) - 1);
354f3330397Srpaulo }
355f3330397Srpaulo return 0;
356f3330397Srpaulo }
357a34217b8Smatt }
358f3330397Srpaulo
359f3330397Srpaulo return EINVAL;
360f3330397Srpaulo }
361f3330397Srpaulo
362a34217b8Smatt void
tcp_congctl_release(struct tcpcb * tp)363a34217b8Smatt tcp_congctl_release(struct tcpcb *tp)
364a34217b8Smatt {
365a34217b8Smatt struct tcp_congctlent *tccp;
366a34217b8Smatt
367a34217b8Smatt KASSERT(tp->t_congctl);
368a34217b8Smatt
369a34217b8Smatt TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
370a34217b8Smatt if (tccp->congctl_ctl == tp->t_congctl) {
371a34217b8Smatt tccp->congctl_refcnt--;
372a34217b8Smatt return;
373a34217b8Smatt }
374a34217b8Smatt }
375a34217b8Smatt }
376a34217b8Smatt
377f3330397Srpaulo /*
378f3330397Srpaulo * Returns the name of a congestion algorithm.
379f3330397Srpaulo */
380f3330397Srpaulo const char *
tcp_congctl_bystruct(const struct tcp_congctl * tcc)381f3330397Srpaulo tcp_congctl_bystruct(const struct tcp_congctl *tcc)
382f3330397Srpaulo {
383f3330397Srpaulo struct tcp_congctlent *tccp;
384f3330397Srpaulo
385f3330397Srpaulo KASSERT(tcc);
386f3330397Srpaulo
387f3330397Srpaulo TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
388f3330397Srpaulo if (tccp->congctl_ctl == tcc)
389f3330397Srpaulo return tccp->congctl_name;
390f3330397Srpaulo
391f3330397Srpaulo return NULL;
392f3330397Srpaulo }
393f3330397Srpaulo
394f3330397Srpaulo static void
tcp_congctl_fillnames(void)395f3330397Srpaulo tcp_congctl_fillnames(void)
396f3330397Srpaulo {
397f3330397Srpaulo struct tcp_congctlent *tccp;
398f3330397Srpaulo const char *delim = " ";
399f3330397Srpaulo
400f3330397Srpaulo tcp_congctl_avail[0] = '\0';
401f3330397Srpaulo TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
402f3330397Srpaulo strlcat(tcp_congctl_avail, tccp->congctl_name,
403f3330397Srpaulo sizeof(tcp_congctl_avail) - 1);
404f3330397Srpaulo if (TAILQ_NEXT(tccp, congctl_ent))
405f3330397Srpaulo strlcat(tcp_congctl_avail, delim,
406f3330397Srpaulo sizeof(tcp_congctl_avail) - 1);
407f3330397Srpaulo }
408f3330397Srpaulo
409f3330397Srpaulo }
410f3330397Srpaulo
411f3330397Srpaulo /* ------------------------------------------------------------------------ */
412f3330397Srpaulo
413a70594d3Srpaulo /*
4144d4f2b7dSkefren * Common stuff
415a70594d3Srpaulo */
4164d4f2b7dSkefren
4174d4f2b7dSkefren /* Window reduction (1-beta) for [New]Reno: 0.5 */
4184d4f2b7dSkefren #define RENO_BETAA 1
4194d4f2b7dSkefren #define RENO_BETAB 2
4204d4f2b7dSkefren /* Window reduction (1-beta) for Cubic: 0.8 */
4214d4f2b7dSkefren #define CUBIC_BETAA 4
4224d4f2b7dSkefren #define CUBIC_BETAB 5
4234d4f2b7dSkefren /* Draft Rhee Section 4.1 */
4244d4f2b7dSkefren #define CUBIC_CA 4
4254d4f2b7dSkefren #define CUBIC_CB 10
4264d4f2b7dSkefren
427a70594d3Srpaulo static void
tcp_common_congestion_exp(struct tcpcb * tp,int betaa,int betab)4284d4f2b7dSkefren tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
429f3330397Srpaulo {
43053950d86Smsaitoh u_long win;
431f3330397Srpaulo
432f3330397Srpaulo /*
4334d4f2b7dSkefren * Reduce the congestion window and the slow start threshold.
434f3330397Srpaulo */
43553950d86Smsaitoh win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
436f3330397Srpaulo if (win < 2)
437f3330397Srpaulo win = 2;
438f3330397Srpaulo
439f3330397Srpaulo tp->snd_ssthresh = win * tp->t_segsz;
440f3330397Srpaulo tp->snd_recover = tp->snd_max;
441f3330397Srpaulo tp->snd_cwnd = tp->snd_ssthresh;
442f3330397Srpaulo
4431c1f230eSrpaulo /*
4441c1f230eSrpaulo * When using TCP ECN, notify the peer that
4451c1f230eSrpaulo * we reduced the cwnd.
4461c1f230eSrpaulo */
447f3330397Srpaulo if (TCP_ECN_ALLOWED(tp))
448f3330397Srpaulo tp->t_flags |= TF_ECN_SND_CWR;
449f3330397Srpaulo }
450f3330397Srpaulo
451f3330397Srpaulo
4524d4f2b7dSkefren /* ------------------------------------------------------------------------ */
4534d4f2b7dSkefren
4544d4f2b7dSkefren /*
4554d4f2b7dSkefren * TCP/Reno congestion control.
4564d4f2b7dSkefren */
4574d4f2b7dSkefren static void
tcp_reno_congestion_exp(struct tcpcb * tp)4584d4f2b7dSkefren tcp_reno_congestion_exp(struct tcpcb *tp)
4594d4f2b7dSkefren {
4604d4f2b7dSkefren
4614d4f2b7dSkefren tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
4624d4f2b7dSkefren }
463a70594d3Srpaulo
/*
 * Common fast-retransmit machinery, shared by Reno and NewReno.
 *
 * Dup acks mean that packets have left the network (they're now
 * cached at the receiver) so bump cwnd by the amount in the receiver
 * to keep a constant cwnd packets in the network.
 *
 * If we are using TCP/SACK, then enter Fast Recovery if the receiver
 * SACKs data that is tcprexmtthresh * MSS bytes past the last ACKed
 * segment, irrespective of the number of DupAcks.
 *
 * Returns 0 (the caller treats nonzero as "do not enter recovery").
 */
static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	tcp_seq onxt = tp->snd_nxt;	/* remember current send point */

	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;		/* abandon the current RTT sample */
	if (TCP_SACK_ENABLED(tp)) {
		/*
		 * SACK path: retransmit a single segment and let the
		 * SACK scoreboard drive further retransmissions.
		 */
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	/* Non-SACK: rewind to the missing segment and resend it ... */
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	/* ... then inflate cwnd by one segment per duplicate ACK seen. */
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;	/* restore the old send point */

	return 0;
}
502f3330397Srpaulo
/*
 * Reno fast retransmit: we know we're losing at the current window
 * size, so do congestion avoidance (set ssthresh to half the current
 * window and pull cwnd back to the new ssthresh), then run the common
 * fast-retransmit processing.
 */
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}
5184d4f2b7dSkefren
519f3330397Srpaulo static void
tcp_reno_slow_retransmit(struct tcpcb * tp)520f3330397Srpaulo tcp_reno_slow_retransmit(struct tcpcb *tp)
521f3330397Srpaulo {
52253950d86Smsaitoh u_long win;
523f3330397Srpaulo
524f3330397Srpaulo /*
525f3330397Srpaulo * Close the congestion window down to one segment
526f3330397Srpaulo * (we'll open it by one segment for each ack we get).
527f3330397Srpaulo * Since we probably have a window's worth of unacked
528f3330397Srpaulo * data accumulated, this "slow start" keeps us from
529f3330397Srpaulo * dumping all that data as back-to-back packets (which
530f3330397Srpaulo * might overwhelm an intermediate gateway).
531f3330397Srpaulo *
532f3330397Srpaulo * There are two phases to the opening: Initially we
533f3330397Srpaulo * open by one mss on each ack. This makes the window
534f3330397Srpaulo * size increase exponentially with time. If the
535f3330397Srpaulo * window is larger than the path can handle, this
536f3330397Srpaulo * exponential growth results in dropped packet(s)
537f3330397Srpaulo * almost immediately. To get more time between
538f3330397Srpaulo * drops but still "push" the network to take advantage
539f3330397Srpaulo * of improving conditions, we switch from exponential
54040be87aeSandvar * to linear window opening at some threshold size.
54140be87aeSandvar * For a threshold, we use half the current window
542f3330397Srpaulo * size, truncated to a multiple of the mss.
543f3330397Srpaulo *
544f3330397Srpaulo * (the minimum cwnd that will give us exponential
54540be87aeSandvar * growth is 2 mss. We don't allow the threshold
546f3330397Srpaulo * to go below this.)
547f3330397Srpaulo */
548f3330397Srpaulo
54953950d86Smsaitoh win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
550f3330397Srpaulo if (win < 2)
551f3330397Srpaulo win = 2;
552f3330397Srpaulo /* Loss Window MUST be one segment. */
553f3330397Srpaulo tp->snd_cwnd = tp->t_segsz;
554f3330397Srpaulo tp->snd_ssthresh = win * tp->t_segsz;
555f3330397Srpaulo tp->t_partialacks = -1;
556f3330397Srpaulo tp->t_dupacks = 0;
55781463c93Syamt tp->t_bytes_acked = 0;
5584d4f2b7dSkefren
5594d4f2b7dSkefren if (TCP_ECN_ALLOWED(tp))
5604d4f2b7dSkefren tp->t_flags |= TF_ECN_SND_CWR;
561f3330397Srpaulo }
562f3330397Srpaulo
563f3330397Srpaulo static void
tcp_reno_fast_retransmit_newack(struct tcpcb * tp,const struct tcphdr * th)5647253aad9Syamt tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
565168cd830Schristos const struct tcphdr *th)
566f3330397Srpaulo {
567f3330397Srpaulo if (tp->t_partialacks < 0) {
568f3330397Srpaulo /*
569f3330397Srpaulo * We were not in fast recovery. Reset the duplicate ack
570f3330397Srpaulo * counter.
571f3330397Srpaulo */
572f3330397Srpaulo tp->t_dupacks = 0;
573f3330397Srpaulo } else {
574f3330397Srpaulo /*
575f3330397Srpaulo * Clamp the congestion window to the crossover point and
576f3330397Srpaulo * exit fast recovery.
577f3330397Srpaulo */
578f3330397Srpaulo if (tp->snd_cwnd > tp->snd_ssthresh)
579f3330397Srpaulo tp->snd_cwnd = tp->snd_ssthresh;
580f3330397Srpaulo tp->t_partialacks = -1;
581f3330397Srpaulo tp->t_dupacks = 0;
58281463c93Syamt tp->t_bytes_acked = 0;
5834d4f2b7dSkefren if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
5844d4f2b7dSkefren tp->snd_fack = th->th_ack;
585f3330397Srpaulo }
586f3330397Srpaulo }
587f3330397Srpaulo
/*
 * Reno response to a new (advancing) ACK: open the congestion window,
 * using either RFC 3465 Appropriate Byte Counting or the classic
 * per-ACK packet counting rule, capped at the maximum scaled window.
 */
static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		/* Bytes newly acknowledged by this segment. */
		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			/* Congestion avoidance: one segsz per cwnd acked. */
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * use 2*SMSS or 1*SMSS for the "L" param,
			 * depending on sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = uimin(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	/* Never grow past the maximum advertisable (scaled) window. */
	tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
}
644f3330397Srpaulo
/* Method table for classic TCP/Reno congestion control. */
const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};
652f3330397Srpaulo
653f3330397Srpaulo /*
654f3330397Srpaulo * TCP/NewReno Congestion control.
655f3330397Srpaulo */
656f3330397Srpaulo static int
tcp_newreno_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)6577253aad9Syamt tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
658f3330397Srpaulo {
65918a0ef4aSyamt
660f3330397Srpaulo if (SEQ_LT(th->th_ack, tp->snd_high)) {
661f3330397Srpaulo /*
662f3330397Srpaulo * False fast retransmit after timeout.
663f3330397Srpaulo * Do not enter fast recovery
664f3330397Srpaulo */
665f3330397Srpaulo tp->t_dupacks = 0;
666f3330397Srpaulo return 1;
66718a0ef4aSyamt }
668f3330397Srpaulo /*
669f3330397Srpaulo * Fast retransmit is same as reno.
670f3330397Srpaulo */
671f3330397Srpaulo return tcp_reno_fast_retransmit(tp, th);
672f3330397Srpaulo }
673f3330397Srpaulo
674f3330397Srpaulo /*
675f3330397Srpaulo * Implement the NewReno response to a new ack, checking for partial acks in
676f3330397Srpaulo * fast recovery.
677f3330397Srpaulo */
/*
 * Implement the NewReno response to a new ack, checking for partial acks
 * in fast recovery.  Three cases: not in recovery at all, a partial ack
 * (below snd_recover), or a complete ack that ends the recovery episode.
 */
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
		/*
		 * This is a partial ack.  Retransmit the first unacknowledged
		 * segment and deflate the congestion window by the amount of
		 * acknowledged data.  Do not exit fast recovery.
		 */
		tcp_seq onxt = tp->snd_nxt;	/* saved to resume sending where we were */
		u_long ocwnd = tp->snd_cwnd;	/* saved to undo the temporary cwnd clamp */
		int sack_num_segs = 1, sack_bytes_rxmt = 0;

		/*
		 * snd_una has not yet been updated and the socket's send
		 * buffer has not yet drained off the ACK'd data, so we
		 * have to leave snd_una as it was to get the correct data
		 * offset in tcp_output().
		 */
		tp->t_partialacks++;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		/* Restart RTT measurement: the retransmit would bias it. */
		tp->t_rtttime = 0;

		if (TCP_SACK_ENABLED(tp)) {
			/*
			 * Partial ack handling within a sack recovery episode.
			 * Keeping this very simple for now.  When a partial ack
			 * is received, force snd_cwnd to a value that will
			 * allow the sender to transmit no more than 2 segments.
			 * If necessary, a fancier scheme can be adopted at a
			 * later point, but for now, the goal is to prevent the
			 * sender from bursting a large amount of data in the
			 * midst of sack recovery.
			 */

			/*
			 * send one or 2 segments based on how much
			 * new data was acked
			 */
			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
				sack_num_segs = 2;
			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
			/* cwnd = outstanding rexmt + new data + 1-2 segments */
			tp->snd_cwnd = sack_bytes_rxmt +
			    (tp->snd_nxt - tp->sack_newdata) +
			    sack_num_segs * tp->t_segsz;
			tp->t_flags |= TF_ACKNOW;
			(void) tcp_output(tp);
		} else {
			/* Force a retransmit of the first unacked segment. */
			tp->snd_nxt = th->th_ack;
			/*
			 * Set snd_cwnd to one segment beyond ACK'd offset
			 * snd_una is not yet updated when we're called
			 */
			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
			(void) tcp_output(tp);
			tp->snd_cwnd = ocwnd;
			if (SEQ_GT(onxt, tp->snd_nxt))
				tp->snd_nxt = onxt;
			/*
			 * Partial window deflation.  Relies on fact that
			 * tp->snd_una not updated yet.
			 */
			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
			    tp->t_segsz);
		}
	} else {
		/*
		 * Complete ack.  Inflate the congestion window to ssthresh
		 * and exit fast recovery.
		 *
		 * Window inflation should have left us with approx.
		 * snd_ssthresh outstanding data.  But in case we
		 * would be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
			    + tp->t_segsz;
		else
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		/* Keep the SACK forward-most mark consistent with the ACK. */
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}
771f3330397Srpaulo
772f3330397Srpaulo static void
tcp_newreno_newack(struct tcpcb * tp,const struct tcphdr * th)7737253aad9Syamt tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
774f3330397Srpaulo {
775f3330397Srpaulo /*
776e1b1f65fSrpaulo * If we are still in fast recovery (meaning we are using
777e1b1f65fSrpaulo * NewReno and we have only received partial acks), do not
778e1b1f65fSrpaulo * inflate the window yet.
779f3330397Srpaulo */
780e1b1f65fSrpaulo if (tp->t_partialacks < 0)
781e1b1f65fSrpaulo tcp_reno_newack(tp, th);
782f3330397Srpaulo }
783f3330397Srpaulo
784f3330397Srpaulo
/*
 * Method table for TCP/NewReno.  Only the fast-retransmit paths differ
 * from Reno; slow retransmit and the congestion response are shared.
 */
const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};
792f3330397Srpaulo
/*
 * CUBIC - draft-rhee-tcpm-cubic-02 (later published as RFC 8312):
 * https://datatracker.ietf.org/doc/html/draft-rhee-tcpm-cubic-02
 */
796f3330397Srpaulo
/* Cubic prototypes */
static void tcp_cubic_update_ctime(struct tcpcb *tp);	/* stamp the congestion epoch */
static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);	/* ms since the epoch */
static uint32_t tcp_cubic_cbrt(uint32_t);		/* approximate integer cube root */
static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);	/* cubic window W(t+rtt) */
8024d4f2b7dSkefren
8034d4f2b7dSkefren /* Cubic TIME functions - XXX I don't like using timevals and microuptime */
8044d4f2b7dSkefren /*
8054d4f2b7dSkefren * Set congestion timer to now
8064d4f2b7dSkefren */
8074d4f2b7dSkefren static void
tcp_cubic_update_ctime(struct tcpcb * tp)8084d4f2b7dSkefren tcp_cubic_update_ctime(struct tcpcb *tp)
8094d4f2b7dSkefren {
8104d4f2b7dSkefren struct timeval now_timeval;
8114d4f2b7dSkefren
8124d4f2b7dSkefren getmicrouptime(&now_timeval);
8134d4f2b7dSkefren tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
8144d4f2b7dSkefren now_timeval.tv_usec / 1000;
8154d4f2b7dSkefren }
8164d4f2b7dSkefren
8174d4f2b7dSkefren /*
818*e4ebea9eSandvar * milliseconds from last congestion
8194d4f2b7dSkefren */
8204d4f2b7dSkefren static uint32_t
tcp_cubic_diff_ctime(struct tcpcb * tp)8214d4f2b7dSkefren tcp_cubic_diff_ctime(struct tcpcb *tp)
8224d4f2b7dSkefren {
8234d4f2b7dSkefren struct timeval now_timeval;
8244d4f2b7dSkefren
8254d4f2b7dSkefren getmicrouptime(&now_timeval);
8264d4f2b7dSkefren return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
8274d4f2b7dSkefren tp->snd_cubic_ctime;
8284d4f2b7dSkefren }
8294d4f2b7dSkefren
/*
 * Approximate integer cube root via Newton's method on x^3 - v = 0.
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	uint64_t est;
	int iter;

	/* The iteration misbehaves for tiny inputs; answer directly. */
	if (v == 0)
		return 0;
	if (v < 4)
		return 1;

	est = v / 3;	/* initial guess */

	if (est > 2097151) {
		/*
		 * 2097151 is the largest x for which 2*x^3 + 3*x fits in
		 * 64 bits.  Use a rearranged, overflow-free update at the
		 * cost of extra rounds (integer division loses precision).
		 */
		for (iter = 0; iter < CBRT_ROUNDS + 10; iter++)
			est = v / (3 * est * est) + 2 * est / 3;
	} else {
		/* Standard Newton step: x' = (v + 2x^3) / (3x^2). */
		for (iter = 0; iter < CBRT_ROUNDS; iter++)
			est = (v + 2 * est * est * est) / (3 * est * est);
	}

	return (uint32_t)est;
}
8624d4f2b7dSkefren
863be55f323Skefren /* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
864be55f323Skefren static ulong
tcp_cubic_getW(struct tcpcb * tp,uint32_t ms_elapsed,uint32_t rtt)865be55f323Skefren tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
8664d4f2b7dSkefren {
867be55f323Skefren uint32_t K;
868be55f323Skefren long tK3;
8694d4f2b7dSkefren
870be55f323Skefren /* Section 3.1 Eq. 2 */
871be55f323Skefren K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
8724d4f2b7dSkefren CUBIC_CB / CUBIC_CA);
873be55f323Skefren /* (t-K)^3 - not clear why is the measure unit mattering */
874be55f323Skefren tK3 = (long)(ms_elapsed + rtt) - (long)K;
875be55f323Skefren tK3 = tK3 * tK3 * tK3;
8764d4f2b7dSkefren
877be55f323Skefren return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
8784d4f2b7dSkefren }
8794d4f2b7dSkefren
/*
 * React to a congestion event: restart the cubic epoch clock, record
 * the window size at which the loss occurred (W_max), and shrink the
 * congestion window multiplicatively by beta = CUBIC_BETAA/CUBIC_BETAB.
 */
static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		/*
		 * Losing ground (the window never got back to the previous
		 * W_max): release bandwidth to newer flows by remembering a
		 * further reduced W_max, scaled by (1 + beta) / 2.
		 */
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		/* Normal case: W_max is the window at the loss event. */
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	/* Never let W_max fall below a single segment. */
	tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}
9044d4f2b7dSkefren
9054d4f2b7dSkefren static int
tcp_cubic_fast_retransmit(struct tcpcb * tp,const struct tcphdr * th)9064d4f2b7dSkefren tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
9074d4f2b7dSkefren {
9084d4f2b7dSkefren
9094d4f2b7dSkefren if (SEQ_LT(th->th_ack, tp->snd_high)) {
9104d4f2b7dSkefren /* See newreno */
9114d4f2b7dSkefren tp->t_dupacks = 0;
9124d4f2b7dSkefren return 1;
9134d4f2b7dSkefren }
9144d4f2b7dSkefren
9154d4f2b7dSkefren /*
916be55f323Skefren * mark WMax
9174d4f2b7dSkefren */
9184d4f2b7dSkefren tcp_cubic_congestion_exp(tp);
919be55f323Skefren
920be55f323Skefren /* Do fast retransmit */
9214d4f2b7dSkefren return tcp_reno_do_fast_retransmit(tp, th);
9224d4f2b7dSkefren }
9234d4f2b7dSkefren
/*
 * CUBIC new-ACK processing.  In congestion avoidance, grow the window
 * toward the cubic target W(t); during slow start or fast recovery
 * (or when no usable RTT sample exists) fall back to NewReno.
 */
static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;	/* both in milliseconds */
	u_long w_tcp;

	/* Congestion avoidance and not in fast recovery and usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is 1/32 units of slow ticks
	     * converting it in ms would be equal to
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/*
		 * Compute W_tcp(t): the window an AIMD (Reno-like) flow
		 * would have reached by now, used for TCP friendliness.
		 * NOTE(review): the growth term ms_elapsed/rtt/3 is not
		 * scaled by segment size -- inherited behavior, confirm
		 * against the draft.
		 */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/*
			 * Not in TCP friendly mode: step toward the cubic
			 * target, spreading the per-RTT increase over the
			 * ~cwnd ACKs expected in one RTT.
			 */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* friendly TCP mode: track the AIMD estimate */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use New Reno */
		tcp_newreno_newack(tp, th);
	}
}
9614d4f2b7dSkefren
/*
 * CUBIC response to a retransmission timeout: treat it as a congestion
 * event, collapse the window to one segment, and leave fast recovery.
 */
static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment (RFC 5681 behavior). */
	tp->snd_cwnd = tp->t_segsz;
	/* Exit fast recovery and reset the ACK accounting. */
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	/* Tell the peer we reduced the window (ECN CWR). */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}
9784d4f2b7dSkefren
/*
 * Method table for CUBIC.  Partial-ACK processing during fast recovery
 * is reused from NewReno.
 */
const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};
986