/* xref: /freebsd-src/sys/netinet/tcp_ratelimit.h (revision 1f628be888b74f1219b3ea7ccea1e7a3d1db77a2) */
/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 *	Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */
#ifndef __tcp_ratelimit_h__
#define __tcp_ratelimit_h__

/* Only pointers to a send tag are used in this header. */
struct m_snd_tag;

/*
 * Divisors used when sizing a pacing burst from a bandwidth.  The
 * !RATELIMIT burst-size helper in this header replaces any divisor below
 * RL_MIN_DIVISOR with RL_DEFAULT_DIVISOR.
 */
#define RL_MIN_DIVISOR		50
#define RL_DEFAULT_DIVISOR	1000

/* Flags on an individual rate */
#define HDWRPACE_INITED		0x0001
#define HDWRPACE_TAGPRESENT	0x0002
#define HDWRPACE_IFPDEPARTED	0x0004

/*
 * One rate entry.  Entries are reached through the rs_rlt pointer of the
 * parent struct tcp_rate_set, and each entry points back at that parent
 * through ptbl.
 */
struct tcp_hwrate_limit_table {
	const struct tcp_rate_set *ptbl;	/* Pointer to parent table */
	struct m_snd_tag *tag;	/* Send tag if needed (chelsio) */
	long	 rate;		/* Rate we get in Bytes per second (Bps) */
	long	 using;		/* How many flows are using this hdwr rate. */
	long	 rs_num_enobufs;	/* NOTE(review): presumably an ENOBUFS count -- confirm */
	uint32_t time_between;	/* Time-Gap between packets at this rate */
	uint32_t flags;		/* NOTE(review): presumably HDWRPACE_* bits -- confirm */
};

/* Rateset flags */
#define RS_IS_DEFF      0x0001	/* It's a lagg, do a double lookup */
#define RS_IS_INTF      0x0002	/* It's a plain interface */
#define RS_NO_PRE       0x0004	/* The interface has set rates */
#define RS_INT_TBL      0x0010	/*
				 * The table is the internal version
				 * which has special setup requirements.
				 */
#define RS_IS_DEAD      0x0020	/* The RS is on the dead list */
#define RS_FUNERAL_SCHD 0x0040	/* Is an epoch call scheduled to bury this guy? */
#define RS_INTF_NO_SUP  0x0100	/* The interface does not support the ratelimiting */

6720abea66SRandall Stewart struct tcp_rate_set {
6820abea66SRandall Stewart 	struct sysctl_ctx_list sysctl_ctx;
6920abea66SRandall Stewart 	CK_LIST_ENTRY(tcp_rate_set) next;
7020abea66SRandall Stewart 	struct ifnet *rs_ifp;
7120abea66SRandall Stewart 	struct tcp_hwrate_limit_table *rs_rlt;
7220abea66SRandall Stewart 	uint64_t rs_flows_using;
7320abea66SRandall Stewart 	uint64_t rs_flow_limit;
7420abea66SRandall Stewart 	uint32_t rs_if_dunit;
7520abea66SRandall Stewart 	int rs_rate_cnt;
7620abea66SRandall Stewart 	int rs_min_seg;
7720abea66SRandall Stewart 	int rs_highest_valid;
7820abea66SRandall Stewart 	int rs_lowest_valid;
7920abea66SRandall Stewart 	int rs_disable;
8020abea66SRandall Stewart 	int rs_flags;
8120abea66SRandall Stewart 	struct epoch_context rs_epoch_ctx;
8220abea66SRandall Stewart };
8320abea66SRandall Stewart 
8420abea66SRandall Stewart CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
8520abea66SRandall Stewart 
/* Request flags */
#define RS_PACING_EXACT_MATCH	0x0001	/* Need an exact match for rate */
#define RS_PACING_GT		0x0002	/* Greater than requested */
#define RS_PACING_GEQ		0x0004	/* Greater than or equal to */
#define RS_PACING_LT		0x0008	/* Less than requested rate */
#define RS_PACING_SUB_OK	0x0010	/*
					 * If a rate can't be found get the
					 * next best rate (highest or lowest).
					 */
#ifdef _KERNEL
/* Fallback value, used only when the includer has not defined it already. */
#ifndef ETHERNET_SEGMENT_SIZE
#define ETHERNET_SEGMENT_SIZE 1514
#endif

/*
 * Forward declarations for types that the declarations below use only
 * through pointers in parameter lists.  Declaring them at file scope keeps
 * the tags from being scoped to a single prototype.
 */
struct tcpcb;
struct inpcb;

#ifdef RATELIMIT
#define DETAILED_RATELIMIT_SYSCTL 1	/*
					 * Undefine this if you don't want
					 * detailed rates to appear in
					 * net.inet.tcp.rl.
					 * With the definition each rate
					 * shows up in your sysctl tree;
					 * this can be big.
					 */
1081a714ff2SRandall Stewart uint64_t inline
1091a714ff2SRandall Stewart tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle)
1101a714ff2SRandall Stewart {
1111a714ff2SRandall Stewart 	return (rle->ptbl->rs_rlt[rle->ptbl->rs_highest_valid].rate);
1121a714ff2SRandall Stewart }
1131a714ff2SRandall Stewart 
/*
 * The functions below are only declared here; their definitions are not
 * part of this header.  When RATELIMIT is not defined, static inline stubs
 * with the same names and signatures are provided further down instead.
 */

/* NOTE(review): presumably the highest rate available on ifp -- confirm. */
uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp);

/*
 * NOTE(review): flags presumably takes the RS_PACING_* request flags, and
 * *error / *lower_rate are presumably outputs -- confirm against the
 * implementation.
 */
const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate);

/* Same argument list as tcp_set_pacing_rate() plus the current entry, crte. */
const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate);

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp);

uint32_t
tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err, int divisor);

void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte);

void
tcp_rl_release_ifnet(struct ifnet *ifp);

#else /* !RATELIMIT */
/*
 * Stub used when RATELIMIT is not defined: no rate is ever set.  Stores
 * EOPNOTSUPP through error when error is non-NULL and always returns NULL.
 * Every other argument, including lower_rate, is ignored and left untouched.
 */
static inline const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	if (error != NULL)
		*error = EOPNOTSUPP;
	return (NULL);
}

/*
 * Stub used when RATELIMIT is not defined: the rate is never changed.
 * Stores EOPNOTSUPP through error when error is non-NULL and always returns
 * NULL.  Every other argument, including crte and lower_rate, is ignored.
 */
static inline const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	if (error != NULL)
		*error = EOPNOTSUPP;
	return (NULL);
}

/*
 * Stub used when RATELIMIT is not defined: there is nothing to release, so
 * both arguments are ignored.
 */
static inline void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp)
{
}

/*
 * Stub used when RATELIMIT is not defined: always reports a highest rate of
 * 0.  rle is never dereferenced.
 */
static inline uint64_t
tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle)
{
	return (0);
}

/*
 * Stub used when RATELIMIT is not defined: always reports a highest rate of
 * 0.  Neither ifp nor inp is dereferenced.
 */
static inline uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
	return (0);
}

17826bdd35cSRandall Stewart static inline uint32_t
17926bdd35cSRandall Stewart tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
18026bdd35cSRandall Stewart    const struct tcp_hwrate_limit_table *te, int *err, int divisor)
18126bdd35cSRandall Stewart {
18226bdd35cSRandall Stewart 	/*
18326bdd35cSRandall Stewart 	 * We use the google formula to calculate the
18426bdd35cSRandall Stewart 	 * TSO size. I.E.
18526bdd35cSRandall Stewart 	 * bw < 24Meg
18626bdd35cSRandall Stewart 	 *   tso = 2mss
18726bdd35cSRandall Stewart 	 * else
18826bdd35cSRandall Stewart 	 *   tso = min(bw/(div=1000), 64k)
18926bdd35cSRandall Stewart 	 *
19026bdd35cSRandall Stewart 	 * Note for these calculations we ignore the
19126bdd35cSRandall Stewart 	 * packet overhead (enet hdr, ip hdr and tcp hdr).
19226bdd35cSRandall Stewart 	 * We only get the google formula when we have
19326bdd35cSRandall Stewart 	 * divisor = 1000, which is the default for now.
19426bdd35cSRandall Stewart 	 */
19526bdd35cSRandall Stewart 	uint64_t bytes;
19626bdd35cSRandall Stewart 	uint32_t new_tso, min_tso_segs;
19726bdd35cSRandall Stewart 
19826bdd35cSRandall Stewart 	/* It can't be zero */
19926bdd35cSRandall Stewart 	if ((divisor == 0) ||
20026bdd35cSRandall Stewart 	    (divisor < RL_MIN_DIVISOR)) {
20126bdd35cSRandall Stewart 		bytes = bw / RL_DEFAULT_DIVISOR;
20226bdd35cSRandall Stewart 	} else
20326bdd35cSRandall Stewart 		bytes = bw / divisor;
20426bdd35cSRandall Stewart 	/* We can't ever send more than 65k in a TSO */
20526bdd35cSRandall Stewart 	if (bytes > 0xffff) {
20626bdd35cSRandall Stewart 		bytes = 0xffff;
20726bdd35cSRandall Stewart 	}
20826bdd35cSRandall Stewart 	/* Round up */
20926bdd35cSRandall Stewart 	new_tso = (bytes + segsiz - 1) / segsiz;
21026bdd35cSRandall Stewart 	if (can_use_1mss)
21126bdd35cSRandall Stewart 		min_tso_segs = 1;
21226bdd35cSRandall Stewart 	else
21326bdd35cSRandall Stewart 		min_tso_segs = 2;
21426bdd35cSRandall Stewart 	if (new_tso < min_tso_segs)
21526bdd35cSRandall Stewart 		new_tso = min_tso_segs;
21626bdd35cSRandall Stewart 	new_tso *= segsiz;
21726bdd35cSRandall Stewart 	return (new_tso);
21826bdd35cSRandall Stewart }
21926bdd35cSRandall Stewart 
/* Do nothing if RATELIMIT is not defined; rte is ignored. */
static inline void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
}

/* Do nothing if RATELIMIT is not defined; ifp is ignored. */
static inline void
tcp_rl_release_ifnet(struct ifnet *ifp)
{
}
#endif /* RATELIMIT */

/*
 * Given a b/w and a segsiz, and optional hardware
 * rate limit, return the ideal size to burst
 * out at once. Note the parameter can_use_1mss
 * dictates if the transport will tolerate a 1mss
 * limit, if not it will bottom out at 2mss (think
 * delayed ack).
 *
 * This is a thin wrapper around tcp_get_pacing_burst_size_w_divisor() that
 * passes a divisor of 0.  The !RATELIMIT version of that function in this
 * header treats 0 as a request for RL_DEFAULT_DIVISOR.
 */
static inline uint32_t
tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err)
{

	return (tcp_get_pacing_burst_size_w_divisor(tp, bw, segsiz,
	    can_use_1mss, te, err, 0));
}

#endif /* _KERNEL */
#endif /* __tcp_ratelimit_h__ */