120abea66SRandall Stewart /*- 220abea66SRandall Stewart * 320abea66SRandall Stewart * SPDX-License-Identifier: BSD-3-Clause 420abea66SRandall Stewart * 528540ab1SWarner Losh * Copyright (c) 2018-2020 620abea66SRandall Stewart * Netflix Inc. 720abea66SRandall Stewart * 820abea66SRandall Stewart * Redistribution and use in source and binary forms, with or without 920abea66SRandall Stewart * modification, are permitted provided that the following conditions 1020abea66SRandall Stewart * are met: 1120abea66SRandall Stewart * 1. Redistributions of source code must retain the above copyright 1220abea66SRandall Stewart * notice, this list of conditions and the following disclaimer. 1320abea66SRandall Stewart * 2. Redistributions in binary form must reproduce the above copyright 1420abea66SRandall Stewart * notice, this list of conditions and the following disclaimer in the 1520abea66SRandall Stewart * documentation and/or other materials provided with the distribution. 1620abea66SRandall Stewart * 1720abea66SRandall Stewart * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 1820abea66SRandall Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1920abea66SRandall Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2020abea66SRandall Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2120abea66SRandall Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2220abea66SRandall Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2320abea66SRandall Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2420abea66SRandall Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2520abea66SRandall Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2620abea66SRandall Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2720abea66SRandall Stewart * SUCH DAMAGE. 2820abea66SRandall Stewart * 2920abea66SRandall Stewart */ 3020abea66SRandall Stewart /** 3120abea66SRandall Stewart * Author: Randall Stewart <rrs@netflix.com> 3220abea66SRandall Stewart */ 3320abea66SRandall Stewart #ifndef __tcp_ratelimit_h__ 3420abea66SRandall Stewart #define __tcp_ratelimit_h__ 3520abea66SRandall Stewart 3620abea66SRandall Stewart struct m_snd_tag; 3720abea66SRandall Stewart 3826bdd35cSRandall Stewart #define RL_MIN_DIVISOR 50 3926bdd35cSRandall Stewart #define RL_DEFAULT_DIVISOR 1000 4026bdd35cSRandall Stewart 4120abea66SRandall Stewart /* Flags on an individual rate */ 4220abea66SRandall Stewart #define HDWRPACE_INITED 0x0001 4320abea66SRandall Stewart #define HDWRPACE_TAGPRESENT 0x0002 4420abea66SRandall Stewart #define HDWRPACE_IFPDEPARTED 0x0004 4520abea66SRandall Stewart struct tcp_hwrate_limit_table { 4620abea66SRandall Stewart const struct tcp_rate_set *ptbl; /* Pointer to parent table */ 4720abea66SRandall Stewart struct m_snd_tag *tag; /* Send tag if needed (chelsio) */ 485d8fd932SRandall Stewart long rate; /* Rate we get in Bytes per second (Bps) */ 495d8fd932SRandall Stewart long using; /* How many flows are using this hdwr rate. */ 505d8fd932SRandall Stewart long rs_num_enobufs; 5120abea66SRandall Stewart uint32_t time_between; /* Time-Gap between packets at this rate */ 5220abea66SRandall Stewart uint32_t flags; 5320abea66SRandall Stewart }; 5420abea66SRandall Stewart 5520abea66SRandall Stewart /* Rateset flags */ 5620abea66SRandall Stewart #define RS_IS_DEFF 0x0001 /* Its a lagg, do a double lookup */ 5720abea66SRandall Stewart #define RS_IS_INTF 0x0002 /* Its a plain interface */ 5820abea66SRandall Stewart #define RS_NO_PRE 0x0004 /* The interfacd has set rates */ 5920abea66SRandall Stewart #define RS_INT_TBL 0x0010 /* 6020abea66SRandall Stewart * The table is the internal version 6120abea66SRandall Stewart * which has special setup requirements. 6220abea66SRandall Stewart */ 6320abea66SRandall Stewart #define RS_IS_DEAD 0x0020 /* The RS is dead list */ 6420abea66SRandall Stewart #define RS_FUNERAL_SCHD 0x0040 /* Is a epoch call scheduled to bury this guy?*/ 6520abea66SRandall Stewart #define RS_INTF_NO_SUP 0x0100 /* The interface does not support the ratelimiting */ 6620abea66SRandall Stewart 6720abea66SRandall Stewart struct tcp_rate_set { 6820abea66SRandall Stewart struct sysctl_ctx_list sysctl_ctx; 6920abea66SRandall Stewart CK_LIST_ENTRY(tcp_rate_set) next; 7020abea66SRandall Stewart struct ifnet *rs_ifp; 7120abea66SRandall Stewart struct tcp_hwrate_limit_table *rs_rlt; 7220abea66SRandall Stewart uint64_t rs_flows_using; 7320abea66SRandall Stewart uint64_t rs_flow_limit; 7420abea66SRandall Stewart uint32_t rs_if_dunit; 7520abea66SRandall Stewart int rs_rate_cnt; 7620abea66SRandall Stewart int rs_min_seg; 7720abea66SRandall Stewart int rs_highest_valid; 7820abea66SRandall Stewart int rs_lowest_valid; 7920abea66SRandall Stewart int rs_disable; 8020abea66SRandall Stewart int rs_flags; 8120abea66SRandall Stewart struct epoch_context rs_epoch_ctx; 8220abea66SRandall Stewart }; 8320abea66SRandall Stewart 8420abea66SRandall Stewart CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set); 8520abea66SRandall Stewart 8620abea66SRandall Stewart /* Request flags */ 8720abea66SRandall Stewart #define RS_PACING_EXACT_MATCH 0x0001 /* Need an exact match for rate */ 8820abea66SRandall Stewart #define RS_PACING_GT 0x0002 /* Greater than requested */ 8920abea66SRandall Stewart #define RS_PACING_GEQ 0x0004 /* Greater than or equal too */ 9020abea66SRandall Stewart #define RS_PACING_LT 0x0008 /* Less than requested rate */ 9120abea66SRandall Stewart #define RS_PACING_SUB_OK 0x0010 /* If a rate can't be found get the 9220abea66SRandall Stewart * next best rate (highest or lowest). */ 9320abea66SRandall Stewart #ifdef _KERNEL 94d7313dc6SRandall Stewart #ifndef ETHERNET_SEGMENT_SIZE 95d7313dc6SRandall Stewart #define ETHERNET_SEGMENT_SIZE 1514 96d7313dc6SRandall Stewart #endif 97*1f628be8SAndrew Gallatin struct tcpcb; 98*1f628be8SAndrew Gallatin 992f1cc984SRandall Stewart #ifdef RATELIMIT 10020abea66SRandall Stewart #define DETAILED_RATELIMIT_SYSCTL 1 /* 10120abea66SRandall Stewart * Undefine this if you don't want 10220abea66SRandall Stewart * detailed rates to appear in 10320abea66SRandall Stewart * net.inet.tcp.rl. 10420abea66SRandall Stewart * With the defintion each rate 10520abea66SRandall Stewart * shows up in your sysctl tree 10620abea66SRandall Stewart * this can be big. 10720abea66SRandall Stewart */ 1081a714ff2SRandall Stewart uint64_t inline 1091a714ff2SRandall Stewart tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle) 1101a714ff2SRandall Stewart { 1111a714ff2SRandall Stewart return (rle->ptbl->rs_rlt[rle->ptbl->rs_highest_valid].rate); 1121a714ff2SRandall Stewart } 1131a714ff2SRandall Stewart 1141a714ff2SRandall Stewart uint64_t 1151a714ff2SRandall Stewart tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp); 11620abea66SRandall Stewart 11720abea66SRandall Stewart const struct tcp_hwrate_limit_table * 11820abea66SRandall Stewart tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, 1191a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate); 12020abea66SRandall Stewart 12120abea66SRandall Stewart const struct tcp_hwrate_limit_table * 12220abea66SRandall Stewart tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, 12320abea66SRandall Stewart struct tcpcb *tp, struct ifnet *ifp, 1241a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate); 12520abea66SRandall Stewart void 12620abea66SRandall Stewart tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, 12720abea66SRandall Stewart struct tcpcb *tp); 12826bdd35cSRandall Stewart 12926bdd35cSRandall Stewart uint32_t 13026bdd35cSRandall Stewart tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss, 13126bdd35cSRandall Stewart const struct tcp_hwrate_limit_table *te, int *err, int divisor); 13226bdd35cSRandall Stewart 13326bdd35cSRandall Stewart void 13426bdd35cSRandall Stewart tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte); 13526bdd35cSRandall Stewart 136*1f628be8SAndrew Gallatin void 137*1f628be8SAndrew Gallatin tcp_rl_release_ifnet(struct ifnet *ifp); 138*1f628be8SAndrew Gallatin 13920abea66SRandall Stewart #else 14020abea66SRandall Stewart static inline const struct tcp_hwrate_limit_table * 14120abea66SRandall Stewart tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, 1421a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) 14320abea66SRandall Stewart { 14420abea66SRandall Stewart if (error) 14520abea66SRandall Stewart *error = EOPNOTSUPP; 14620abea66SRandall Stewart return (NULL); 14720abea66SRandall Stewart } 14820abea66SRandall Stewart 14920abea66SRandall Stewart static inline const struct tcp_hwrate_limit_table * 15020abea66SRandall Stewart tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, 15120abea66SRandall Stewart struct tcpcb *tp, struct ifnet *ifp, 1521a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) 15320abea66SRandall Stewart { 15420abea66SRandall Stewart if (error) 15520abea66SRandall Stewart *error = EOPNOTSUPP; 15620abea66SRandall Stewart return (NULL); 15720abea66SRandall Stewart } 15820abea66SRandall Stewart 15920abea66SRandall Stewart static inline void 16020abea66SRandall Stewart tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, 16120abea66SRandall Stewart struct tcpcb *tp) 16220abea66SRandall Stewart { 16320abea66SRandall Stewart return; 16420abea66SRandall Stewart } 1655a4333a5SRandall Stewart 1665a4333a5SRandall Stewart static uint64_t inline 1675a4333a5SRandall Stewart tcp_hw_highest_rate(const struct tcp_hwrate_limit_table *rle) 1685a4333a5SRandall Stewart { 1695a4333a5SRandall Stewart return (0); 1705a4333a5SRandall Stewart } 1715a4333a5SRandall Stewart 1725a4333a5SRandall Stewart static uint64_t inline 1735a4333a5SRandall Stewart tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp) 1745a4333a5SRandall Stewart { 1755a4333a5SRandall Stewart return (0); 1765a4333a5SRandall Stewart } 1775a4333a5SRandall Stewart 17826bdd35cSRandall Stewart static inline uint32_t 17926bdd35cSRandall Stewart tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss, 18026bdd35cSRandall Stewart const struct tcp_hwrate_limit_table *te, int *err, int divisor) 18126bdd35cSRandall Stewart { 18226bdd35cSRandall Stewart /* 18326bdd35cSRandall Stewart * We use the google formula to calculate the 18426bdd35cSRandall Stewart * TSO size. I.E. 18526bdd35cSRandall Stewart * bw < 24Meg 18626bdd35cSRandall Stewart * tso = 2mss 18726bdd35cSRandall Stewart * else 18826bdd35cSRandall Stewart * tso = min(bw/(div=1000), 64k) 18926bdd35cSRandall Stewart * 19026bdd35cSRandall Stewart * Note for these calculations we ignore the 19126bdd35cSRandall Stewart * packet overhead (enet hdr, ip hdr and tcp hdr). 19226bdd35cSRandall Stewart * We only get the google formula when we have 19326bdd35cSRandall Stewart * divisor = 1000, which is the default for now. 19426bdd35cSRandall Stewart */ 19526bdd35cSRandall Stewart uint64_t bytes; 19626bdd35cSRandall Stewart uint32_t new_tso, min_tso_segs; 19726bdd35cSRandall Stewart 19826bdd35cSRandall Stewart /* It can't be zero */ 19926bdd35cSRandall Stewart if ((divisor == 0) || 20026bdd35cSRandall Stewart (divisor < RL_MIN_DIVISOR)) { 20126bdd35cSRandall Stewart bytes = bw / RL_DEFAULT_DIVISOR; 20226bdd35cSRandall Stewart } else 20326bdd35cSRandall Stewart bytes = bw / divisor; 20426bdd35cSRandall Stewart /* We can't ever send more than 65k in a TSO */ 20526bdd35cSRandall Stewart if (bytes > 0xffff) { 20626bdd35cSRandall Stewart bytes = 0xffff; 20726bdd35cSRandall Stewart } 20826bdd35cSRandall Stewart /* Round up */ 20926bdd35cSRandall Stewart new_tso = (bytes + segsiz - 1) / segsiz; 21026bdd35cSRandall Stewart if (can_use_1mss) 21126bdd35cSRandall Stewart min_tso_segs = 1; 21226bdd35cSRandall Stewart else 21326bdd35cSRandall Stewart min_tso_segs = 2; 21426bdd35cSRandall Stewart if (new_tso < min_tso_segs) 21526bdd35cSRandall Stewart new_tso = min_tso_segs; 21626bdd35cSRandall Stewart new_tso *= segsiz; 21726bdd35cSRandall Stewart return (new_tso); 21826bdd35cSRandall Stewart } 21926bdd35cSRandall Stewart 22026bdd35cSRandall Stewart /* Do nothing if RATELIMIT is not defined */ 221876fddc8SMark Johnston static inline void 22226bdd35cSRandall Stewart tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte) 22326bdd35cSRandall Stewart { 22426bdd35cSRandall Stewart } 2255a4333a5SRandall Stewart 226*1f628be8SAndrew Gallatin static inline void 227*1f628be8SAndrew Gallatin tcp_rl_release_ifnet(struct ifnet *ifp) 228*1f628be8SAndrew Gallatin { 229*1f628be8SAndrew Gallatin } 23020abea66SRandall Stewart #endif 23126bdd35cSRandall Stewart 232d7313dc6SRandall Stewart /* 233d7313dc6SRandall Stewart * Given a b/w and a segsiz, and optional hardware 234d7313dc6SRandall Stewart * rate limit, return the ideal size to burst 235d7313dc6SRandall Stewart * out at once. Note the parameter can_use_1mss 236d7313dc6SRandall Stewart * dictates if the transport will tolerate a 1mss 237d7313dc6SRandall Stewart * limit, if not it will bottom out at 2mss (think 238d7313dc6SRandall Stewart * delayed ack). 239d7313dc6SRandall Stewart */ 24026bdd35cSRandall Stewart static inline uint32_t 2411a714ff2SRandall Stewart tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss, 24226bdd35cSRandall Stewart const struct tcp_hwrate_limit_table *te, int *err) 24326bdd35cSRandall Stewart { 244d7313dc6SRandall Stewart 24526bdd35cSRandall Stewart return (tcp_get_pacing_burst_size_w_divisor(tp, bw, segsiz, 24626bdd35cSRandall Stewart can_use_1mss, 24726bdd35cSRandall Stewart te, err, 0)); 24826bdd35cSRandall Stewart } 2491a714ff2SRandall Stewart 25020abea66SRandall Stewart #endif 25120abea66SRandall Stewart #endif 252