/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 * Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#define TCPSTATES		/* for logging */
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

/*
 * The following preferred table will seem weird to
 * the casual viewer. Why do we not have any rates below
 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
 * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently?
 *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting:
 *
 * Chelsio - Supporting 16 configurable rates.
 * Mlx  - c4 supporting 13 fixed rates.
 * Mlx  - c5 & c6 supporting 127 configurable rates.
 *
 * The c4 is why we have a common rate that is available
 * in all rate tables. This is a selected rate from the
 * c4 table and we assure it is available in all ratelimit
 * tables. This way the tcp_ratelimit code has an assured
 * rate it should always be able to get. This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system. The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons:
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 *    have the software timer go off after the previous
 *    send is completely out of the hardware.
 *
 * But when we do <b> we don't want to have the delay
 * between the last packet sent by the hardware be
 * excessively long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Let's assume that the tcp stack sees that 29,110,000 bps is
 * what the bw of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis i.e. full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and make sure the
 * appropriate overhead calculations are included in its choices.
 *
 * Now, continuing our example, we pick an MSS size based on the
 * delta between the two rates (416 - 390) divided into the rate
 * we really wish to send at rounded up. That results in a
 * send of 17 MSS's at once. The hardware then will
 * run out of data in a single 17 MSS send in 6,630 micro-seconds.
 *
 * On the other hand the software pacer will send more data
 * in 7,072 micro-seconds. This means that we will refill
 * the hardware 52 microseconds after it would have sent
 * next if it had not run out of data. This is a win since we are
 * only sending every 7ms or so and yet all the packets are spaced on
 * the wire at 94% of what they should be and only
 * the last packet is delayed extra to make up for the
 * difference.
 *
 * Note that the above formula has two important caveats.
 * If we are (b/w wise) above 100Mbps we double the result
 * of the MSS calculation. The second caveat is if we are 500Mbps
 * or more we just send the maximum MSS at once i.e. 45 MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
 *
 */
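/*
 * To make the example above concrete, the arithmetic works out as
 * follows (an illustrative sketch only, using the same full 1514 byte
 * frame that the time_between calculation in rt_setup_new_rs() is
 * based on):
 *
 *   1514 bytes * 8                       = 12,112 bits per frame
 *   gap at 31Mbps                        = 12,112 / 31    ~= 390 usecs
 *   gap needed for 29,110,000 bps        = 12,112 / 29.11 ~= 416 usecs
 *   hardware drains a 17 MSS burst in      17 * 390       =  6,630 usecs
 *   software pacer refills after           17 * 416       =  7,072 usecs
 *   hardware would have sent frame 18 at   6,630 + 390    =  7,020 usecs
 *   extra delay on the last frame          7,072 - 7,020  =     52 usecs
 */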
#define COMMON_RATE 180500
const uint64_t desired_rates[] = {
	122500,			/* 1Mbps - rate 1 */
	180500,			/* 1.44Mbps - rate 2  common rate */
	375000,			/* 3Mbps - rate 3 */
	625000,			/* 5Mbps - rate 4 */
	1250000,		/* 10Mbps - rate 5 */
	1875000,		/* 15Mbps - rate 6 */
	2500000,		/* 20Mbps - rate 7 */
	3125000,		/* 25Mbps - rate 8 */
	3750000,		/* 30Mbps - rate 9 */
	4375000,		/* 35Mbps - rate 10 */
	5000000,		/* 40Mbps - rate 11 */
	6250000,		/* 50Mbps - rate 12 */
	12500000,		/* 100Mbps - rate 13 */
	25000000,		/* 200Mbps - rate 14 */
	50000000,		/* 400Mbps - rate 15 */
	100000000,		/* 800Mbps - rate 16 */
	5625000,		/* 45Mbps - rate 17 */
	6875000,		/* 55Mbps - rate 19 */
	7500000,		/* 60Mbps - rate 20 */
	8125000,		/* 65Mbps - rate 21 */
	8750000,		/* 70Mbps - rate 22 */
	9375000,		/* 75Mbps - rate 23 */
	10000000,		/* 80Mbps - rate 24 */
	10625000,		/* 85Mbps - rate 25 */
	11250000,		/* 90Mbps - rate 26 */
	11875000,		/* 95Mbps - rate 27 */
	12500000,		/* 100Mbps - rate 28 */
	13750000,		/* 110Mbps - rate 29 */
	15000000,		/* 120Mbps - rate 30 */
	16250000,		/* 130Mbps - rate 31 */
	17500000,		/* 140Mbps - rate 32 */
	18750000,		/* 150Mbps - rate 33 */
	20000000,		/* 160Mbps - rate 34 */
	21250000,		/* 170Mbps - rate 35 */
	22500000,		/* 180Mbps - rate 36 */
	23750000,		/* 190Mbps - rate 37 */
	26250000,		/* 210Mbps - rate 38 */
	27500000,		/* 220Mbps - rate 39 */
	28750000,		/* 230Mbps - rate 40 */
	30000000,		/* 240Mbps - rate 41 */
	31250000,		/* 250Mbps - rate 42 */
	34375000,		/* 275Mbps - rate 43 */
	37500000,		/* 300Mbps - rate 44 */
	40625000,		/* 325Mbps - rate 45 */
	43750000,		/* 350Mbps - rate 46 */
	46875000,		/* 375Mbps - rate 47 */
	53125000,		/* 425Mbps - rate 48 */
	56250000,		/* 450Mbps - rate 49 */
	59375000,		/* 475Mbps - rate 50 */
	62500000,		/* 500Mbps - rate 51 */
	68750000,		/* 550Mbps - rate 52 */
	75000000,		/* 600Mbps - rate 53 */
	81250000,		/* 650Mbps - rate 54 */
	87500000,		/* 700Mbps - rate 55 */
	93750000,		/* 750Mbps - rate 56 */
	106250000,		/* 850Mbps - rate 57 */
	112500000,		/* 900Mbps - rate 58 */
	125000000,		/* 1Gbps - rate 59 */
	156250000,		/* 1.25Gbps - rate 60 */
	187500000,		/* 1.5Gbps - rate 61 */
	218750000,		/* 1.75Gbps - rate 62 */
	250000000,		/* 2Gbps - rate 63 */
	281250000,		/* 2.25Gbps - rate 64 */
	312500000,		/* 2.5Gbps - rate 65 */
	343750000,		/* 2.75Gbps - rate 66 */
	375000000,		/* 3Gbps - rate 67 */
	500000000,		/* 4Gbps - rate 68 */
	625000000,		/* 5Gbps - rate 69 */
	750000000,		/* 6Gbps - rate 70 */
	875000000,		/* 7Gbps - rate 71 */
	1000000000,		/* 8Gbps - rate 72 */
	1125000000,		/* 9Gbps - rate 73 */
	1250000000,		/* 10Gbps - rate 74 */
	1875000000,		/* 15Gbps - rate 75 */
	2500000000		/* 20Gbps - rate 76 */
};

#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number that are in order
				 * at the beginning of the table,
				 * over this a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The point in our table where
				 * we fill in the second ordered
				 * group (the first group occupies
				 * indexes 0 - (RS_NEXT_ORDER_GROUP - 1)).
				 */
#define ALL_HARDWARE_RATES 1004 /*
				 * 1Meg - 1Gig in 1Meg steps,
				 * plus 100k, 200k and 500k, and
				 * 10Gig
				 */

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000

static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

static uint32_t mss_divisor = RL_DEFAULT_DIVISOR;
static uint32_t even_num_segs = 1;
static uint32_t even_threshold = 4;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
    &rs_floor_mss, 0,
    "Number of MSS that will override the normal minimums (0 means don't enforce)");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
    &wait_time_floor, 2000,
    "If b/w increases, what is the wait floor we are willing to wait at the end?");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
    &num_of_waits_allowed, 1,
    "How many time blocks on the end should software pacing be willing to wait?");

SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
    &rs_hw_floor_mss, 16,
    "Number of MSS that are a minimum for hardware pacing?");

SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, divisor, CTLFLAG_RW,
    &mss_divisor, RL_DEFAULT_DIVISOR,
    "The value divided into bytes per second to help establish mss size");
SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, even, CTLFLAG_RW,
    &even_num_segs, 1,
    "Do we round mss size up to an even number of segments for delayed ack");
SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, eventhresh, CTLFLAG_RW,
    &even_threshold, 4,
    "At what number of mss do we start rounding up to an even number of mss?");
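/*
 * The tunables above all land under net.inet.tcp.rl; for example
 * (illustrative sysctl(8) usage, names taken from the declarations
 * above):
 *
 *   sysctl net.inet.tcp.rl.alive            # interfaces set up for ratelimiting
 *   sysctl net.inet.tcp.rl.hw_floor_mss=8   # lower the hardware pacing MSS floor
 *
 * Per-interface nodes (and, when built with DETAILED_RATELIMIT_SYSCTL,
 * per-rate nodes) are added under the same tree by
 * rl_add_syctl_entries() below.
 */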

static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
	/*
	 * Add sysctl entries for this interface.
	 */
	if (rs->rs_flags & RS_INTF_NO_SUP) {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RD,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	} else {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO, "disable", CTLFLAG_RW,
		    &rs->rs_disable, 0,
		    "Disable this interface from new hdwr limiting?");
	}
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "minseg", CTLFLAG_RW,
	    &rs->rs_min_seg, 0,
	    "What is the minimum we need to send on this interface?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flow_limit", CTLFLAG_RW,
	    &rs->rs_flow_limit, 0,
	    "What is the limit for number of flows (0=unlimited)?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "highest", CTLFLAG_RD,
	    &rs->rs_highest_valid, 0,
	    "Highest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "lowest", CTLFLAG_RD,
	    &rs->rs_lowest_valid, 0,
	    "Lowest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flags", CTLFLAG_RD,
	    &rs->rs_flags, 0,
	    "What flags are on the entry?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "numrates", CTLFLAG_RD,
	    &rs->rs_rate_cnt, 0,
	    "How many rates are there?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flows_using", CTLFLAG_RD,
	    &rs->rs_flows_using, 0,
	    "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
		/* Let's display the rates */
		int i;
		struct sysctl_oid *rl_rates;
		struct sysctl_oid *rl_rate_num;
		char rate_num[16];
		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_CHILDREN(rl_sysctl_root),
		    OID_AUTO,
		    "rate",
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "Ratelist");
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			sprintf(rate_num, "%d", i);
			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rates),
			    OID_AUTO,
			    rate_num,
			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
			    "Individual Rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "flags", CTLFLAG_RD,
			    &rs->rs_rlt[i].flags, 0,
			    "Flags on this rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "pacetime", CTLFLAG_RD,
			    &rs->rs_rlt[i].time_between, 0,
			    "Time hardware inserts between 1500 byte sends");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "rate", CTLFLAG_RD,
			    &rs->rs_rlt[i].rate,
			    "Rate in bytes per second");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "using", CTLFLAG_RD,
			    &rs->rs_rlt[i].using,
			    "Number of flows using");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
			    SYSCTL_CHILDREN(rl_rate_num),
			    OID_AUTO, "enobufs", CTLFLAG_RD,
			    &rs->rs_rlt[i].rs_num_enobufs,
			    "Number of enobufs logged on this rate");

		}
	}
#endif
}

static void
rs_destroy(epoch_context_t ctx)
{
	struct tcp_rate_set *rs;
	bool do_free_rs;

	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);

	mtx_lock(&rs_mtx);
	rs->rs_flags &= ~RS_FUNERAL_SCHD;
	/*
	 * In theory it's possible (but unlikely)
	 * that while the delete was occurring
	 * and we were applying the DEAD flag
	 * someone slipped in and found the
	 * interface in a lookup. While we
	 * decided rs_flows_using was 0 and were
	 * scheduling the epoch_call, the other
	 * thread incremented rs_flows_using. This
	 * is because users have a pointer and
	 * we only use the rs_flows_using in an
	 * atomic fashion, i.e. the other entities
	 * are not protected. To assure this did
	 * not occur, we check rs_flows_using here
	 * before deleting.
	 */
	do_free_rs = (rs->rs_flows_using == 0);
	rs_number_dead--;
	mtx_unlock(&rs_mtx);

	if (do_free_rs) {
		sysctl_ctx_free(&rs->sysctl_ctx);
		free(rs->rs_rlt, M_TCPPACE);
		free(rs, M_TCPPACE);
	}
}

static void
rs_defer_destroy(struct tcp_rate_set *rs)
{

	mtx_assert(&rs_mtx, MA_OWNED);

	/* Check if already pending. */
	if (rs->rs_flags & RS_FUNERAL_SCHD)
		return;

	rs_number_dead++;

	/* Set flag to only defer once. */
	rs->rs_flags |= RS_FUNERAL_SCHD;
	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
}

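/*
 * Putting the two functions above together, the teardown flow looks
 * like this (a summary sketch of the code above, not additional
 * behavior):
 *
 *   detach path (rs_mtx held)           epoch callback
 *   -------------------------           --------------
 *   rs_defer_destroy(rs)
 *     set RS_FUNERAL_SCHD
 *     rs_number_dead++
 *     NET_EPOCH_CALL(rs_destroy, ...)   ...grace period elapses...
 *                                       rs_destroy(ctx)
 *                                         clear RS_FUNERAL_SCHD
 *                                         rs_number_dead--
 *                                         re-check rs_flows_using == 0
 *                                         free the set only if unused
 */
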
#ifdef INET
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
	int error;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = cfg_rate,
		.rate_limit.flags = M_NOWAIT,
	};

	error = m_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
	/*
	 * The internal table is "special", it
	 * is two separate ordered tables that
	 * must be merged. We get here when the
	 * adapter specifies a number of rates that
	 * covers both ranges in the table in some
	 * form.
	 */
	int i, at_low, at_high;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
		rs->rs_rlt[i].flags = 0;
		rs->rs_rlt[i].time_between = 0;
		if ((low_disabled == 0) &&
		    (high_disabled ||
		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
			rs->rs_rlt[i].rate = rate_table_act[at_low];
			at_low++;
			if (at_low == RS_NEXT_ORDER_GROUP)
				low_disabled = 1;
		} else if (high_disabled == 0) {
			rs->rs_rlt[i].rate = rate_table_act[at_high];
			at_high++;
			if (at_high == MAX_HDWR_RATES)
				high_disabled = 1;
		}
	}
}

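/*
 * An illustrative trace of the merge above, assuming rs_rate_cnt is
 * large enough to cover both groups of desired_rates[]: the low group
 * runs 1, 1.44, 3, 5, 10, ..., 40, 50, 100, 200, 400, 800 Mbps and the
 * high group starts at 45Mbps, so the merged table comes out as
 * 1, 1.44, 3, 5, 10, ..., 40, 45, 50, 55, 60, ... Mbps, i.e. one
 * non-decreasing list (100Mbps exists in both groups and shows up twice).
 */
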
static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
	struct tcp_rate_set *rs;
	const uint64_t *rate_table_act;
	uint64_t lentim, res;
	size_t sz;
	uint32_t hash_type;
	int i;
	struct if_ratelimit_query_results rl;
	struct sysctl_oid *rl_sysctl_root;
	struct epoch_tracker et;
	/*
	 * We expect to enter with the
	 * mutex locked.
	 */

	if (ifp->if_ratelimit_query == NULL) {
		/*
		 * We can do nothing if we cannot
		 * get a query back from the driver.
		 */
		printf("Warning:No query functions for %s:%d-- failed\n",
		    ifp->if_dname, ifp->if_dunit);
		return (NULL);
	}
	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
	if (rs == NULL) {
		if (error)
			*error = ENOMEM;
		printf("Warning:No memory for malloc of tcp_rate_set\n");
		return (NULL);
	}
	memset(&rl, 0, sizeof(rl));
	rl.flags = RT_NOSUPPORT;
	ifp->if_ratelimit_query(ifp, &rl);
	if (rl.flags & RT_IS_UNUSABLE) {
		/*
		 * The interface does not really support
		 * the rate-limiting.
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_INTF_NO_SUP;
		rs->rs_disable = 1;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_IS_DEFF;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		rl_add_syctl_entries(rl_sysctl_root, rs);
		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return (rs);
	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
		/* Mellanox C4 likely */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_highest_valid = 0;
		rs->rs_flow_limit = rl.max_flows;
		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
		rs->rs_disable = 0;
		rate_table_act = rl.rate_table;
	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
		/* Chelsio, C5 and C6 of Mellanox? */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_disable = 0;
		rs->rs_flow_limit = rl.max_flows;
		rate_table_act = desired_rates;
		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
			/*
			 * Our desired table is not big
			 * enough, do what we can.
			 */
			rs->rs_rate_cnt = MAX_HDWR_RATES;
		}
		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
			rs->rs_flags = RS_IS_INTF;
		else
			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
	} else {
		free(rs, M_TCPPACE);
		return (NULL);
	}
	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
	if (rs->rs_rlt == NULL) {
		if (error)
			*error = ENOMEM;
bail:
		free(rs, M_TCPPACE);
		return (NULL);
	}
	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
		/*
		 * The interface supports all
		 * the rates we could possibly want.
		 */
		uint64_t rat;

		rs->rs_rlt[0].rate = 12500;	/* 100k */
		rs->rs_rlt[1].rate = 25000;	/* 200k */
		rs->rs_rlt[2].rate = 62500;	/* 500k */
		/* Note 125000 == 1Megabit
		 * populate 1Meg - 1000meg.
		 */
		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
			rs->rs_rlt[i].rate = rat;
			rat += 125000;
		}
		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
	} else if (rs->rs_flags & RS_INT_TBL) {
		/* We populate this in a special way */
		populate_canned_table(rs, rate_table_act);
	} else {
		/*
		 * Just copy in the rates from
		 * the table, it is in order.
		 */
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			rs->rs_rlt[i].rate = rate_table_act[i];
			rs->rs_rlt[i].time_between = 0;
			rs->rs_rlt[i].flags = 0;
		}
	}
	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
		/*
		 * We go backwards through the list so that if we can't get
		 * a rate and fail to init one, we have at least a chance of
		 * getting the highest one.
		 */
		rs->rs_rlt[i].ptbl = rs;
		rs->rs_rlt[i].tag = NULL;
		rs->rs_rlt[i].using = 0;
		rs->rs_rlt[i].rs_num_enobufs = 0;
		/*
		 * Calculate the time between.
		 */
		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / rs->rs_rlt[i].rate;
		if (res > 0)
			rs->rs_rlt[i].time_between = res;
		else
			rs->rs_rlt[i].time_between = 1;
		if (rs->rs_flags & RS_NO_PRE) {
			rs->rs_rlt[i].flags = HDWRPACE_INITED;
			rs->rs_lowest_valid = i;
		} else {
			int err;

			if ((rl.flags & RT_IS_SETUP_REQ) &&
			    (ifp->if_ratelimit_query)) {
				err = ifp->if_ratelimit_setup(ifp,
				    rs->rs_rlt[i].rate, i);
				if (err)
					goto handle_err;
			}
#ifdef RSS
			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
			hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
			err = rl_attach_txrtlmt(ifp,
			    hash_type,
			    (i + 1),
			    rs->rs_rlt[i].rate,
			    &rs->rs_rlt[i].tag);
			if (err) {
handle_err:
				if (i == (rs->rs_rate_cnt - 1)) {
					/*
					 * Huh - first rate and we can't get
					 * it?
					 */
					free(rs->rs_rlt, M_TCPPACE);
					if (error)
						*error = err;
					goto bail;
				} else {
					if (error)
						*error = err;
				}
				break;
			} else {
				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
				rs->rs_lowest_valid = i;
			}
		}
	}
	/* Did we get at least 1 rate? */
	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
	else {
		free(rs->rs_rlt, M_TCPPACE);
		goto bail;
	}
	rs_number_alive++;
	sysctl_ctx_init(&rs->sysctl_ctx);
	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
	    OID_AUTO,
	    rs->rs_ifp->if_xname,
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "");
	rl_add_syctl_entries(rl_sysctl_root, rs);
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	return (rs);
}

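/*
 * A short worked example of the internal-table index math used by
 * tcp_int_find_suitable_rate() below (illustrative only): when all
 * ALL_HARDWARE_RATES entries are present, rs_rlt[0..2] hold the
 * 100k/200k/500k rates, rs_rlt[3] holds 1Mbps (125,000 bytes/sec) and
 * rs_rlt[i] holds (i - 2) Mbps up through rs_rlt[1002] = 1Gbps, with
 * rs_rlt[1003] = 10Gbps. A request of 3,000,000 bytes/sec (24Mbps)
 * therefore computes ind_calc = 24 + 2 = 26 and lands exactly on
 * rs_rlt[26], while in the paths that round up (e.g. RS_PACING_LT) a
 * request that is not a whole megabit, say 29,110,000 bits/sec, maps
 * to the next slot up, 30Mbps at rs_rlt[32].
 */
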
/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			previous_rate = 0;
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
				previous_rate = rs->rs_rlt[i].rate;
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
			/*
			 * Larger than 1G (the majority of
			 * our table).
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
			goto done;
		}
		/*
		 * If we reach here it's in our table (between 1Meg - 1000Meg);
		 * round the bits per second up to the next whole megabit,
		 * from which we can calculate the index in the table.
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid)) {
			rte = &rs->rs_rlt[ind_calc];
			if (ind_calc >= 1)
				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
		}
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* It's 10G, wow */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Ok it must be an exact meg (it's between 1Meg and 1G) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* it's an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else if ((flags & RS_PACING_GEQ) &&
				    (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
		} else {
			/* It's in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					rte = &rs->rs_rlt[ind_calc];
					if (ind_calc >= 1)
						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
				}
				goto done;
			}
			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
				rte = &rs->rs_rlt[ind_calc];
				if (ind_calc >= 1)
					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
			}
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	if (lower_rate)
		*lower_rate = previous_rate;
	return (rte);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT     - must be greater than.
	 * RS_PACING_GEQ    - must be greater than or equal.
	 * RS_PACING_LT     - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *                    substitute is ok.
	 */
	int i, matched;
	struct tcp_hwrate_limit_table *rte = NULL;
	uint64_t previous_rate = 0;

	if ((rs->rs_flags & RS_INT_TBL) &&
	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
		/*
		 * Here we don't want to paw thru
		 * a big table, we have everything
		 * from 1Meg - 1000Meg in 1Meg increments.
		 * Use an alternate method to "lookup".
		 */
		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
	}
	if ((flags & RS_PACING_LT) ||
	    (flags & RS_PACING_EXACT_MATCH)) {
		/*
		 * For exact and less than we go forward through the table.
		 * This way when we find one larger we stop (exact was a
		 * toss up).
		 */
		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
			if ((flags & RS_PACING_EXACT_MATCH) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			} else if ((flags & RS_PACING_LT) &&
			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			}
			previous_rate = rs->rs_rlt[i].rate;
			if (bytes_per_sec > rs->rs_rlt[i].rate)
				break;
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_LT) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the lowest) */
			rte = &rs->rs_rlt[rs->rs_lowest_valid];
		}
	} else {
		/*
		 * Here we go backward through the table so that we can find
		 * the one greater in theory faster (but it's probably a
		 * wash).
		 */
		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
			if (rs->rs_rlt[i].rate > bytes_per_sec) {
				/* A possible candidate */
				rte = &rs->rs_rlt[i];
			}
			if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				/* An exact match and we want equal */
				matched = 1;
				rte = &rs->rs_rlt[i];
				break;
			} else if (rte) {
				/*
				 * Found one that is larger, but don't
				 * stop, there may be a closer match.
				 */
				matched = 1;
			}
			if (rs->rs_rlt[i].rate < bytes_per_sec) {
				/*
				 * We found a table entry that is smaller;
				 * stop, there will be none greater or equal.
103520abea66SRandall Stewart */
10361a714ff2SRandall Stewart if (lower_rate != NULL)
10371a714ff2SRandall Stewart *lower_rate = rs->rs_rlt[i].rate;
103820abea66SRandall Stewart break;
103920abea66SRandall Stewart }
104020abea66SRandall Stewart }
104120abea66SRandall Stewart if ((matched == 0) &&
104220abea66SRandall Stewart (flags & RS_PACING_SUB_OK)) {
104320abea66SRandall Stewart /* Kick in a substitute (the highest) */
104420abea66SRandall Stewart rte = &rs->rs_rlt[rs->rs_highest_valid];
104520abea66SRandall Stewart }
104620abea66SRandall Stewart }
104720abea66SRandall Stewart return (rte);
104820abea66SRandall Stewart }
104920abea66SRandall Stewart
105020abea66SRandall Stewart static struct ifnet *
105120abea66SRandall Stewart rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
105220abea66SRandall Stewart {
105320abea66SRandall Stewart struct ifnet *tifp;
10541a714ff2SRandall Stewart struct m_snd_tag *tag, *ntag;
105520abea66SRandall Stewart union if_snd_tag_alloc_params params = {
105620abea66SRandall Stewart .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
10571a714ff2SRandall Stewart .rate_limit.hdr.flowid = inp->inp_flowid,
105898085baeSAndrew Gallatin .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
105920abea66SRandall Stewart .rate_limit.max_rate = COMMON_RATE,
106020abea66SRandall Stewart .rate_limit.flags = M_NOWAIT,
106120abea66SRandall Stewart };
106220abea66SRandall Stewart int err;
106320abea66SRandall Stewart #ifdef RSS
106420abea66SRandall Stewart params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
106520abea66SRandall Stewart M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
106620abea66SRandall Stewart #else
106720abea66SRandall Stewart params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
106820abea66SRandall Stewart #endif
106936e0a362SJohn Baldwin err = m_snd_tag_alloc(ifp, &params, &tag);
107020abea66SRandall Stewart if (err) {
107120abea66SRandall Stewart /* Failed to setup a tag?
*/
107220abea66SRandall Stewart if (error)
107320abea66SRandall Stewart *error = err;
107420abea66SRandall Stewart return (NULL);
107520abea66SRandall Stewart }
10761a714ff2SRandall Stewart ntag = tag;
1077c782ea8bSJohn Baldwin while (ntag->sw->next_snd_tag != NULL) {
1078c782ea8bSJohn Baldwin ntag = ntag->sw->next_snd_tag(ntag);
10791a714ff2SRandall Stewart }
10801a714ff2SRandall Stewart tifp = ntag->ifp;
108198d7a8d9SJohn Baldwin m_snd_tag_rele(tag);
108220abea66SRandall Stewart return (tifp);
108320abea66SRandall Stewart }
108420abea66SRandall Stewart
10851a714ff2SRandall Stewart static void
10861a714ff2SRandall Stewart rl_increment_using(const struct tcp_hwrate_limit_table *rte)
10871a714ff2SRandall Stewart {
10885d8fd932SRandall Stewart struct tcp_hwrate_limit_table *decon_rte;
10895d8fd932SRandall Stewart
10905d8fd932SRandall Stewart decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
10915d8fd932SRandall Stewart atomic_add_long(&decon_rte->using, 1);
10921a714ff2SRandall Stewart }
10931a714ff2SRandall Stewart
10941a714ff2SRandall Stewart static void
10951a714ff2SRandall Stewart rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
10961a714ff2SRandall Stewart {
10975d8fd932SRandall Stewart struct tcp_hwrate_limit_table *decon_rte;
10985d8fd932SRandall Stewart
10995d8fd932SRandall Stewart decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
11005d8fd932SRandall Stewart atomic_subtract_long(&decon_rte->using, 1);
11011a714ff2SRandall Stewart }
11021a714ff2SRandall Stewart
11031a714ff2SRandall Stewart void
11041a714ff2SRandall Stewart tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
11051a714ff2SRandall Stewart {
11065d8fd932SRandall Stewart struct tcp_hwrate_limit_table *decon_rte;
11075d8fd932SRandall Stewart
11085d8fd932SRandall Stewart decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
11095d8fd932SRandall Stewart atomic_add_long(&decon_rte->rs_num_enobufs, 1);
11101a714ff2SRandall Stewart }
11111a714ff2SRandall Stewart
11121a714ff2SRandall Stewart /*
11131a714ff2SRandall Stewart * Do NOT take the __noinline out of the
11141a714ff2SRandall Stewart * find_rs_for_ifp() function. If you do, inlining
11151a714ff2SRandall Stewart * it into rt_setup_rate() will show you a
11161a714ff2SRandall Stewart * compiler bug. For some reason the compiler thinks
11171a714ff2SRandall Stewart * the list can never be empty. The consequence of
11181a714ff2SRandall Stewart * this will be a crash when we dereference NULL
11191a714ff2SRandall Stewart * if an ifp is removed just as a hw rate limit
11201a714ff2SRandall Stewart * is attempted. If you are working on the compiler
11211a714ff2SRandall Stewart * and want to "test" this go ahead and take the noinline
11221a714ff2SRandall Stewart * out; otherwise let sleeping dogs lie until such time
11231a714ff2SRandall Stewart * as we get a compiler fix 10/2/20 -- RRS
11241a714ff2SRandall Stewart */
11251a714ff2SRandall Stewart static __noinline struct tcp_rate_set *
11261a714ff2SRandall Stewart find_rs_for_ifp(struct ifnet *ifp)
11271a714ff2SRandall Stewart {
11281a714ff2SRandall Stewart struct tcp_rate_set *rs;
11291a714ff2SRandall Stewart
11301a714ff2SRandall Stewart CK_LIST_FOREACH(rs, &int_rs, next) {
11311a714ff2SRandall Stewart if ((rs->rs_ifp == ifp) &&
11321a714ff2SRandall Stewart (rs->rs_if_dunit == ifp->if_dunit)) {
11331a714ff2SRandall Stewart /* Ok we found it */
11341a714ff2SRandall Stewart return (rs);
11351a714ff2SRandall Stewart }
11361a714ff2SRandall Stewart }
11371a714ff2SRandall Stewart return (NULL);
11381a714ff2SRandall Stewart }
11391a714ff2SRandall Stewart
11401a714ff2SRandall Stewart
114120abea66SRandall Stewart static const struct tcp_hwrate_limit_table *
114220abea66SRandall Stewart rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
11431a714ff2SRandall Stewart uint32_t flags, int *error, uint64_t *lower_rate)
114420abea66SRandall Stewart {
114520abea66SRandall Stewart /* First let's find the interface if it exists */
114620abea66SRandall Stewart const struct tcp_hwrate_limit_table *rte;
11471a714ff2SRandall Stewart /*
11481a714ff2SRandall Stewart * So why is rs volatile? This is to defeat a
11491a714ff2SRandall Stewart * compiler bug wherein the compiler is convinced
11501a714ff2SRandall Stewart * that rs can never be NULL (which is not true). Because
11511a714ff2SRandall Stewart * of its conviction it nicely optimizes out the if ((rs == NULL
11521a714ff2SRandall Stewart * check below, which means if you get a NULL back you dereference it.
11531a714ff2SRandall Stewart */
11541a714ff2SRandall Stewart volatile struct tcp_rate_set *rs;
115520abea66SRandall Stewart struct epoch_tracker et;
11561a714ff2SRandall Stewart struct ifnet *oifp = ifp;
115720abea66SRandall Stewart int err;
115820abea66SRandall Stewart
1159348404bcSRandall Stewart NET_EPOCH_ENTER(et);
116020abea66SRandall Stewart use_real_interface:
11611a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp);
116220abea66SRandall Stewart if ((rs == NULL) ||
116320abea66SRandall Stewart (rs->rs_flags & RS_INTF_NO_SUP) ||
116420abea66SRandall Stewart (rs->rs_flags & RS_IS_DEAD)) {
116520abea66SRandall Stewart /*
116620abea66SRandall Stewart * This means we got a packet *before*
116720abea66SRandall Stewart * the IF-UP was processed below, <or>
116820abea66SRandall Stewart * while or after we already received an interface
116920abea66SRandall Stewart * departed event. In either case we really don't
117020abea66SRandall Stewart * want to do anything with pacing; in
117120abea66SRandall Stewart * the departing case the packet is not
117220abea66SRandall Stewart * going to go very far. The new case
117320abea66SRandall Stewart * might be arguable, but it's impossible
117420abea66SRandall Stewart * to tell from the departing case.
117520abea66SRandall Stewart */ 11761a714ff2SRandall Stewart if (error) 117720abea66SRandall Stewart *error = ENODEV; 1178348404bcSRandall Stewart NET_EPOCH_EXIT(et); 117920abea66SRandall Stewart return (NULL); 118020abea66SRandall Stewart } 118120abea66SRandall Stewart 118220abea66SRandall Stewart if ((rs == NULL) || (rs->rs_disable != 0)) { 11831a714ff2SRandall Stewart if (error) 118420abea66SRandall Stewart *error = ENOSPC; 1185348404bcSRandall Stewart NET_EPOCH_EXIT(et); 118620abea66SRandall Stewart return (NULL); 118720abea66SRandall Stewart } 118820abea66SRandall Stewart if (rs->rs_flags & RS_IS_DEFF) { 118920abea66SRandall Stewart /* We need to find the real interface */ 119020abea66SRandall Stewart struct ifnet *tifp; 119120abea66SRandall Stewart 119220abea66SRandall Stewart tifp = rt_find_real_interface(ifp, inp, error); 119320abea66SRandall Stewart if (tifp == NULL) { 119420abea66SRandall Stewart if (rs->rs_disable && error) 119520abea66SRandall Stewart *error = ENOTSUP; 1196348404bcSRandall Stewart NET_EPOCH_EXIT(et); 119720abea66SRandall Stewart return (NULL); 119820abea66SRandall Stewart } 11991a714ff2SRandall Stewart KASSERT((tifp != ifp), 12001a714ff2SRandall Stewart ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n", 12011a714ff2SRandall Stewart ifp, inp, tifp)); 12021a714ff2SRandall Stewart ifp = tifp; 120320abea66SRandall Stewart goto use_real_interface; 120420abea66SRandall Stewart } 120520abea66SRandall Stewart if (rs->rs_flow_limit && 120620abea66SRandall Stewart ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) { 120720abea66SRandall Stewart if (error) 120820abea66SRandall Stewart *error = ENOSPC; 1209348404bcSRandall Stewart NET_EPOCH_EXIT(et); 121020abea66SRandall Stewart return (NULL); 121120abea66SRandall Stewart } 12121a714ff2SRandall Stewart rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate); 121320abea66SRandall Stewart if (rte) { 12141a714ff2SRandall Stewart err = in_pcbattach_txrtlmt(inp, oifp, 121520abea66SRandall Stewart inp->inp_flowtype, 121620abea66SRandall Stewart inp->inp_flowid, 121720abea66SRandall Stewart rte->rate, 121820abea66SRandall Stewart &inp->inp_snd_tag); 121920abea66SRandall Stewart if (err) { 122020abea66SRandall Stewart /* Failed to attach */ 122120abea66SRandall Stewart if (error) 122220abea66SRandall Stewart *error = err; 122320abea66SRandall Stewart rte = NULL; 12241a714ff2SRandall Stewart } else { 12251a714ff2SRandall Stewart KASSERT((inp->inp_snd_tag != NULL) , 1226db46c0d0SHans Petter Selasky ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p", 1227db46c0d0SHans Petter Selasky inp, rte, (unsigned long long)rte->rate, rs)); 1228db46c0d0SHans Petter Selasky #ifdef INET 12291a714ff2SRandall Stewart counter_u64_add(rate_limit_new, 1); 1230db46c0d0SHans Petter Selasky #endif 123120abea66SRandall Stewart } 123220abea66SRandall Stewart } 123320abea66SRandall Stewart if (rte) { 123420abea66SRandall Stewart /* 123520abea66SRandall Stewart * We use an atomic here for accounting so we don't have to 123620abea66SRandall Stewart * use locks when freeing. 
123720abea66SRandall Stewart */ 123899c311c4SRandall Stewart atomic_add_64(&rs->rs_flows_using, 1); 123920abea66SRandall Stewart } 1240348404bcSRandall Stewart NET_EPOCH_EXIT(et); 124120abea66SRandall Stewart return (rte); 124220abea66SRandall Stewart } 124320abea66SRandall Stewart 124420abea66SRandall Stewart static void 124520abea66SRandall Stewart tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state) 124620abea66SRandall Stewart { 124720abea66SRandall Stewart int error; 124820abea66SRandall Stewart struct tcp_rate_set *rs; 12491a714ff2SRandall Stewart struct epoch_tracker et; 125020abea66SRandall Stewart 12519aed26b9SJohn Baldwin if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) || 125220abea66SRandall Stewart (link_state != LINK_STATE_UP)) { 125320abea66SRandall Stewart /* 125420abea66SRandall Stewart * We only care on an interface going up that is rate-limit 125520abea66SRandall Stewart * capable. 125620abea66SRandall Stewart */ 125720abea66SRandall Stewart return; 125820abea66SRandall Stewart } 12591a714ff2SRandall Stewart NET_EPOCH_ENTER(et); 126020abea66SRandall Stewart mtx_lock(&rs_mtx); 12611a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp); 12621a714ff2SRandall Stewart if (rs) { 126320abea66SRandall Stewart /* We already have initialized this guy */ 126420abea66SRandall Stewart mtx_unlock(&rs_mtx); 12651a714ff2SRandall Stewart NET_EPOCH_EXIT(et); 126620abea66SRandall Stewart return; 126720abea66SRandall Stewart } 126820abea66SRandall Stewart mtx_unlock(&rs_mtx); 12691a714ff2SRandall Stewart NET_EPOCH_EXIT(et); 127015ddc5e4SMichael Tuexen rt_setup_new_rs(ifp, &error); 127120abea66SRandall Stewart } 127220abea66SRandall Stewart 127320abea66SRandall Stewart static void 127420abea66SRandall Stewart tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) 127520abea66SRandall Stewart { 12761a714ff2SRandall Stewart struct tcp_rate_set *rs; 12771a714ff2SRandall Stewart struct epoch_tracker et; 127820abea66SRandall Stewart int i; 127920abea66SRandall Stewart 12801a714ff2SRandall Stewart NET_EPOCH_ENTER(et); 128120abea66SRandall Stewart mtx_lock(&rs_mtx); 12821a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp); 12831a714ff2SRandall Stewart if (rs) { 128420abea66SRandall Stewart CK_LIST_REMOVE(rs, next); 128520abea66SRandall Stewart rs_number_alive--; 128620abea66SRandall Stewart rs->rs_flags |= RS_IS_DEAD; 128720abea66SRandall Stewart for (i = 0; i < rs->rs_rate_cnt; i++) { 128820abea66SRandall Stewart if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 128998d7a8d9SJohn Baldwin in_pcbdetach_tag(rs->rs_rlt[i].tag); 129020abea66SRandall Stewart rs->rs_rlt[i].tag = NULL; 129120abea66SRandall Stewart } 129220abea66SRandall Stewart rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 129320abea66SRandall Stewart } 1294eabddb25SHans Petter Selasky if (rs->rs_flows_using == 0) 1295eabddb25SHans Petter Selasky rs_defer_destroy(rs); 129620abea66SRandall Stewart } 129720abea66SRandall Stewart mtx_unlock(&rs_mtx); 12981a714ff2SRandall Stewart NET_EPOCH_EXIT(et); 129920abea66SRandall Stewart } 130020abea66SRandall Stewart 1301*1f628be8SAndrew Gallatin void 1302*1f628be8SAndrew Gallatin tcp_rl_release_ifnet(struct ifnet *ifp) 1303*1f628be8SAndrew Gallatin { 1304*1f628be8SAndrew Gallatin tcp_rl_ifnet_departure(NULL, ifp); 1305*1f628be8SAndrew Gallatin } 1306*1f628be8SAndrew Gallatin 130720abea66SRandall Stewart static void 130820abea66SRandall Stewart tcp_rl_shutdown(void *arg __unused, int howto __unused) 130920abea66SRandall Stewart { 131020abea66SRandall Stewart struct tcp_rate_set *rs, 
*nrs; 13111a714ff2SRandall Stewart struct epoch_tracker et; 131220abea66SRandall Stewart int i; 131320abea66SRandall Stewart 13141a714ff2SRandall Stewart NET_EPOCH_ENTER(et); 131520abea66SRandall Stewart mtx_lock(&rs_mtx); 131620abea66SRandall Stewart CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { 131720abea66SRandall Stewart CK_LIST_REMOVE(rs, next); 131820abea66SRandall Stewart rs_number_alive--; 131920abea66SRandall Stewart rs->rs_flags |= RS_IS_DEAD; 132020abea66SRandall Stewart for (i = 0; i < rs->rs_rate_cnt; i++) { 132120abea66SRandall Stewart if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { 132298d7a8d9SJohn Baldwin in_pcbdetach_tag(rs->rs_rlt[i].tag); 132320abea66SRandall Stewart rs->rs_rlt[i].tag = NULL; 132420abea66SRandall Stewart } 132520abea66SRandall Stewart rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; 132620abea66SRandall Stewart } 1327eabddb25SHans Petter Selasky if (rs->rs_flows_using == 0) 1328eabddb25SHans Petter Selasky rs_defer_destroy(rs); 132920abea66SRandall Stewart } 133020abea66SRandall Stewart mtx_unlock(&rs_mtx); 13311a714ff2SRandall Stewart NET_EPOCH_EXIT(et); 133220abea66SRandall Stewart } 133320abea66SRandall Stewart 133420abea66SRandall Stewart const struct tcp_hwrate_limit_table * 133520abea66SRandall Stewart tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, 13361a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) 133720abea66SRandall Stewart { 13389eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 133920abea66SRandall Stewart const struct tcp_hwrate_limit_table *rte; 1340521eac97SJohn Baldwin #ifdef KERN_TLS 1341521eac97SJohn Baldwin struct ktls_session *tls; 1342521eac97SJohn Baldwin #endif 134320abea66SRandall Stewart 13449eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(inp); 1345ce398115SJohn Baldwin 13469eb0e832SGleb Smirnoff if (inp->inp_snd_tag == NULL) { 134720abea66SRandall Stewart /* 134820abea66SRandall Stewart * We are setting up a rate for the first time. 134920abea66SRandall Stewart */ 13509aed26b9SJohn Baldwin if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) { 135120abea66SRandall Stewart /* Not supported by the egress */ 135220abea66SRandall Stewart if (error) 135320abea66SRandall Stewart *error = ENODEV; 135420abea66SRandall Stewart return (NULL); 135520abea66SRandall Stewart } 135620abea66SRandall Stewart #ifdef KERN_TLS 1357521eac97SJohn Baldwin tls = NULL; 1358c0e4090eSAndrew Gallatin if (tp->t_nic_ktls_xmit != 0) { 13599eb0e832SGleb Smirnoff tls = tptosocket(tp)->so_snd.sb_tls_info; 1360521eac97SJohn Baldwin 1361521eac97SJohn Baldwin if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 || 1362521eac97SJohn Baldwin tls->mode != TCP_TLS_MODE_IFNET) { 136320abea66SRandall Stewart if (error) 1364521eac97SJohn Baldwin *error = ENODEV; 136520abea66SRandall Stewart return (NULL); 136620abea66SRandall Stewart } 1367521eac97SJohn Baldwin } 136820abea66SRandall Stewart #endif 13699eb0e832SGleb Smirnoff rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate); 13701a714ff2SRandall Stewart if (rte) 13711a714ff2SRandall Stewart rl_increment_using(rte); 1372521eac97SJohn Baldwin #ifdef KERN_TLS 1373521eac97SJohn Baldwin if (rte != NULL && tls != NULL && tls->snd_tag != NULL) { 1374521eac97SJohn Baldwin /* 1375521eac97SJohn Baldwin * Fake a route change error to reset the TLS 1376521eac97SJohn Baldwin * send tag. This will convert the existing 1377521eac97SJohn Baldwin * tag to a TLS ratelimit tag. 
1378521eac97SJohn Baldwin */ 1379c782ea8bSJohn Baldwin MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS); 13809eb0e832SGleb Smirnoff ktls_output_eagain(inp, tls); 1381521eac97SJohn Baldwin } 1382521eac97SJohn Baldwin #endif 138320abea66SRandall Stewart } else { 138420abea66SRandall Stewart /* 138520abea66SRandall Stewart * We are modifying a rate, wrong interface? 138620abea66SRandall Stewart */ 138720abea66SRandall Stewart if (error) 138820abea66SRandall Stewart *error = EINVAL; 138920abea66SRandall Stewart rte = NULL; 139020abea66SRandall Stewart } 13911a714ff2SRandall Stewart if (rte != NULL) { 1392ce398115SJohn Baldwin tp->t_pacing_rate = rte->rate; 1393d7313dc6SRandall Stewart *error = 0; 13941a714ff2SRandall Stewart } 139520abea66SRandall Stewart return (rte); 139620abea66SRandall Stewart } 139720abea66SRandall Stewart 139820abea66SRandall Stewart const struct tcp_hwrate_limit_table * 139920abea66SRandall Stewart tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, 140020abea66SRandall Stewart struct tcpcb *tp, struct ifnet *ifp, 14011a714ff2SRandall Stewart uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) 140220abea66SRandall Stewart { 14039eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 140420abea66SRandall Stewart const struct tcp_hwrate_limit_table *nrte; 140520abea66SRandall Stewart const struct tcp_rate_set *rs; 1406521eac97SJohn Baldwin #ifdef KERN_TLS 1407521eac97SJohn Baldwin struct ktls_session *tls = NULL; 1408521eac97SJohn Baldwin #endif 140920abea66SRandall Stewart int err; 141020abea66SRandall Stewart 14119eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(inp); 1412ce398115SJohn Baldwin 1413521eac97SJohn Baldwin if (crte == NULL) { 1414521eac97SJohn Baldwin /* Wrong interface */ 1415521eac97SJohn Baldwin if (error) 1416521eac97SJohn Baldwin *error = EINVAL; 1417521eac97SJohn Baldwin return (NULL); 1418521eac97SJohn Baldwin } 1419521eac97SJohn Baldwin 1420521eac97SJohn Baldwin #ifdef KERN_TLS 1421c0e4090eSAndrew Gallatin if (tp->t_nic_ktls_xmit) { 14229eb0e832SGleb Smirnoff tls = tptosocket(tp)->so_snd.sb_tls_info; 1423d782385eSJohn Baldwin if (tls->mode != TCP_TLS_MODE_IFNET) 1424d782385eSJohn Baldwin tls = NULL; 1425d782385eSJohn Baldwin else if (tls->snd_tag != NULL && 1426c782ea8bSJohn Baldwin tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) { 1427d782385eSJohn Baldwin if (!tls->reset_pending) { 1428521eac97SJohn Baldwin /* 1429d782385eSJohn Baldwin * NIC probably doesn't support 1430d782385eSJohn Baldwin * ratelimit TLS tags if it didn't 1431d782385eSJohn Baldwin * allocate one when an existing rate 1432d782385eSJohn Baldwin * was present, so ignore. 1433521eac97SJohn Baldwin */ 14348a7404b2SAndrew Gallatin tcp_rel_pacing_rate(crte, tp); 1435521eac97SJohn Baldwin if (error) 1436521eac97SJohn Baldwin *error = EOPNOTSUPP; 1437521eac97SJohn Baldwin return (NULL); 1438521eac97SJohn Baldwin } 1439d782385eSJohn Baldwin 1440d782385eSJohn Baldwin /* 1441d782385eSJohn Baldwin * The send tag is being converted, so set the 1442d782385eSJohn Baldwin * rate limit on the inpcb tag. There is a 1443d782385eSJohn Baldwin * race that the new NIC send tag might use 1444d782385eSJohn Baldwin * the current rate instead of this one. 
1445d782385eSJohn Baldwin */ 1446d782385eSJohn Baldwin tls = NULL; 1447d782385eSJohn Baldwin } 1448521eac97SJohn Baldwin } 1449521eac97SJohn Baldwin #endif 14509eb0e832SGleb Smirnoff if (inp->inp_snd_tag == NULL) { 145120abea66SRandall Stewart /* Wrong interface */ 14528a7404b2SAndrew Gallatin tcp_rel_pacing_rate(crte, tp); 145320abea66SRandall Stewart if (error) 145420abea66SRandall Stewart *error = EINVAL; 145520abea66SRandall Stewart return (NULL); 145620abea66SRandall Stewart } 145720abea66SRandall Stewart rs = crte->ptbl; 145820abea66SRandall Stewart if ((rs->rs_flags & RS_IS_DEAD) || 145920abea66SRandall Stewart (crte->flags & HDWRPACE_IFPDEPARTED)) { 146020abea66SRandall Stewart /* Release the rate, and try anew */ 14611a714ff2SRandall Stewart 146220abea66SRandall Stewart tcp_rel_pacing_rate(crte, tp); 146320abea66SRandall Stewart nrte = tcp_set_pacing_rate(tp, ifp, 14641a714ff2SRandall Stewart bytes_per_sec, flags, error, lower_rate); 146520abea66SRandall Stewart return (nrte); 146620abea66SRandall Stewart } 14671a714ff2SRandall Stewart nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate); 146820abea66SRandall Stewart if (nrte == crte) { 146920abea66SRandall Stewart /* No change */ 147020abea66SRandall Stewart if (error) 147120abea66SRandall Stewart *error = 0; 147220abea66SRandall Stewart return (crte); 147320abea66SRandall Stewart } 147420abea66SRandall Stewart if (nrte == NULL) { 147520abea66SRandall Stewart /* Release the old rate */ 14761a714ff2SRandall Stewart if (error) 14771a714ff2SRandall Stewart *error = ENOENT; 147820abea66SRandall Stewart tcp_rel_pacing_rate(crte, tp); 147920abea66SRandall Stewart return (NULL); 148020abea66SRandall Stewart } 14811a714ff2SRandall Stewart rl_decrement_using(crte); 14821a714ff2SRandall Stewart rl_increment_using(nrte); 148320abea66SRandall Stewart /* Change rates to our new entry */ 1484521eac97SJohn Baldwin #ifdef KERN_TLS 1485521eac97SJohn Baldwin if (tls != NULL) 1486521eac97SJohn Baldwin err = ktls_modify_txrtlmt(tls, nrte->rate); 1487521eac97SJohn Baldwin else 1488521eac97SJohn Baldwin #endif 14899eb0e832SGleb Smirnoff err = in_pcbmodify_txrtlmt(inp, nrte->rate); 149020abea66SRandall Stewart if (err) { 14918a7404b2SAndrew Gallatin struct tcp_rate_set *lrs; 14928a7404b2SAndrew Gallatin uint64_t pre; 14938a7404b2SAndrew Gallatin 14941a714ff2SRandall Stewart rl_decrement_using(nrte); 14958a7404b2SAndrew Gallatin lrs = __DECONST(struct tcp_rate_set *, rs); 14968a7404b2SAndrew Gallatin pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1); 14971a714ff2SRandall Stewart /* Do we still have a snd-tag attached? */ 14989eb0e832SGleb Smirnoff if (inp->inp_snd_tag) 14999eb0e832SGleb Smirnoff in_pcbdetach_txrtlmt(inp); 15008a7404b2SAndrew Gallatin 15018a7404b2SAndrew Gallatin if (pre == 1) { 15028a7404b2SAndrew Gallatin struct epoch_tracker et; 15038a7404b2SAndrew Gallatin 15048a7404b2SAndrew Gallatin NET_EPOCH_ENTER(et); 15058a7404b2SAndrew Gallatin mtx_lock(&rs_mtx); 15068a7404b2SAndrew Gallatin /* 15078a7404b2SAndrew Gallatin * Is it dead? 
15088a7404b2SAndrew Gallatin */ 15098a7404b2SAndrew Gallatin if (lrs->rs_flags & RS_IS_DEAD) 15108a7404b2SAndrew Gallatin rs_defer_destroy(lrs); 15118a7404b2SAndrew Gallatin mtx_unlock(&rs_mtx); 15128a7404b2SAndrew Gallatin NET_EPOCH_EXIT(et); 15138a7404b2SAndrew Gallatin } 151420abea66SRandall Stewart if (error) 151520abea66SRandall Stewart *error = err; 151620abea66SRandall Stewart return (NULL); 1517db46c0d0SHans Petter Selasky } else { 1518db46c0d0SHans Petter Selasky #ifdef INET 15191a714ff2SRandall Stewart counter_u64_add(rate_limit_chg, 1); 1520db46c0d0SHans Petter Selasky #endif 1521db46c0d0SHans Petter Selasky } 152220abea66SRandall Stewart if (error) 152320abea66SRandall Stewart *error = 0; 1524ce398115SJohn Baldwin tp->t_pacing_rate = nrte->rate; 152520abea66SRandall Stewart return (nrte); 152620abea66SRandall Stewart } 152720abea66SRandall Stewart 152820abea66SRandall Stewart void 152920abea66SRandall Stewart tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) 153020abea66SRandall Stewart { 15319eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 153220abea66SRandall Stewart const struct tcp_rate_set *crs; 153320abea66SRandall Stewart struct tcp_rate_set *rs; 153420abea66SRandall Stewart uint64_t pre; 153520abea66SRandall Stewart 15369eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(inp); 1537ce398115SJohn Baldwin 1538ce398115SJohn Baldwin tp->t_pacing_rate = -1; 153920abea66SRandall Stewart crs = crte->ptbl; 154020abea66SRandall Stewart /* 154120abea66SRandall Stewart * Now we must break the const 154220abea66SRandall Stewart * in order to release our refcount. 154320abea66SRandall Stewart */ 154420abea66SRandall Stewart rs = __DECONST(struct tcp_rate_set *, crs); 15451a714ff2SRandall Stewart rl_decrement_using(crte); 1546a1589eb8SRandall Stewart pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); 154720abea66SRandall Stewart if (pre == 1) { 15481a714ff2SRandall Stewart struct epoch_tracker et; 15491a714ff2SRandall Stewart 15501a714ff2SRandall Stewart NET_EPOCH_ENTER(et); 155120abea66SRandall Stewart mtx_lock(&rs_mtx); 155220abea66SRandall Stewart /* 155320abea66SRandall Stewart * Is it dead? 155420abea66SRandall Stewart */ 1555eabddb25SHans Petter Selasky if (rs->rs_flags & RS_IS_DEAD) 1556eabddb25SHans Petter Selasky rs_defer_destroy(rs); 155720abea66SRandall Stewart mtx_unlock(&rs_mtx); 15581a714ff2SRandall Stewart NET_EPOCH_EXIT(et); 155920abea66SRandall Stewart } 1560521eac97SJohn Baldwin 1561521eac97SJohn Baldwin /* 1562521eac97SJohn Baldwin * XXX: If this connection is using ifnet TLS, should we 1563521eac97SJohn Baldwin * switch it to using an unlimited rate, or perhaps use 1564521eac97SJohn Baldwin * ktls_output_eagain() to reset the send tag to a plain 1565521eac97SJohn Baldwin * TLS tag? 
1566521eac97SJohn Baldwin */ 15679eb0e832SGleb Smirnoff in_pcbdetach_txrtlmt(inp); 156820abea66SRandall Stewart } 156920abea66SRandall Stewart 1570d7313dc6SRandall Stewart #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ 1571d7313dc6SRandall Stewart #define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */ 1572d7313dc6SRandall Stewart #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ 1573d7313dc6SRandall Stewart #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ 1574d7313dc6SRandall Stewart 15751a714ff2SRandall Stewart static void 15761a714ff2SRandall Stewart tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso, 15771a714ff2SRandall Stewart uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between, 15781a714ff2SRandall Stewart uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod) 15791a714ff2SRandall Stewart { 158069c7c811SRandall Stewart if (tcp_bblogging_on(tp)) { 15811a714ff2SRandall Stewart union tcp_log_stackspecific log; 15821a714ff2SRandall Stewart struct timeval tv; 15831a714ff2SRandall Stewart 15841a714ff2SRandall Stewart memset(&log, 0, sizeof(log)); 15851a714ff2SRandall Stewart log.u_bbr.flex1 = segsiz; 15861a714ff2SRandall Stewart log.u_bbr.flex2 = new_tso; 15871a714ff2SRandall Stewart log.u_bbr.flex3 = time_between; 15881a714ff2SRandall Stewart log.u_bbr.flex4 = calc_time_between; 15891a714ff2SRandall Stewart log.u_bbr.flex5 = segs; 15901a714ff2SRandall Stewart log.u_bbr.flex6 = res_div; 15911a714ff2SRandall Stewart log.u_bbr.flex7 = mult; 15921a714ff2SRandall Stewart log.u_bbr.flex8 = mod; 15931a714ff2SRandall Stewart log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15941a714ff2SRandall Stewart log.u_bbr.cur_del_rate = bw; 15951a714ff2SRandall Stewart log.u_bbr.delRate = hw_rate; 15961a714ff2SRandall Stewart TCP_LOG_EVENTP(tp, NULL, 15979eb0e832SGleb Smirnoff &tptosocket(tp)->so_rcv, 15989eb0e832SGleb Smirnoff &tptosocket(tp)->so_snd, 15991a714ff2SRandall Stewart TCP_HDWR_PACE_SIZE, 0, 16001a714ff2SRandall Stewart 0, &log, false, &tv); 16011a714ff2SRandall Stewart } 16021a714ff2SRandall Stewart } 16031a714ff2SRandall Stewart 1604d7313dc6SRandall Stewart uint32_t 160526bdd35cSRandall Stewart tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss, 160626bdd35cSRandall Stewart const struct tcp_hwrate_limit_table *te, int *err, int divisor) 1607d7313dc6SRandall Stewart { 1608d7313dc6SRandall Stewart /* 1609d7313dc6SRandall Stewart * We use the google formula to calculate the 1610d7313dc6SRandall Stewart * TSO size. I.E. 1611d7313dc6SRandall Stewart * bw < 24Meg 1612d7313dc6SRandall Stewart * tso = 2mss 1613d7313dc6SRandall Stewart * else 161426bdd35cSRandall Stewart * tso = min(bw/(div=1000), 64k) 1615d7313dc6SRandall Stewart * 1616d7313dc6SRandall Stewart * Note for these calculations we ignore the 1617d7313dc6SRandall Stewart * packet overhead (enet hdr, ip hdr and tcp hdr). 161826bdd35cSRandall Stewart * We only get the google formula when we have 161926bdd35cSRandall Stewart * divisor = 1000, which is the default for now. 
1620d7313dc6SRandall Stewart */
1621d7313dc6SRandall Stewart uint64_t lentim, res, bytes;
1622d7313dc6SRandall Stewart uint32_t new_tso, min_tso_segs;
1623d7313dc6SRandall Stewart
162426bdd35cSRandall Stewart /* It can't be zero */
162526bdd35cSRandall Stewart if ((divisor == 0) ||
162626bdd35cSRandall Stewart (divisor < RL_MIN_DIVISOR)) {
162726bdd35cSRandall Stewart if (mss_divisor)
162826bdd35cSRandall Stewart bytes = bw / mss_divisor;
162926bdd35cSRandall Stewart else
1630d7313dc6SRandall Stewart bytes = bw / 1000;
163126bdd35cSRandall Stewart } else
163226bdd35cSRandall Stewart bytes = bw / divisor;
163326bdd35cSRandall Stewart /* We can't ever send more than 65k in a TSO */
163426bdd35cSRandall Stewart if (bytes > 0xffff) {
163526bdd35cSRandall Stewart bytes = 0xffff;
163626bdd35cSRandall Stewart }
1637d7313dc6SRandall Stewart /* Round up */
1638d7313dc6SRandall Stewart new_tso = (bytes + segsiz - 1) / segsiz;
163926bdd35cSRandall Stewart /* Are we enforcing even boundaries? */
164026bdd35cSRandall Stewart if (even_num_segs && (new_tso & 1) && (new_tso > even_threshold))
164126bdd35cSRandall Stewart new_tso++;
164226bdd35cSRandall Stewart if (can_use_1mss)
1643d7313dc6SRandall Stewart min_tso_segs = 1;
1644d7313dc6SRandall Stewart else
1645d7313dc6SRandall Stewart min_tso_segs = 2;
16461a714ff2SRandall Stewart if (rs_floor_mss && (new_tso < rs_floor_mss))
16471a714ff2SRandall Stewart new_tso = rs_floor_mss;
16481a714ff2SRandall Stewart else if (new_tso < min_tso_segs)
1649d7313dc6SRandall Stewart new_tso = min_tso_segs;
1650d7313dc6SRandall Stewart if (new_tso > MAX_MSS_SENT)
1651d7313dc6SRandall Stewart new_tso = MAX_MSS_SENT;
1652d7313dc6SRandall Stewart new_tso *= segsiz;
16531a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso,
16541a714ff2SRandall Stewart 0, 0, 0, 0, 0, 0, 1);
1655d7313dc6SRandall Stewart /*
1656d7313dc6SRandall Stewart * If we are not doing hardware pacing
1657d7313dc6SRandall Stewart * then we are done.
1658d7313dc6SRandall Stewart */
1659d7313dc6SRandall Stewart if (te == NULL) {
1660d7313dc6SRandall Stewart if (err)
1661d7313dc6SRandall Stewart *err = 0;
1662d7313dc6SRandall Stewart return(new_tso);
1663d7313dc6SRandall Stewart }
1664d7313dc6SRandall Stewart /*
1665d7313dc6SRandall Stewart * For hardware pacing we look at the
1666d7313dc6SRandall Stewart * rate you are sending at and compare
1667d7313dc6SRandall Stewart * that to the rate you have in hardware.
1668d7313dc6SRandall Stewart *
1669d7313dc6SRandall Stewart * If the hardware rate is slower than your
1670d7313dc6SRandall Stewart * software rate then you are in error and
1671d7313dc6SRandall Stewart * we will build a queue in our hardware which
1672d7313dc6SRandall Stewart * is probably not desired; in such a case
1673d7313dc6SRandall Stewart * just return the non-hardware TSO size.
1674d7313dc6SRandall Stewart *
1675d7313dc6SRandall Stewart * If the rate in hardware is faster (which
1676d7313dc6SRandall Stewart * it should be) then look at how long it
1677d7313dc6SRandall Stewart * takes to send one ethernet segment size at
1678d7313dc6SRandall Stewart * your b/w and compare that to the time it
1679d7313dc6SRandall Stewart * takes to send at the rate you had selected.
1680d7313dc6SRandall Stewart *
1681d7313dc6SRandall Stewart * If your time is greater (which we hope it is)
1682d7313dc6SRandall Stewart * we get the delta between the two, and then
1683d7313dc6SRandall Stewart * divide that into your pacing time. This tells
1684d7313dc6SRandall Stewart * us how many MSS you can send down at once (rounded up).
1685d7313dc6SRandall Stewart *
1686d7313dc6SRandall Stewart * Note we also double this value if the b/w is over
1687d7313dc6SRandall Stewart * 100Mbps. If it's over 500Mbps we just set you to the
1688d7313dc6SRandall Stewart * max (43 segments).
1689d7313dc6SRandall Stewart */
1690d7313dc6SRandall Stewart if (te->rate > FIVE_HUNDRED_MBPS)
16911a714ff2SRandall Stewart goto max;
1692d7313dc6SRandall Stewart if (te->rate == bw) {
1693d7313dc6SRandall Stewart /* We are pacing at exactly the hdwr rate */
16941a714ff2SRandall Stewart max:
16951a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso,
16961a714ff2SRandall Stewart te->rate, te->time_between, (uint32_t)0,
16971a714ff2SRandall Stewart (segsiz * MAX_MSS_SENT), 0, 0, 3);
1698d7313dc6SRandall Stewart return (segsiz * MAX_MSS_SENT);
1699d7313dc6SRandall Stewart }
1700d7313dc6SRandall Stewart lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
1701d7313dc6SRandall Stewart res = lentim / bw;
1702d7313dc6SRandall Stewart if (res > te->time_between) {
17031a714ff2SRandall Stewart uint32_t delta, segs, res_div;
1704d7313dc6SRandall Stewart
17051a714ff2SRandall Stewart res_div = ((res * num_of_waits_allowed) + wait_time_floor);
1706d7313dc6SRandall Stewart delta = res - te->time_between;
17071a714ff2SRandall Stewart segs = (res_div + delta - 1)/delta;
1708d7313dc6SRandall Stewart if (segs < min_tso_segs)
1709d7313dc6SRandall Stewart segs = min_tso_segs;
17101a714ff2SRandall Stewart if (segs < rs_hw_floor_mss)
17111a714ff2SRandall Stewart segs = rs_hw_floor_mss;
1712d7313dc6SRandall Stewart if (segs > MAX_MSS_SENT)
1713d7313dc6SRandall Stewart segs = MAX_MSS_SENT;
1714d7313dc6SRandall Stewart segs *= segsiz;
17151a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso,
17161a714ff2SRandall Stewart te->rate, te->time_between, (uint32_t)res,
17171a714ff2SRandall Stewart segs, res_div, 1, 3);
1718d7313dc6SRandall Stewart if (err)
1719d7313dc6SRandall Stewart *err = 0;
1720d7313dc6SRandall Stewart if (segs < new_tso) {
1721d7313dc6SRandall Stewart /* unexpected ? */
1722d7313dc6SRandall Stewart return(new_tso);
1723d7313dc6SRandall Stewart } else {
1724d7313dc6SRandall Stewart return (segs);
1725d7313dc6SRandall Stewart }
1726d7313dc6SRandall Stewart } else {
1727d7313dc6SRandall Stewart /*
1728d7313dc6SRandall Stewart * Your time is smaller, which means
1729d7313dc6SRandall Stewart * we will grow a queue on our
1730d7313dc6SRandall Stewart * hardware. Send back the non-hardware
1731d7313dc6SRandall Stewart * rate.
1732d7313dc6SRandall Stewart */ 17331a714ff2SRandall Stewart tcp_log_pacing_size(tp, bw, segsiz, new_tso, 17341a714ff2SRandall Stewart te->rate, te->time_between, (uint32_t)res, 17351a714ff2SRandall Stewart 0, 0, 0, 4); 1736d7313dc6SRandall Stewart if (err) 1737d7313dc6SRandall Stewart *err = -1; 1738d7313dc6SRandall Stewart return (new_tso); 1739d7313dc6SRandall Stewart } 1740d7313dc6SRandall Stewart } 1741d7313dc6SRandall Stewart 17421a714ff2SRandall Stewart uint64_t 17431a714ff2SRandall Stewart tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp) 17441a714ff2SRandall Stewart { 17451a714ff2SRandall Stewart struct epoch_tracker et; 17461a714ff2SRandall Stewart struct tcp_rate_set *rs; 17471a714ff2SRandall Stewart uint64_t rate_ret; 17481a714ff2SRandall Stewart 17491a714ff2SRandall Stewart NET_EPOCH_ENTER(et); 17501a714ff2SRandall Stewart use_next_interface: 17511a714ff2SRandall Stewart rs = find_rs_for_ifp(ifp); 17521a714ff2SRandall Stewart if (rs == NULL) { 17531a714ff2SRandall Stewart /* This interface does not do ratelimiting */ 17541a714ff2SRandall Stewart rate_ret = 0; 17551a714ff2SRandall Stewart } else if (rs->rs_flags & RS_IS_DEFF) { 17561a714ff2SRandall Stewart /* We need to find the real interface */ 17571a714ff2SRandall Stewart struct ifnet *tifp; 17581a714ff2SRandall Stewart 17591a714ff2SRandall Stewart tifp = rt_find_real_interface(ifp, inp, NULL); 17601a714ff2SRandall Stewart if (tifp == NULL) { 17611a714ff2SRandall Stewart NET_EPOCH_EXIT(et); 17621a714ff2SRandall Stewart return (0); 17631a714ff2SRandall Stewart } 17641a714ff2SRandall Stewart ifp = tifp; 17651a714ff2SRandall Stewart goto use_next_interface; 17661a714ff2SRandall Stewart } else { 17671a714ff2SRandall Stewart /* Lets return the highest rate this guy has */ 17681a714ff2SRandall Stewart rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate; 17691a714ff2SRandall Stewart } 17701a714ff2SRandall Stewart NET_EPOCH_EXIT(et); 17711a714ff2SRandall Stewart return(rate_ret); 17721a714ff2SRandall Stewart } 17731a714ff2SRandall Stewart 177420abea66SRandall Stewart static eventhandler_tag rl_ifnet_departs; 177520abea66SRandall Stewart static eventhandler_tag rl_ifnet_arrives; 177620abea66SRandall Stewart static eventhandler_tag rl_shutdown_start; 177720abea66SRandall Stewart 177820abea66SRandall Stewart static void 177920abea66SRandall Stewart tcp_rs_init(void *st __unused) 178020abea66SRandall Stewart { 178120abea66SRandall Stewart CK_LIST_INIT(&int_rs); 178220abea66SRandall Stewart rs_number_alive = 0; 1783c012cfe6SEd Maste rs_number_dead = 0; 178420abea66SRandall Stewart mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); 178520abea66SRandall Stewart rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, 178620abea66SRandall Stewart tcp_rl_ifnet_departure, 178720abea66SRandall Stewart NULL, EVENTHANDLER_PRI_ANY); 178820abea66SRandall Stewart rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, 178920abea66SRandall Stewart tcp_rl_ifnet_link, 179020abea66SRandall Stewart NULL, EVENTHANDLER_PRI_ANY); 179120abea66SRandall Stewart rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, 179220abea66SRandall Stewart tcp_rl_shutdown, NULL, 179320abea66SRandall Stewart SHUTDOWN_PRI_FIRST); 179420abea66SRandall Stewart printf("TCP_ratelimit: Is now initialized\n"); 179520abea66SRandall Stewart } 179620abea66SRandall Stewart 179720abea66SRandall Stewart SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); 179820abea66SRandall Stewart #endif 1799
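
/*
 * Editor's illustrative sketch (not part of the original source): a
 * user-space rendering of the software-only burst-size arithmetic that
 * tcp_get_pacing_burst_size_w_divisor() performs above.  The helper and
 * macro names and the example numbers are made up for illustration; the
 * sysctl-driven floors (rs_floor_mss, even_num_segs/even_threshold) and
 * the hardware-rate comparison are deliberately left out.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_MAX_MSS_SENT	43	/* mirrors MAX_MSS_SENT above */

static uint32_t
example_sw_burst_size(uint64_t bw, uint32_t segsiz, int can_use_1mss,
    int divisor)
{
	uint64_t bytes;
	uint32_t new_tso, min_tso_segs;

	/* Take one divisor'th of a second of data; bw/1000 is ~1ms worth. */
	bytes = bw / (uint64_t)(divisor > 0 ? divisor : 1000);
	if (bytes > 0xffff)	/* a single TSO send is capped at 64K */
		bytes = 0xffff;
	/* Round up to whole segments. */
	new_tso = (uint32_t)((bytes + segsiz - 1) / segsiz);
	min_tso_segs = can_use_1mss ? 1 : 2;
	if (new_tso < min_tso_segs)
		new_tso = min_tso_segs;
	if (new_tso > EX_MAX_MSS_SENT)
		new_tso = EX_MAX_MSS_SENT;
	return (new_tso * segsiz);
}

int
main(void)
{
	/* 100Mbps = 12,500,000 bytes/sec, 1460-byte segments, divisor 1000. */
	printf("burst = %u bytes\n",
	    (unsigned)example_sw_burst_size(12500000ULL, 1460, 0, 1000));
	/* Prints "burst = 13140 bytes": ceil(12500/1460) = 9 segments. */
	return (0);
}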
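
/*
 * Editor's illustrative sketch (not part of the original source): one way a
 * pacing TCP stack might drive the API above.  The helper name, the crte
 * bookkeeping and the fallback policy are assumptions; what it does rely on
 * is visible in the code above: the inpcb write lock must be held (both
 * entry points assert it), and the error pointer must be non-NULL because
 * tcp_set_pacing_rate() writes through it on success.  It assumes the
 * surrounding file's headers (netinet/tcp_var.h, netinet/tcp_ratelimit.h).
 */
static const struct tcp_hwrate_limit_table *
example_update_pacing(struct tcpcb *tp, struct ifnet *ifp,
    const struct tcp_hwrate_limit_table *crte, uint64_t bw_bytes_per_sec,
    uint32_t segsiz, uint32_t *burst_out)
{
	const struct tcp_hwrate_limit_table *rte;
	uint64_t lower_rate = 0;
	int error = 0;

	/* Ask for a hardware rate at least as fast as bw; a substitute is ok. */
	if (crte == NULL)
		rte = tcp_set_pacing_rate(tp, ifp, bw_bytes_per_sec,
		    RS_PACING_GEQ | RS_PACING_SUB_OK, &error, &lower_rate);
	else
		rte = tcp_chg_pacing_rate(crte, tp, ifp, bw_bytes_per_sec,
		    RS_PACING_GEQ | RS_PACING_SUB_OK, &error, &lower_rate);
	/*
	 * Size each send for the software rate, letting the hardware entry
	 * (if any) stretch the burst; divisor 0 selects the default policy.
	 */
	*burst_out = tcp_get_pacing_burst_size_w_divisor(tp, bw_bytes_per_sec,
	    segsiz, 0, rte, &error, 0);
	/* A NULL return simply means: keep pacing purely in software. */
	return (rte);
}
/*
 * When the connection stops pacing, the stack would hand the entry back
 * with tcp_rel_pacing_rate(rte, tp), which drops the flow accounting and
 * detaches the send tag as shown above.
 */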