/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"

/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp.  Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire.  It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
 * First, and probably the main way it is used by Rack and BBR, it can
 * be used to call tcp_output() of a transport stack at some time in the future.
 * The normal way this is done is that tcp_output() of the stack schedules
 * itself to be called again by calling tcp_hpts_insert(tcpcb, slot).  The
 * slot is the time from now that the stack wants to be called but it
 * must be converted to tcp_hpts's notion of slot.  This is done with
 * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS.  So a typical
 * call from the tcp_output() routine might look like:
 *
 * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add near
 * its top a check to prevent unwanted calls (from user land or the
 * arrival of incoming ack's).  So it would add something like:
 *
 * if (tcp_in_hpts(inp))
 *	return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare bones example and the stack will probably
 * have more considerations than just the above.
 *
 * In order to run input queued segments from the HPTS context the
 * tcp stack must define an input function for
 * tfb_do_queued_segments().  This function understands
 * how to dequeue an array of packets that were input and
 * knows how to call the correct processing routine.
 *
 * Locking in this is important as well, so most likely the
 * stack will need to define tfb_do_segment_nounlock(),
 * splitting tfb_do_segment() into two parts.  The main processing
 * part does not unlock the INP and returns a value of 1 or 0.
 * It returns 0 if all is well and the lock was not released.  It
 * returns 1 if we had to destroy the TCB (a reset received etc).
 * The remains of tfb_do_segment() then become just a simple call
 * to the tfb_do_segment_nounlock() function that checks the return
 * code and possibly unlocks.
 *
 * The stack must also set the flag on the INP that it supports this
 * feature i.e. INP_SUPPORTS_MBUFQ.  The LRO code recognizes
 * this flag as well and will queue packets when it is set.
 * There are other flags as well, INP_MBUF_QUEUE_READY and
 * INP_DONT_SACK_QUEUE.  The first flag tells the LRO code
 * that we are in the pacer for output so there is no
 * need to wake up the hpts system to get immediate
 * input.  The second tells the LRO code that it is okay,
 * if a SACK arrives, to still defer input and let
 * the current hpts timer run (this is usually set when
 * a rack timer is up so we know SACK's are happening
 * on the connection already and don't want to wake up yet).
 *
 * There is a common function within the rack_bbr_common code
 * version i.e. ctf_do_queued_segments().  This function
 * knows how to take the input queue of packets from tp->t_inqueue
 * and process them, digging out all the arguments, calling any bpf tap and
 * calling into tfb_do_segment_nounlock().  The common
 * function (ctf_do_queued_segments()) requires that
 * you have defined the tfb_do_segment_nounlock() as
 * described above.
 */

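/*
 * Illustrative sketch of the tfb_do_segment() split described above.
 * This is not compiled here, and my_stack_do_segment() and
 * my_stack_do_segment_nounlock() are hypothetical stand-ins for a
 * stack's own handlers:
 *
 *	static int
 *	my_stack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, ...)
 *	{
 *		// all input processing; the INP stays write-locked
 *		return (0);	// 0 == lock still held, 1 == TCB destroyed
 *	}
 *
 *	static void
 *	my_stack_do_segment(struct tcpcb *tp, struct mbuf *m, ...)
 *	{
 *		if (my_stack_do_segment_nounlock(tp, m, ...) == 0)
 *			INP_WUNLOCK(tptoinpcb(tp));
 *	}
 */
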
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <net/route.h>
#include <net/vnet.h>

#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
#endif

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>

#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif

/*
 * The hpts uses a 102400-slot wheel.  The wheel
 * defines the time in 10 usec increments (102400 x 10).
 * This gives a range of 10 usec - 1024 ms to place
 * an entry within.  If the user requests more than
 * 1.024 seconds, a remainder is attached and the hpts,
 * when seeing the remainder, will re-insert the
 * inpcb forward in time from where it is until
 * the remainder is zero.
 */

#define NUM_OF_HPTSI_SLOTS 102400

/* Each hpts has its own p_mtx which is used for locking */
#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
#define	HPTS_TRYLOCK(hpts)	mtx_trylock(&(hpts)->p_mtx)
#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)

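/*
 * Worked example of the wheel math above (illustrative numbers only):
 * with 10 usec slots, a request of 2500 usec lands 250 slots ahead of
 * the current slot.  A request of 2 seconds (200000 slots) cannot fit
 * on a 102400-slot wheel, so the entry is placed as far out as the
 * wheel allows and the slots that did not fit are carried as a
 * remainder (tp->t_hpts_request) and consumed on later passes.
 */
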
struct tcp_hpts_entry {
	/* Cache line 0x00 */
	struct mtx p_mtx;	/* Mutex for hpts */
	struct timeval p_mysleep;	/* Our min sleep time */
	uint64_t syscall_cnt;
	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */
	uint16_t p_hpts_active; /* Flag that says hpts is awake  */
	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */
	uint32_t p_runningslot; /* Current tick we are at if we are running */
	uint32_t p_prev_slot;	/* Previous slot we were on */
	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
	uint32_t p_nxt_slot;	/* The next slot outside the current range of
				 * slots that the hpts is running on. */
	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
	uint32_t p_lasttick;	/* Last tick before the current one */
	uint8_t p_direct_wake :1, /* boolean */
		p_on_min_sleep:1, /* boolean */
		p_hpts_wake_scheduled:1, /* boolean */
		hit_callout_thresh:1,
		p_avail:4;
	uint8_t p_fill[3];	  /* Fill to 32 bits */
	/* Cache line 0x40 */
	struct hptsh {
		TAILQ_HEAD(, tcpcb)	head;
		uint32_t		count;
		uint32_t		gencnt;
	} *p_hptss;			/* Hptsi wheel */
	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
					 * of 255ms */
	uint32_t overidden_sleep;	/* what was overridden by min-sleep for logging */
	uint32_t saved_lasttick;	/* for logging */
	uint32_t saved_curtick;		/* for logging */
	uint32_t saved_curslot;		/* for logging */
	uint32_t saved_prev_slot;	/* for logging */
	uint32_t p_delayed_by;		/* How much were we delayed by */
	/* Cache line 0x80 */
	struct sysctl_ctx_list hpts_ctx;
	struct sysctl_oid *hpts_root;
	struct intr_event *ie;
	void *ie_cookie;
	uint16_t p_num;		/* The hpts number one per cpu */
	uint16_t p_cpu;		/* The hpts CPU */
	/* There is extra space in here */
	/* Cache line 0x100 */
	struct callout co __aligned(CACHE_LINE_SIZE);
} __aligned(CACHE_LINE_SIZE);

static struct tcp_hptsi {
	struct cpu_group **grps;
	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
	uint32_t *cts_last_ran;
	uint32_t grp_cnt;
	uint32_t rp_num_hptss;	/* Number of hpts threads */
} tcp_pace;

static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
#endif
static int tcp_use_irq_cpu = 0;
static int hpts_does_tp_logging = 0;

static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
static void tcp_hpts_thread(void *ctx);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Hpts controls");
SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TCP Hpts statistics");

#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)

static int32_t tcp_hpts_precision = 120;

static struct hpts_domain_info {
	int count;
	int cpu[MAXCPU];
} hpts_domains[MAXMEMDOM];

counter_u64_t hpts_hopelessly_behind;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
    &hpts_hopelessly_behind,
    "Number of times hpts could not catch up and was behind hopelessly");

counter_u64_t hpts_loops;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
    &back_tosleep, "Number of times hpts found no tcbs");

counter_u64_t combined_wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
    &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
    &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t hpts_direct_call;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
    &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");

counter_u64_t hpts_wake_timeout;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
    &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");

counter_u64_t hpts_direct_awakening;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
    &hpts_direct_awakening, "Number of times hpts threads woke up via a direct awakening");

counter_u64_t hpts_back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
    &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep, no work");

counter_u64_t cpu_uses_flowid;
counter_u64_t cpu_uses_random;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
    &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
    &cpu_uses_random, "Number of times when setting cpuid we used a random value");

TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
    &tcp_bind_threads, 2,
    "Thread Binding tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
    &tcp_use_irq_cpu, 0,
    "Use of irq CPU tunable");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for PRE() precision of callout");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
    &conn_cnt_thresh, 0,
    "How many connections (below) make us use the callout based mechanism");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &hpts_does_tp_logging, 0,
    "Do we add to any tp that has logging on pacer logs");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
    &dynamic_min_sleep, 250,
    "What is the dynamic minsleep value?");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
    &dynamic_max_sleep, 5000,
    "What is the dynamic maxsleep value?");

static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
    &max_pacer_loops, 10,
    "What is the maximum number of times the pacer will loop trying to catch up");

#define	HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)

static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;

static int
sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = hpts_sleep_max;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < (dynamic_min_sleep/HPTS_TICKS_PER_SLOT)) ||
		     (new > HPTS_MAX_SLEEP_ALLOWED))
			error = EINVAL;
		else
			hpts_sleep_max = new;
	}
	return (error);
}

static int
sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = tcp_min_hptsi_time;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < LOWEST_SLEEP_ALLOWED)
			error = EINVAL;
		else
			tcp_min_hptsi_time = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
    CTLTYPE_UINT | CTLFLAG_RW,
    &hpts_sleep_max, 0,
    &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
    "Maximum time hpts will sleep in slots");

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
    CTLTYPE_UINT | CTLFLAG_RW,
    &tcp_min_hptsi_time, 0,
    &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
    "The minimum time the hpts must sleep before processing more slots");

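/*
 * Example (illustrative only): the maxsleep bound above is expressed in
 * wheel slots of 10 usec each, so from userland
 *
 *	sysctl net.inet.tcp.hpts.maxsleep=25600
 *
 * would cap the hpts sleep at 25600 slots, i.e. roughly 256 ms.
 */
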
the next callout"); 417d7955cc0SRandall Stewart SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, 418d7955cc0SRandall Stewart &tcp_hpts_no_wake_over_thresh, 0, 419d7955cc0SRandall Stewart "When we are over the threshold on the pacer do we prohibit wakeups?"); 4203ee9c3c4SRandall Stewart 421c2a69e84SGleb Smirnoff static uint16_t 422c2a69e84SGleb Smirnoff hpts_random_cpu(void) 423c2a69e84SGleb Smirnoff { 424c2a69e84SGleb Smirnoff uint16_t cpuid; 425c2a69e84SGleb Smirnoff uint32_t ran; 426c2a69e84SGleb Smirnoff 427c2a69e84SGleb Smirnoff ran = arc4random(); 428c2a69e84SGleb Smirnoff cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); 429c2a69e84SGleb Smirnoff return (cpuid); 430c2a69e84SGleb Smirnoff } 431c2a69e84SGleb Smirnoff 4323ee9c3c4SRandall Stewart static void 4333b0b41e6SRandall Stewart tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, 43463446fd3SGleb Smirnoff int slots_to_run, int idx, bool from_callout) 4353ee9c3c4SRandall Stewart { 4363b0b41e6SRandall Stewart union tcp_log_stackspecific log; 437d7955cc0SRandall Stewart /* 438d7955cc0SRandall Stewart * Unused logs are 439d7955cc0SRandall Stewart * 64 bit - delRate, rttProp, bw_inuse 440d7955cc0SRandall Stewart * 16 bit - cwnd_gain 441a370832bSGleb Smirnoff * 8 bit - bbr_state, bbr_substate, inhpts; 442d7955cc0SRandall Stewart */ 4433b0b41e6SRandall Stewart memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4443b0b41e6SRandall Stewart log.u_bbr.flex1 = hpts->p_nxt_slot; 4453b0b41e6SRandall Stewart log.u_bbr.flex2 = hpts->p_cur_slot; 4463b0b41e6SRandall Stewart log.u_bbr.flex3 = hpts->p_prev_slot; 4473b0b41e6SRandall Stewart log.u_bbr.flex4 = idx; 4483b0b41e6SRandall Stewart log.u_bbr.flex5 = hpts->p_curtick; 4493b0b41e6SRandall Stewart log.u_bbr.flex6 = hpts->p_on_queue_cnt; 450d7955cc0SRandall Stewart log.u_bbr.flex7 = hpts->p_cpu; 451d7955cc0SRandall Stewart log.u_bbr.flex8 = (uint8_t)from_callout; 452d7955cc0SRandall Stewart log.u_bbr.inflight = slots_to_run; 4533b0b41e6SRandall Stewart log.u_bbr.applimited = hpts->overidden_sleep; 4543b0b41e6SRandall Stewart log.u_bbr.delivered = hpts->saved_curtick; 4553b0b41e6SRandall Stewart log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); 4563b0b41e6SRandall Stewart log.u_bbr.epoch = hpts->saved_curslot; 4573b0b41e6SRandall Stewart log.u_bbr.lt_epoch = hpts->saved_prev_slot; 4583b0b41e6SRandall Stewart log.u_bbr.pkts_out = hpts->p_delayed_by; 4593b0b41e6SRandall Stewart log.u_bbr.lost = hpts->p_hpts_sleep_time; 460d7955cc0SRandall Stewart log.u_bbr.pacing_gain = hpts->p_cpu; 461d7955cc0SRandall Stewart log.u_bbr.pkt_epoch = hpts->p_runningslot; 462d7955cc0SRandall Stewart log.u_bbr.use_lt_bw = 1; 4633b0b41e6SRandall Stewart TCP_LOG_EVENTP(tp, NULL, 4649eb0e832SGleb Smirnoff &tptosocket(tp)->so_rcv, 4659eb0e832SGleb Smirnoff &tptosocket(tp)->so_snd, 4663b0b41e6SRandall Stewart BBR_LOG_HPTSDIAG, 0, 4673b0b41e6SRandall Stewart 0, &log, false, tv); 4683ee9c3c4SRandall Stewart } 4693ee9c3c4SRandall Stewart 4703ee9c3c4SRandall Stewart static void 471d7955cc0SRandall Stewart tcp_wakehpts(struct tcp_hpts_entry *hpts) 472d7955cc0SRandall Stewart { 473d7955cc0SRandall Stewart HPTS_MTX_ASSERT(hpts); 474d7955cc0SRandall Stewart 475d7955cc0SRandall Stewart if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) { 476d7955cc0SRandall Stewart hpts->p_direct_wake = 0; 477d7955cc0SRandall Stewart return; 478d7955cc0SRandall Stewart } 479d7955cc0SRandall Stewart if (hpts->p_hpts_wake_scheduled == 0) { 480d7955cc0SRandall Stewart 
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
}

static void
hpts_timeout_swi(void *arg)
{
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
}

static void
tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct hptsh *hptsh;

	INP_WLOCK_ASSERT(inp);
	HPTS_MTX_ASSERT(hpts);
	MPASS(hpts->p_cpu == tp->t_hpts_cpu);
	MPASS(!(inp->inp_flags & INP_DROPPED));

	hptsh = &hpts->p_hptss[tp->t_hpts_slot];

	if (tp->t_in_hpts == IHPTS_NONE) {
		tp->t_in_hpts = IHPTS_ONQUEUE;
		in_pcbref(inp);
	} else if (tp->t_in_hpts == IHPTS_MOVING) {
		tp->t_in_hpts = IHPTS_ONQUEUE;
	} else
		MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
	tp->t_hpts_gencnt = hptsh->gencnt;

	TAILQ_INSERT_TAIL(&hptsh->head, tp, t_hpts);
	hptsh->count++;
	hpts->p_on_queue_cnt++;
}

static struct tcp_hpts_entry *
tcp_hpts_lock(struct tcpcb *tp)
{
	struct tcp_hpts_entry *hpts;

	INP_LOCK_ASSERT(tptoinpcb(tp));

	hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
	HPTS_LOCK(hpts);

	return (hpts);
}

static void
tcp_hpts_release(struct tcpcb *tp)
{
	bool released __diagused;

	tp->t_in_hpts = IHPTS_NONE;
	released = in_pcbrele_wlocked(tptoinpcb(tp));
	MPASS(released == false);
}

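/*
 * Summary of the t_in_hpts states used by the functions above and
 * below (descriptive note derived from the insert/remove/run paths):
 *
 *	IHPTS_NONE	not on the wheel, no inpcb reference held by hpts.
 *	IHPTS_ONQUEUE	linked on a wheel bucket, an inpcb reference is held.
 *	IHPTS_MOVING	tcp_hptsi() currently owns the bucket the tcpcb is
 *			on, so tcp_hpts_remove() can only mark it (slot = -1)
 *			and tcp_hptsi() later re-inserts or releases it.
 */
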
/*
 * Initialize tcpcb to get ready for use with HPTS.  We will know which CPU
 * is preferred on the first incoming packet.  Before that avoid crowding
 * a single CPU with newborn connections and use a random one.
 * This initialization is normally called on a newborn tcpcb, but potentially
 * can be called once again if stack is switched.  In that case we inherit CPU
 * that the previous stack has set, be it random or not.  In extreme cases,
 * e.g. syzkaller fuzzing, a tcpcb can already be in HPTS in IHPTS_MOVING state
 * and has never received a first packet.
 */
void
tcp_hpts_init(struct tcpcb *tp)
{

	if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) {
		tp->t_hpts_cpu = hpts_random_cpu();
		MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
	}
}

/*
 * Called normally with the INP_LOCKED but it
 * does not matter; the hpts lock is the key,
 * but the lock order allows us to hold the
 * INP lock and then get the hpts lock.
 */
void
tcp_hpts_remove(struct tcpcb *tp)
{
	struct tcp_hpts_entry *hpts;
	struct hptsh *hptsh;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	hpts = tcp_hpts_lock(tp);
	if (tp->t_in_hpts == IHPTS_ONQUEUE) {
		hptsh = &hpts->p_hptss[tp->t_hpts_slot];
		tp->t_hpts_request = 0;
		if (__predict_true(tp->t_hpts_gencnt == hptsh->gencnt)) {
			TAILQ_REMOVE(&hptsh->head, tp, t_hpts);
			MPASS(hptsh->count > 0);
			hptsh->count--;
			MPASS(hpts->p_on_queue_cnt > 0);
			hpts->p_on_queue_cnt--;
			tcp_hpts_release(tp);
		} else {
			/*
			 * tcp_hptsi() now owns the TAILQ head of this inp.
			 * Can't TAILQ_REMOVE, just mark it.
			 */
#ifdef INVARIANTS
			struct tcpcb *tmp;

			TAILQ_FOREACH(tmp, &hptsh->head, t_hpts)
				MPASS(tmp != tp);
#endif
			tp->t_in_hpts = IHPTS_MOVING;
			tp->t_hpts_slot = -1;
		}
	} else if (tp->t_in_hpts == IHPTS_MOVING) {
		/*
		 * Handle a special race condition:
		 * tcp_hptsi() moves inpcb to detached tailq
		 * tcp_hpts_remove() marks as IHPTS_MOVING, slot = -1
		 * tcp_hpts_insert() sets slot to a meaningful value
		 * tcp_hpts_remove() again (we are here!), then in_pcbdrop()
		 * tcp_hptsi() finds pcb with meaningful slot and INP_DROPPED
		 */
		tp->t_hpts_slot = -1;
	}
	HPTS_UNLOCK(hpts);
}

static inline int
hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
	/*
	 * Given a slot on the wheel, what slot
	 * is that plus ticks out?
	 */
	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
	return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}

static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
	/*
	 * Given a timestamp in ticks (so by
	 * default to get it to a real time one
	 * would multiply by 10, i.e. the number
	 * of ticks in a slot) map it to our limited
	 * space wheel.
	 */
	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}

static inline int
hpts_slots_diff(int prev_slot, int slot_now)
{
	/*
	 * Given two slots that are someplace
	 * on our wheel, how far are they apart?
	 */
	if (slot_now > prev_slot)
		return (slot_now - prev_slot);
	else if (slot_now == prev_slot)
		/*
		 * Special case, same means we can go all of our
		 * wheel less one slot.
		 */
		return (NUM_OF_HPTSI_SLOTS - 1);
	else
		return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}

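/*
 * Worked examples for the three helpers above (illustrative only,
 * with NUM_OF_HPTSI_SLOTS == 102400):
 *
 *	hpts_slot(102398, 4)	   == 2       (wraps past the end of the wheel)
 *	tick_to_wheel(204801)	   == 1       (204801 % 102400)
 *	hpts_slots_diff(102398, 2) == 4       (wrap-aware distance)
 *	hpts_slots_diff(7, 7)	   == 102399  (same slot: nearly a full wheel)
 */
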
/*
 * Given a slot on the wheel that is the current time
 * mapped to the wheel (wheel_slot), what is the maximum
 * distance forward that can be obtained without
 * wrapping past either prev_slot or running_slot
 * depending on the hpts state?  Also if passed
 * a uint32_t *, fill it with the slot location.
 *
 * Note if you do not give this function the current
 * time (that you think it is) mapped to the wheel slot
 * then the results will not be what you expect and
 * could lead to invalid inserts.
 */
static inline int32_t
max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
{
	uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;

	if ((hpts->p_hpts_active == 1) &&
	    (hpts->p_wheel_complete == 0)) {
		end_slot = hpts->p_runningslot;
		/* Back up one tick */
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
	} else {
		/*
		 * For the case where we are
		 * not active, or we have
		 * completed the pass over
		 * the wheel, we can use the
		 * prev tick and subtract one from it.  This puts us
		 * as far out as possible on the wheel.
		 */
		end_slot = hpts->p_prev_slot;
		if (end_slot == 0)
			end_slot = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_slot--;
		if (target_slot)
			*target_slot = end_slot;
		/*
		 * Now we have close to the full wheel left minus the
		 * time it has been since the pacer went to sleep.  Note
		 * that wheel_slot, passed in, should be the current time
		 * from the perspective of the caller, mapped to the wheel.
		 */
		if (hpts->p_prev_slot != wheel_slot)
			dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
		else
			dis_to_travel = 1;
		/*
		 * dis_to_travel in this case is the space from when the
		 * pacer stopped (p_prev_slot) and where our wheel_slot
		 * is now.  To know how many slots we can put it in we
		 * subtract from the wheel size.  We would not want
		 * to place something after p_prev_slot or it will
		 * get run too soon.
		 */
		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
	}
	/*
	 * So how many slots are open between p_runningslot -> p_cur_slot?
	 * That is what is currently un-available for insertion.  Special
	 * case when we are at the last slot, this gets 1, so that
	 * the answer to how many slots are available is all but 1.
	 */
	if (hpts->p_runningslot == hpts->p_cur_slot)
		dis_to_travel = 1;
	else
		dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
	/*
	 * How long has the pacer been running?
	 */
	if (hpts->p_cur_slot != wheel_slot) {
		/* The pacer is a bit late */
		pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
	} else {
		/* The pacer is right on time, now == pacers start time */
		pacer_to_now = 0;
	}
	/*
	 * To get the number left we can insert into we simply
	 * subtract the distance the pacer has to run from how
	 * many slots there are.
	 */
	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
	/*
	 * Now how many of those we will eat due to the pacer's
	 * time (p_cur_slot) of start being behind the
	 * real time (wheel_slot)?
	 */
	if (avail_on_wheel <= pacer_to_now) {
		/*
		 * Wheel wrap, we can't fit on the wheel, that
		 * is unusual, the system must be way overloaded!
		 * Insert into the assured slot, and return special
		 * "0".
		 */
		counter_u64_add(combined_wheel_wrap, 1);
		if (target_slot)
			*target_slot = hpts->p_nxt_slot;
		return (0);
	} else {
		/*
		 * We know how many slots are open
		 * on the wheel (the reverse of what
		 * is left to run).  Take away the time
		 * the pacer started to now (wheel_slot)
		 * and that tells you how many slots are
		 * open that can be inserted into that won't
		 * be touched by the pacer until later.
		 */
		return (avail_on_wheel - pacer_to_now);
	}
}

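/*
 * Worked example for max_slots_available() (illustrative only): with
 * the pacer active mid-arc at p_runningslot == 100, p_cur_slot == 200
 * and the caller's wheel_slot also 200, the arc still owned by the
 * pacer is 100 slots, so 102400 - 100 = 102300 slots remain available
 * and *target_slot is set to 99, one slot behind p_runningslot.
 */
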
#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
    uint32_t hptsslot, int line)
{
	/*
	 * Sanity checks for the pacer with invariants
	 * on insert.
	 */
	KASSERT(hptsslot < NUM_OF_HPTSI_SLOTS,
	    ("hpts:%p tp:%p slot:%d > max", hpts, tp, hptsslot));
	if ((hpts->p_hpts_active) &&
	    (hpts->p_wheel_complete == 0)) {
		/*
		 * If the pacer is processing an arc
		 * of the wheel, we need to make
		 * sure we are not inserting within
		 * that arc.
		 */
		int distance, yet_to_run;

		distance = hpts_slots_diff(hpts->p_runningslot, hptsslot);
		if (hpts->p_runningslot != hpts->p_cur_slot)
			yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
		else
			yet_to_run = 0;	/* processing last slot */
		KASSERT(yet_to_run <= distance, ("hpts:%p tp:%p slot:%d "
		    "distance:%d yet_to_run:%d rs:%d cs:%d", hpts, tp,
		    hptsslot, distance, yet_to_run, hpts->p_runningslot,
		    hpts->p_cur_slot));
	}
}
#endif

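/*
 * Illustrative call pattern for the insert path below (hypothetical
 * stack code, not compiled here; my_pacing_delay_usecs stands in for
 * whatever delay the stack computed):
 *
 *	struct hpts_diag diag;
 *
 *	tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(my_pacing_delay_usecs),
 *	    __LINE__, &diag);
 *	// A non-zero diag.slot_remaining means the request exceeded the
 *	// wheel range and the excess was stored in tp->t_hpts_request.
 */
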
uint32_t
tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag)
{
	struct tcp_hpts_entry *hpts;
	struct timeval tv;
	uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
	int32_t wheel_slot, maxslots;
	bool need_wakeup = false;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	MPASS(!(tptoinpcb(tp)->inp_flags & INP_DROPPED));
	MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE));

	/*
	 * We now return the next-slot the hpts will be on, beyond its
	 * current run (if up) or where it was when it stopped if it is
	 * sleeping.
	 */
	hpts = tcp_hpts_lock(tp);
	microuptime(&tv);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
		diag->p_prev_slot = hpts->p_prev_slot;
		diag->p_runningslot = hpts->p_runningslot;
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
		diag->p_curtick = hpts->p_curtick;
		diag->p_lasttick = hpts->p_lasttick;
		diag->slot_req = slot;
		diag->p_on_min_sleep = hpts->p_on_min_sleep;
		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
	}
	if (slot == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		tp->t_hpts_request = 0;
		if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) {
			/*
			 * A sleeping hpts we want in next slot to run
			 * note that in this state p_prev_slot == p_cur_slot
			 */
			tp->t_hpts_slot = hpts_slot(hpts->p_prev_slot, 1);
			if ((hpts->p_on_min_sleep == 0) &&
			    (hpts->p_hpts_active == 0))
				need_wakeup = true;
		} else
			tp->t_hpts_slot = hpts->p_runningslot;
		if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
			tcp_hpts_insert_internal(tp, hpts);
		if (need_wakeup) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
		slot_on = hpts->p_nxt_slot;
		HPTS_UNLOCK(hpts);

		return (slot_on);
	}
	/* Get the current time relative to the wheel */
	wheel_cts = tcp_tv_to_hptstick(&tv);
	/* Map it onto the wheel */
	wheel_slot = tick_to_wheel(wheel_cts);
	/* Now what's the max we can place it at? */
	maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
	if (diag) {
		diag->wheel_slot = wheel_slot;
		diag->maxslots = maxslots;
		diag->wheel_cts = wheel_cts;
	}
	if (maxslots == 0) {
		/* The pacer is in a wheel wrap behind, yikes! */
		if (slot > 1) {
			/*
			 * Reduce by 1 to prevent a forever loop in
			 * case something else is wrong.  Note this
			 * probably does not hurt because if the pacer
			 * truly is so far behind, we will be
			 * > 1 second late calling anyway.
			 */
			slot--;
		}
		tp->t_hpts_slot = last_slot;
		tp->t_hpts_request = slot;
	} else if (maxslots >= slot) {
		/* It all fits on the wheel */
		tp->t_hpts_request = 0;
		tp->t_hpts_slot = hpts_slot(wheel_slot, slot);
	} else {
		/* It does not fit */
		tp->t_hpts_request = slot - maxslots;
		tp->t_hpts_slot = last_slot;
	}
	if (diag) {
		diag->slot_remaining = tp->t_hpts_request;
		diag->inp_hptsslot = tp->t_hpts_slot;
	}
#ifdef INVARIANTS
	check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line);
#endif
	if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
		tcp_hpts_insert_internal(tp, hpts);
	if ((hpts->p_hpts_active == 0) &&
	    (tp->t_hpts_request == 0) &&
	    (hpts->p_on_min_sleep == 0)) {
		/*
		 * The hpts is sleeping and NOT on a minimum
		 * sleep time, we need to figure out where
		 * it will wake up at and if we need to reschedule
		 * its time-out.
		 */
		uint32_t have_slept, yet_to_sleep;

		/* Now do we need to restart the hpts's timer? */
		have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
		if (have_slept < hpts->p_hpts_sleep_time)
			yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
		else {
			/* We are over-due */
			yet_to_sleep = 0;
			need_wakeup = 1;
		}
		if (diag) {
			diag->have_slept = have_slept;
			diag->yet_to_sleep = yet_to_sleep;
		}
		if (yet_to_sleep &&
		    (yet_to_sleep > slot)) {
			/*
			 * We need to reschedule the hpts's time-out.
			 */
			hpts->p_hpts_sleep_time = slot;
			need_new_to = slot * HPTS_TICKS_PER_SLOT;
		}
	}
	/*
	 * Now how far out is the hpts sleeping?  If active is 1, it is
	 * up and ticking and we do nothing; otherwise we may need to
	 * reschedule its callout if need_new_to is set from above.
	 */
	if (need_wakeup) {
		hpts->p_direct_wake = 1;
		tcp_wakehpts(hpts);
		if (diag) {
			diag->need_new_to = 0;
			diag->co_ret = 0xffff0000;
		}
	} else if (need_new_to) {
		int32_t co_ret;
		struct timeval tv;
		sbintime_t sb;

		tv.tv_sec = 0;
		tv.tv_usec = 0;
		while (need_new_to > HPTS_USEC_IN_SEC) {
			tv.tv_sec++;
			need_new_to -= HPTS_USEC_IN_SEC;
		}
		tv.tv_usec = need_new_to;
		sb = tvtosbt(tv);
		co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_swi, hpts, hpts->p_cpu,
		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
		if (diag) {
			diag->need_new_to = need_new_to;
			diag->co_ret = co_ret;
		}
	}
	slot_on = hpts->p_nxt_slot;
	HPTS_UNLOCK(hpts);

	return (slot_on);
}

static uint16_t
hpts_cpuid(struct tcpcb *tp, int *failed)
{
	struct inpcb *inp = tptoinpcb(tp);
	u_int cpuid;
#ifdef NUMA
	struct hpts_domain_info *di;
#endif

	*failed = 0;
	if (tp->t_flags2 & TF2_HPTS_CPU_SET) {
		return (tp->t_hpts_cpu);
	}
	/*
	 * If we are using the irq cpu set by LRO or
	 * the driver then it overrides all other domains.
	 */
	if (tcp_use_irq_cpu) {
		if (tp->t_lro_cpu == HPTS_CPU_NONE) {
			*failed = 1;
			return (0);
		}
		return (tp->t_lro_cpu);
	}
	/* If one is set the other must be the same */
#ifdef RSS
	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
	if (cpuid == NETISR_CPUID_NONE)
		return (hpts_random_cpu());
	else
		return (cpuid);
#endif
	/*
	 * We don't have a flowid -> cpuid mapping, so cheat and just map
	 * unknown cpuids to curcpu.  Not the best, but apparently better
	 * than defaulting to swi 0.
10283ee9c3c4SRandall Stewart */ 1029d7955cc0SRandall Stewart if (inp->inp_flowtype == M_HASHTYPE_NONE) { 1030d7955cc0SRandall Stewart counter_u64_add(cpu_uses_random, 1); 1031c2a69e84SGleb Smirnoff return (hpts_random_cpu()); 1032d7955cc0SRandall Stewart } 10334e255d74SAndrew Gallatin /* 10344e255d74SAndrew Gallatin * Hash to a thread based on the flowid. If we are using numa, 10354e255d74SAndrew Gallatin * then restrict the hash to the numa domain where the inp lives. 10364e255d74SAndrew Gallatin */ 10376e6439b2SRandall Stewart 10384e255d74SAndrew Gallatin #ifdef NUMA 10396e6439b2SRandall Stewart if ((vm_ndomains == 1) || 10406e6439b2SRandall Stewart (inp->inp_numa_domain == M_NODOM)) { 10414e255d74SAndrew Gallatin #endif 10423ee9c3c4SRandall Stewart cpuid = inp->inp_flowid % mp_ncpus; 10436e6439b2SRandall Stewart #ifdef NUMA 10446e6439b2SRandall Stewart } else { 10456e6439b2SRandall Stewart /* Hash into the cpu's that use that domain */ 10466e6439b2SRandall Stewart di = &hpts_domains[inp->inp_numa_domain]; 10476e6439b2SRandall Stewart cpuid = di->cpu[inp->inp_flowid % di->count]; 10486e6439b2SRandall Stewart } 10496e6439b2SRandall Stewart #endif 1050d7955cc0SRandall Stewart counter_u64_add(cpu_uses_flowid, 1); 10513ee9c3c4SRandall Stewart return (cpuid); 10523ee9c3c4SRandall Stewart } 10533ee9c3c4SRandall Stewart 10543ee9c3c4SRandall Stewart static void 1055d7955cc0SRandall Stewart tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) 1056d7955cc0SRandall Stewart { 10576e6439b2SRandall Stewart uint32_t t = 0, i; 1058d7955cc0SRandall Stewart 1059d7955cc0SRandall Stewart if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { 1060d7955cc0SRandall Stewart /* 1061d7955cc0SRandall Stewart * Find next slot that is occupied and use that to 1062d7955cc0SRandall Stewart * be the sleep time. 
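 *
 * (Worked example, not in the original: if p_cur_slot is 10 and the
 * first occupied slot is 13, the scan below starts at slot 11 and
 * breaks with i == 2, so the sleep time becomes min(3, hpts_sleep_max)
 * slots.)
 *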
1063d7955cc0SRandall Stewart */ 1064d7955cc0SRandall Stewart for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { 1065db0ac6deSCy Schubert if (TAILQ_EMPTY(&hpts->p_hptss[t].head) == 0) { 1066d7955cc0SRandall Stewart break; 1067d7955cc0SRandall Stewart } 1068d7955cc0SRandall Stewart t = (t + 1) % NUM_OF_HPTSI_SLOTS; 1069d7955cc0SRandall Stewart } 10706e6439b2SRandall Stewart KASSERT((i != NUM_OF_HPTSI_SLOTS), ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt)); 1071d7955cc0SRandall Stewart hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); 1072d7955cc0SRandall Stewart } else { 1073d7955cc0SRandall Stewart /* No one on the wheel sleep for all but 400 slots or sleep max */ 1074d7955cc0SRandall Stewart hpts->p_hpts_sleep_time = hpts_sleep_max; 1075d7955cc0SRandall Stewart } 1076d7955cc0SRandall Stewart } 1077d7955cc0SRandall Stewart 1078d7955cc0SRandall Stewart static int32_t 107963446fd3SGleb Smirnoff tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout) 10803ee9c3c4SRandall Stewart { 10813ee9c3c4SRandall Stewart struct tcpcb *tp; 10823ee9c3c4SRandall Stewart struct timeval tv; 1083d7955cc0SRandall Stewart int32_t slots_to_run, i, error; 10843b0b41e6SRandall Stewart int32_t loop_cnt = 0; 10853ee9c3c4SRandall Stewart int32_t did_prefetch = 0; 10863ee9c3c4SRandall Stewart int32_t prefetch_tp = 0; 10873b0b41e6SRandall Stewart int32_t wrap_loop_cnt = 0; 1088d7955cc0SRandall Stewart int32_t slot_pos_of_endpoint = 0; 1089d7955cc0SRandall Stewart int32_t orig_exit_slot; 10905cb73dbeSGleb Smirnoff bool completed_measure, seen_endpoint; 10915cb73dbeSGleb Smirnoff 10925cb73dbeSGleb Smirnoff completed_measure = false; 10935cb73dbeSGleb Smirnoff seen_endpoint = false; 10943ee9c3c4SRandall Stewart 10953ee9c3c4SRandall Stewart HPTS_MTX_ASSERT(hpts); 109643e8b279SGleb Smirnoff NET_EPOCH_ASSERT(); 10973b0b41e6SRandall Stewart /* record previous info for any logging */ 10983b0b41e6SRandall Stewart hpts->saved_lasttick = hpts->p_lasttick; 10993b0b41e6SRandall Stewart hpts->saved_curtick = hpts->p_curtick; 11003b0b41e6SRandall Stewart hpts->saved_curslot = hpts->p_cur_slot; 11013b0b41e6SRandall Stewart hpts->saved_prev_slot = hpts->p_prev_slot; 11023ee9c3c4SRandall Stewart 11033b0b41e6SRandall Stewart hpts->p_lasttick = hpts->p_curtick; 11043b0b41e6SRandall Stewart hpts->p_curtick = tcp_gethptstick(&tv); 1105175d4d69SGleb Smirnoff tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); 1106d7955cc0SRandall Stewart orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); 11073b0b41e6SRandall Stewart if ((hpts->p_on_queue_cnt == 0) || 11083b0b41e6SRandall Stewart (hpts->p_lasttick == hpts->p_curtick)) { 11093b0b41e6SRandall Stewart /* 11103b0b41e6SRandall Stewart * No time has yet passed, 11113b0b41e6SRandall Stewart * or nothing to do. 
11123b0b41e6SRandall Stewart 	 */
11133b0b41e6SRandall Stewart 	hpts->p_prev_slot = hpts->p_cur_slot;
11143b0b41e6SRandall Stewart 	hpts->p_lasttick = hpts->p_curtick;
11153b0b41e6SRandall Stewart 	goto no_run;
11163b0b41e6SRandall Stewart 	}
11173ee9c3c4SRandall Stewart again:
11183b0b41e6SRandall Stewart 	hpts->p_wheel_complete = 0;
11193ee9c3c4SRandall Stewart 	HPTS_MTX_ASSERT(hpts);
1120d7955cc0SRandall Stewart 	slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
1121d7955cc0SRandall Stewart 	if (((hpts->p_curtick - hpts->p_lasttick) >
1122d7955cc0SRandall Stewart 	     ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
11233b0b41e6SRandall Stewart 	    (hpts->p_on_queue_cnt != 0)) {
11243b0b41e6SRandall Stewart 		/*
11253b0b41e6SRandall Stewart 		 * Wheel wrap is occurring, basically we
11263b0b41e6SRandall Stewart 		 * are behind and the distance between
11273b0b41e6SRandall Stewart 		 * runs has spread so much it has exceeded
11283b0b41e6SRandall Stewart 		 * the time on the wheel (1.024 seconds). This
11293b0b41e6SRandall Stewart 		 * is ugly and should NOT be happening. We
11303b0b41e6SRandall Stewart 		 * need to run the entire wheel. We last processed
11313b0b41e6SRandall Stewart 		 * p_prev_slot, so that needs to be the last slot
11323b0b41e6SRandall Stewart 		 * we run. The next slot after that should be our
11333b0b41e6SRandall Stewart 		 * reserved first slot for new, and then starts
11341f2aaef2SGordon Bergling 		 * the running position. Now the problem is the
11353b0b41e6SRandall Stewart 		 * reserved "not to yet" place does not exist
11363b0b41e6SRandall Stewart 		 * and there may be inp's in there that need
11373b0b41e6SRandall Stewart 		 * running. We can merge those into the
11383b0b41e6SRandall Stewart 		 * first slot at the head.
11393b0b41e6SRandall Stewart 		 */
11403b0b41e6SRandall Stewart 		wrap_loop_cnt++;
1141d7955cc0SRandall Stewart 		hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1);
1142d7955cc0SRandall Stewart 		hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2);
11433b0b41e6SRandall Stewart 		/*
11443b0b41e6SRandall Stewart 		 * Adjust p_cur_slot to be where we are starting from
11453b0b41e6SRandall Stewart 		 * hopefully we will catch up (fat chance if something
11463b0b41e6SRandall Stewart 		 * is broken this bad :( )
11473b0b41e6SRandall Stewart 		 */
11483b0b41e6SRandall Stewart 		hpts->p_cur_slot = hpts->p_prev_slot;
11493b0b41e6SRandall Stewart 		/*
11503b0b41e6SRandall Stewart 		 * The next slot has guys to run too, and that would
11513b0b41e6SRandall Stewart 		 * be where we would normally start, let's move them into
11523b0b41e6SRandall Stewart 		 * the next slot (p_prev_slot + 2) so that we will
11533b0b41e6SRandall Stewart 		 * run them, the extra 10 usec of lateness (from being
11543b0b41e6SRandall Stewart 		 * put behind) does not really matter in this situation.
11553b0b41e6SRandall Stewart 		 */
1156c2a69e84SGleb Smirnoff 		TAILQ_FOREACH(tp, &hpts->p_hptss[hpts->p_nxt_slot].head,
1157c2a69e84SGleb Smirnoff 		    t_hpts) {
1158c2a69e84SGleb Smirnoff 			MPASS(tp->t_hpts_slot == hpts->p_nxt_slot);
1159c2a69e84SGleb Smirnoff 			MPASS(tp->t_hpts_gencnt ==
1160db0ac6deSCy Schubert 			    hpts->p_hptss[hpts->p_nxt_slot].gencnt);
1161c2a69e84SGleb Smirnoff 			MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
1162db0ac6deSCy Schubert 
11633b0b41e6SRandall Stewart 			/*
1164db0ac6deSCy Schubert 			 * Update the gencnt and nextslot to match
1165db0ac6deSCy Schubert 			 * the new location. This is safe since it takes both
1166db0ac6deSCy Schubert 			 * the INP lock and the pacer mutex to change the
1167c2a69e84SGleb Smirnoff 			 * t_hptsslot and t_hpts_gencnt.
11683b0b41e6SRandall Stewart */ 1169c2a69e84SGleb Smirnoff tp->t_hpts_gencnt = 1170db0ac6deSCy Schubert hpts->p_hptss[hpts->p_runningslot].gencnt; 1171c2a69e84SGleb Smirnoff tp->t_hpts_slot = hpts->p_runningslot; 11723b0b41e6SRandall Stewart } 1173db0ac6deSCy Schubert TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head, 1174c2a69e84SGleb Smirnoff &hpts->p_hptss[hpts->p_nxt_slot].head, t_hpts); 1175db0ac6deSCy Schubert hpts->p_hptss[hpts->p_runningslot].count += 1176db0ac6deSCy Schubert hpts->p_hptss[hpts->p_nxt_slot].count; 1177db0ac6deSCy Schubert hpts->p_hptss[hpts->p_nxt_slot].count = 0; 1178db0ac6deSCy Schubert hpts->p_hptss[hpts->p_nxt_slot].gencnt++; 1179d7955cc0SRandall Stewart slots_to_run = NUM_OF_HPTSI_SLOTS - 1; 11803b0b41e6SRandall Stewart counter_u64_add(wheel_wrap, 1); 11813b0b41e6SRandall Stewart } else { 11823b0b41e6SRandall Stewart /* 1183d7955cc0SRandall Stewart * Nxt slot is always one after p_runningslot though 11843b0b41e6SRandall Stewart * its not used usually unless we are doing wheel wrap. 11853b0b41e6SRandall Stewart */ 11863b0b41e6SRandall Stewart hpts->p_nxt_slot = hpts->p_prev_slot; 1187d7955cc0SRandall Stewart hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1); 11883ee9c3c4SRandall Stewart } 11893ee9c3c4SRandall Stewart if (hpts->p_on_queue_cnt == 0) { 11903ee9c3c4SRandall Stewart goto no_one; 11913ee9c3c4SRandall Stewart } 1192d7955cc0SRandall Stewart for (i = 0; i < slots_to_run; i++) { 1193c2a69e84SGleb Smirnoff struct tcpcb *tp, *ntp; 1194c2a69e84SGleb Smirnoff TAILQ_HEAD(, tcpcb) head = TAILQ_HEAD_INITIALIZER(head); 1195db0ac6deSCy Schubert struct hptsh *hptsh; 1196aac52f94SRandall Stewart uint32_t runningslot; 1197db0ac6deSCy Schubert 11983ee9c3c4SRandall Stewart /* 11993ee9c3c4SRandall Stewart * Calculate our delay, if there are no extra ticks there 1200d7955cc0SRandall Stewart * was not any (i.e. if slots_to_run == 1, no delay). 12013ee9c3c4SRandall Stewart */ 1202db0ac6deSCy Schubert hpts->p_delayed_by = (slots_to_run - (i + 1)) * 1203db0ac6deSCy Schubert HPTS_TICKS_PER_SLOT; 1204db0ac6deSCy Schubert 1205db0ac6deSCy Schubert runningslot = hpts->p_runningslot; 1206db0ac6deSCy Schubert hptsh = &hpts->p_hptss[runningslot]; 1207c2a69e84SGleb Smirnoff TAILQ_SWAP(&head, &hptsh->head, tcpcb, t_hpts); 1208db0ac6deSCy Schubert hpts->p_on_queue_cnt -= hptsh->count; 1209db0ac6deSCy Schubert hptsh->count = 0; 1210aac52f94SRandall Stewart hptsh->gencnt++; 1211db0ac6deSCy Schubert 1212db0ac6deSCy Schubert HPTS_UNLOCK(hpts); 1213db0ac6deSCy Schubert 1214c2a69e84SGleb Smirnoff TAILQ_FOREACH_SAFE(tp, &head, t_hpts, ntp) { 1215c2a69e84SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 1216db0ac6deSCy Schubert bool set_cpu; 1217db0ac6deSCy Schubert 1218c2a69e84SGleb Smirnoff if (ntp != NULL) { 12193ee9c3c4SRandall Stewart /* 1220c2a69e84SGleb Smirnoff * If we have a next tcpcb, see if we can 1221e68b3792SGleb Smirnoff * prefetch it. Note this may seem 12223ee9c3c4SRandall Stewart * "risky" since we have no locks (other 12233ee9c3c4SRandall Stewart * than the previous inp) and there no 1224c2a69e84SGleb Smirnoff * assurance that ntp was not pulled while 1225c2a69e84SGleb Smirnoff * we were processing tp and freed. 
If this 1226d07a5018SGordon Bergling * occurred it could mean that either: 12273ee9c3c4SRandall Stewart * 12283ee9c3c4SRandall Stewart * a) Its NULL (which is fine we won't go 12293ee9c3c4SRandall Stewart * here) <or> b) Its valid (which is cool we 12303ee9c3c4SRandall Stewart * will prefetch it) <or> c) The inp got 12313ee9c3c4SRandall Stewart * freed back to the slab which was 12323ee9c3c4SRandall Stewart * reallocated. Then the piece of memory was 12333ee9c3c4SRandall Stewart * re-used and something else (not an 12343ee9c3c4SRandall Stewart * address) is in inp_ppcb. If that occurs 12353ee9c3c4SRandall Stewart * we don't crash, but take a TLB shootdown 12363ee9c3c4SRandall Stewart * performance hit (same as if it was NULL 12373ee9c3c4SRandall Stewart * and we tried to pre-fetch it). 12383ee9c3c4SRandall Stewart * 12393ee9c3c4SRandall Stewart * Considering that the likelyhood of <c> is 12403ee9c3c4SRandall Stewart * quite rare we will take a risk on doing 12413ee9c3c4SRandall Stewart * this. If performance drops after testing 12423ee9c3c4SRandall Stewart * we can always take this out. NB: the 12433ee9c3c4SRandall Stewart * kern_prefetch on amd64 actually has 12443ee9c3c4SRandall Stewart * protection against a bad address now via 12453ee9c3c4SRandall Stewart * the DMAP_() tests. This will prevent the 12463ee9c3c4SRandall Stewart * TLB hit, and instead if <c> occurs just 12473ee9c3c4SRandall Stewart * cause us to load cache with a useless 12483ee9c3c4SRandall Stewart * address (to us). 1249e68b3792SGleb Smirnoff * 1250c2a69e84SGleb Smirnoff * XXXGL: this comment and the prefetch action 1251c2a69e84SGleb Smirnoff * could be outdated after tp == inp change. 12523ee9c3c4SRandall Stewart */ 1253c2a69e84SGleb Smirnoff kern_prefetch(ntp, &prefetch_tp); 12543ee9c3c4SRandall Stewart prefetch_tp = 1; 12553ee9c3c4SRandall Stewart } 1256c2a69e84SGleb Smirnoff 1257c2a69e84SGleb Smirnoff /* For debugging */ 12585cb73dbeSGleb Smirnoff if (!seen_endpoint) { 12595cb73dbeSGleb Smirnoff seen_endpoint = true; 1260c2a69e84SGleb Smirnoff orig_exit_slot = slot_pos_of_endpoint = 1261c2a69e84SGleb Smirnoff runningslot; 12625cb73dbeSGleb Smirnoff } else if (!completed_measure) { 1263c2a69e84SGleb Smirnoff /* Record the new position */ 1264c2a69e84SGleb Smirnoff orig_exit_slot = runningslot; 1265c2a69e84SGleb Smirnoff } 1266c2a69e84SGleb Smirnoff 1267c2a69e84SGleb Smirnoff INP_WLOCK(inp); 1268c2a69e84SGleb Smirnoff if ((tp->t_flags2 & TF2_HPTS_CPU_SET) == 0) { 1269c2a69e84SGleb Smirnoff set_cpu = true; 1270c2a69e84SGleb Smirnoff } else { 1271c2a69e84SGleb Smirnoff set_cpu = false; 1272c2a69e84SGleb Smirnoff } 1273c2a69e84SGleb Smirnoff 1274c2a69e84SGleb Smirnoff if (__predict_false(tp->t_in_hpts == IHPTS_MOVING)) { 1275c2a69e84SGleb Smirnoff if (tp->t_hpts_slot == -1) { 1276c2a69e84SGleb Smirnoff tp->t_in_hpts = IHPTS_NONE; 1277c2a69e84SGleb Smirnoff if (in_pcbrele_wlocked(inp) == false) 1278c2a69e84SGleb Smirnoff INP_WUNLOCK(inp); 1279c2a69e84SGleb Smirnoff } else { 1280c2a69e84SGleb Smirnoff HPTS_LOCK(hpts); 1281c2a69e84SGleb Smirnoff tcp_hpts_insert_internal(tp, hpts); 1282c2a69e84SGleb Smirnoff HPTS_UNLOCK(hpts); 1283c2a69e84SGleb Smirnoff INP_WUNLOCK(inp); 1284c2a69e84SGleb Smirnoff } 1285c2a69e84SGleb Smirnoff continue; 1286c2a69e84SGleb Smirnoff } 1287c2a69e84SGleb Smirnoff 1288c2a69e84SGleb Smirnoff MPASS(tp->t_in_hpts == IHPTS_ONQUEUE); 1289c2a69e84SGleb Smirnoff MPASS(!(inp->inp_flags & INP_DROPPED)); 1290c2a69e84SGleb Smirnoff KASSERT(runningslot == tp->t_hpts_slot, 1291c2a69e84SGleb Smirnoff ("Hpts:%p 
inp:%p slot mis-aligned %u vs %u",
1292c2a69e84SGleb Smirnoff 			    hpts, inp, runningslot, tp->t_hpts_slot));
1293c2a69e84SGleb Smirnoff 
1294c2a69e84SGleb Smirnoff 			if (tp->t_hpts_request) {
1295c2a69e84SGleb Smirnoff 				/*
1296c2a69e84SGleb Smirnoff 				 * This guy is deferred out further in time
1297c2a69e84SGleb Smirnoff 				 * than our wheel had available on it.
1298c2a69e84SGleb Smirnoff 				 * Push him back on the wheel or run it
1299c2a69e84SGleb Smirnoff 				 * depending.
1300c2a69e84SGleb Smirnoff 				 */
1301c2a69e84SGleb Smirnoff 				uint32_t maxslots, last_slot, remaining_slots;
1302c2a69e84SGleb Smirnoff 
1303c2a69e84SGleb Smirnoff 				remaining_slots = slots_to_run - (i + 1);
1304c2a69e84SGleb Smirnoff 				if (tp->t_hpts_request > remaining_slots) {
1305c2a69e84SGleb Smirnoff 					HPTS_LOCK(hpts);
1306c2a69e84SGleb Smirnoff 					/*
1307c2a69e84SGleb Smirnoff 					 * How far out can we go?
1308c2a69e84SGleb Smirnoff 					 */
1309c2a69e84SGleb Smirnoff 					maxslots = max_slots_available(hpts,
1310c2a69e84SGleb Smirnoff 					    hpts->p_cur_slot, &last_slot);
1311c2a69e84SGleb Smirnoff 					if (maxslots >= tp->t_hpts_request) {
1312c2a69e84SGleb Smirnoff 						/* We can finally place it to
1313c2a69e84SGleb Smirnoff 						 * be processed. */
1314c2a69e84SGleb Smirnoff 						tp->t_hpts_slot = hpts_slot(
1315c2a69e84SGleb Smirnoff 						    hpts->p_runningslot,
1316c2a69e84SGleb Smirnoff 						    tp->t_hpts_request);
1317c2a69e84SGleb Smirnoff 						tp->t_hpts_request = 0;
1318c2a69e84SGleb Smirnoff 					} else {
1319c2a69e84SGleb Smirnoff 						/* Work off some more time */
1320c2a69e84SGleb Smirnoff 						tp->t_hpts_slot = last_slot;
1321c2a69e84SGleb Smirnoff 						tp->t_hpts_request -=
1322c2a69e84SGleb Smirnoff 						    maxslots;
1323c2a69e84SGleb Smirnoff 					}
1324c2a69e84SGleb Smirnoff 					tcp_hpts_insert_internal(tp, hpts);
1325c2a69e84SGleb Smirnoff 					HPTS_UNLOCK(hpts);
1326c2a69e84SGleb Smirnoff 					INP_WUNLOCK(inp);
1327c2a69e84SGleb Smirnoff 					continue;
1328c2a69e84SGleb Smirnoff 				}
1329c2a69e84SGleb Smirnoff 				tp->t_hpts_request = 0;
1330c2a69e84SGleb Smirnoff 				/* Fall through, we will do it now */
1331c2a69e84SGleb Smirnoff 			}
1332c2a69e84SGleb Smirnoff 
1333c2a69e84SGleb Smirnoff 			tcp_hpts_release(tp);
1334c2a69e84SGleb Smirnoff 			if (set_cpu) {
1335c2a69e84SGleb Smirnoff 				/*
1336c2a69e84SGleb Smirnoff 				 * Setup so the next time we will move to
1337c2a69e84SGleb Smirnoff 				 * the right CPU. This should be a rare
1338c2a69e84SGleb Smirnoff 				 * event. It will sometimes happen when we
1339c2a69e84SGleb Smirnoff 				 * are the client side (usually not the
1340c2a69e84SGleb Smirnoff 				 * server). Somehow tcp_output() gets called
1341c2a69e84SGleb Smirnoff 				 * before the tcp_do_segment() sets the
1342c2a69e84SGleb Smirnoff 				 * initial state. This means the r_cpu and
1343c2a69e84SGleb Smirnoff 				 * r_hpts_cpu are 0. We get on the hpts, and
1344c2a69e84SGleb Smirnoff 				 * then tcp_input() gets called setting up
1345c2a69e84SGleb Smirnoff 				 * the r_cpu to the correct value. The hpts
1346c2a69e84SGleb Smirnoff 				 * goes off and sees the mis-match. We
1347c2a69e84SGleb Smirnoff 				 * simply correct it here and the CPU will
1348c2a69e84SGleb Smirnoff 				 * switch to the new hpts next time the tcb
1349c2a69e84SGleb Smirnoff 				 * gets added to the hpts (not this one)
1350c2a69e84SGleb Smirnoff 				 * :-)
1351c2a69e84SGleb Smirnoff 				 */
1352c2a69e84SGleb Smirnoff 				tcp_set_hpts(tp);
1353c2a69e84SGleb Smirnoff 			}
1354c2a69e84SGleb Smirnoff 			CURVNET_SET(inp->inp_vnet);
1355c2a69e84SGleb Smirnoff 			/* Let's do any logging that we might want to */
1356c2a69e84SGleb Smirnoff 			if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
135763446fd3SGleb Smirnoff 				tcp_hpts_log(hpts, tp, &tv, slots_to_run, i,
135863446fd3SGleb Smirnoff 				    from_callout);
1359c2a69e84SGleb Smirnoff 			}
1360c2a69e84SGleb Smirnoff 
1361c2a69e84SGleb Smirnoff 			if (tp->t_fb_ptr != NULL) {
1362c2a69e84SGleb Smirnoff 				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
1363c2a69e84SGleb Smirnoff 				did_prefetch = 1;
1364c2a69e84SGleb Smirnoff 			}
1365c2a69e84SGleb Smirnoff 			/*
1366c2a69e84SGleb Smirnoff 			 * We set TF2_HPTS_CALLS before any possible output.
1367c2a69e84SGleb Smirnoff 			 * The contract with the transport is that if it cares
1368c2a69e84SGleb Smirnoff 			 * about hpts calling it should clear the flag. That
1369c2a69e84SGleb Smirnoff 			 * way next time it is called it will know it is hpts.
1370c2a69e84SGleb Smirnoff 			 *
1371c2a69e84SGleb Smirnoff 			 * We also only call tfb_do_queued_segments() <or>
1372c2a69e84SGleb Smirnoff 			 * tcp_output(). It is expected that if segments are
1373c2a69e84SGleb Smirnoff 			 * queued and come in that the final input mbuf will
137408c33cd9SGleb Smirnoff 			 * cause a call to output if it is needed so we do
137508c33cd9SGleb Smirnoff 			 * not need a second call to tcp_output(). So we do
137608c33cd9SGleb Smirnoff 			 * one or the other but not both.
1377*3604a050SGleb Smirnoff 			 *
1378*3604a050SGleb Smirnoff 			 * XXXGL: some KPI abuse here. tfb_do_queued_segments
1379*3604a050SGleb Smirnoff 			 * returns unlocked with positive error (always 1) and
1380*3604a050SGleb Smirnoff 			 * tcp_output returns unlocked with negative error.
1381c2a69e84SGleb Smirnoff 			 */
1382c2a69e84SGleb Smirnoff 			tp->t_flags2 |= TF2_HPTS_CALLS;
1383c3c20de3SGleb Smirnoff 			if ((tp->t_flags2 & TF2_SUPPORTS_MBUFQ) &&
1384*3604a050SGleb Smirnoff 			    !STAILQ_EMPTY(&tp->t_inqueue))
1385*3604a050SGleb Smirnoff 				error = -(*tp->t_fb->tfb_do_queued_segments)(tp,
1386*3604a050SGleb Smirnoff 				    0);
1387*3604a050SGleb Smirnoff 			else
1388c2a69e84SGleb Smirnoff 				error = tcp_output(tp);
1389*3604a050SGleb Smirnoff 			if (__predict_true(error >= 0))
13903ee9c3c4SRandall Stewart 				INP_WUNLOCK(inp);
13913ee9c3c4SRandall Stewart 			CURVNET_RESTORE();
13923ee9c3c4SRandall Stewart 		}
1393d7955cc0SRandall Stewart 		if (seen_endpoint) {
1394d7955cc0SRandall Stewart 			/*
1395d7955cc0SRandall Stewart 			 * We now have an accurate distance between
1396d7955cc0SRandall Stewart 			 * slot_pos_of_endpoint <-> orig_exit_slot
1397d7955cc0SRandall Stewart 			 * to tell us how late we were; orig_exit_slot
1398d7955cc0SRandall Stewart 			 * is where we calculated the end of our cycle to
1399d7955cc0SRandall Stewart 			 * be when we first entered.
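 *
 * (Illustrative numbers, not in the original: if the first endpoint
 * was handled at slot 100 and the last pass left orig_exit_slot at
 * 130, the function ends up returning
 * hpts_slots_diff(100, 130) == 30 slots of lateness, roughly 300
 * usec assuming the usual 10 usec HPTS_TICKS_PER_SLOT.)
 *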
1400d7955cc0SRandall Stewart 			 */
14015cb73dbeSGleb Smirnoff 			completed_measure = true;
1402d7955cc0SRandall Stewart 		}
1403db0ac6deSCy Schubert 		HPTS_LOCK(hpts);
1404d7955cc0SRandall Stewart 		hpts->p_runningslot++;
1405d7955cc0SRandall Stewart 		if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) {
1406d7955cc0SRandall Stewart 			hpts->p_runningslot = 0;
14073ee9c3c4SRandall Stewart 		}
14083ee9c3c4SRandall Stewart 	}
14093ee9c3c4SRandall Stewart no_one:
14103ee9c3c4SRandall Stewart 	HPTS_MTX_ASSERT(hpts);
14113ee9c3c4SRandall Stewart 	hpts->p_delayed_by = 0;
14123ee9c3c4SRandall Stewart 	/*
14133ee9c3c4SRandall Stewart 	 * Check to see if we took an excess amount of time and need to run
14143ee9c3c4SRandall Stewart 	 * more ticks (if we did not hit ENOBUFS).
14153ee9c3c4SRandall Stewart 	 */
14163b0b41e6SRandall Stewart 	hpts->p_prev_slot = hpts->p_cur_slot;
14173b0b41e6SRandall Stewart 	hpts->p_lasttick = hpts->p_curtick;
141863446fd3SGleb Smirnoff 	if (!from_callout || (loop_cnt > max_pacer_loops)) {
14193b0b41e6SRandall Stewart 		/*
14203b0b41e6SRandall Stewart 		 * Something is seriously slow: we have
14213b0b41e6SRandall Stewart 		 * looped through processing the wheel
14223b0b41e6SRandall Stewart 		 * and by the time we cleared what
14233b0b41e6SRandall Stewart 		 * needed to run max_pacer_loops times
14243b0b41e6SRandall Stewart 		 * we still needed to run. That means
14253b0b41e6SRandall Stewart 		 * the system is hopelessly behind and
14263b0b41e6SRandall Stewart 		 * can never catch up :(
14273b0b41e6SRandall Stewart 		 *
14283b0b41e6SRandall Stewart 		 * We will just lie to this thread
14293b0b41e6SRandall Stewart 		 * and let it think p_curtick is
14303b0b41e6SRandall Stewart 		 * correct. When it next awakens
14313b0b41e6SRandall Stewart 		 * it will find itself further behind.
14323b0b41e6SRandall Stewart 		 */
1433d7955cc0SRandall Stewart 		if (from_callout)
14343b0b41e6SRandall Stewart 			counter_u64_add(hpts_hopelessly_behind, 1);
14353b0b41e6SRandall Stewart 		goto no_run;
14363ee9c3c4SRandall Stewart 	}
14373b0b41e6SRandall Stewart 	hpts->p_curtick = tcp_gethptstick(&tv);
14383b0b41e6SRandall Stewart 	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
14395cb73dbeSGleb Smirnoff 	if (!seen_endpoint) {
1440d7955cc0SRandall Stewart 		/* We saw no endpoint but we may be looping */
1441d7955cc0SRandall Stewart 		orig_exit_slot = hpts->p_cur_slot;
1442d7955cc0SRandall Stewart 	}
14433b0b41e6SRandall Stewart 	if ((wrap_loop_cnt < 2) &&
14443b0b41e6SRandall Stewart 	    (hpts->p_lasttick != hpts->p_curtick)) {
14453b0b41e6SRandall Stewart 		counter_u64_add(hpts_loops, 1);
14463b0b41e6SRandall Stewart 		loop_cnt++;
14473b0b41e6SRandall Stewart 		goto again;
14483b0b41e6SRandall Stewart 	}
14493b0b41e6SRandall Stewart no_run:
1450175d4d69SGleb Smirnoff 	tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
14513b0b41e6SRandall Stewart 	/*
14523b0b41e6SRandall Stewart 	 * Set flag to tell that we are done for
14533b0b41e6SRandall Stewart 	 * any slot input that happens during
14543b0b41e6SRandall Stewart 	 * input.
14553b0b41e6SRandall Stewart 	 */
14563b0b41e6SRandall Stewart 	hpts->p_wheel_complete = 1;
14573b0b41e6SRandall Stewart 	/*
1458d7955cc0SRandall Stewart 	 * Now did we spend too long running input and need to run more ticks?
1459d7955cc0SRandall Stewart 	 * Note that if wrap_loop_cnt < 2 then we should have the conditions
1460d7955cc0SRandall Stewart 	 * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
1461db0ac6deSCy Schubert 	 * is greater than 2, then the conditions most likely are *not* true.
1462db0ac6deSCy Schubert * Also if we are called not from the callout, we don't run the wheel 1463db0ac6deSCy Schubert * multiple times so the slots may not align either. 14643b0b41e6SRandall Stewart */ 1465d7955cc0SRandall Stewart KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || 146663446fd3SGleb Smirnoff (wrap_loop_cnt >= 2) || !from_callout), 14673b0b41e6SRandall Stewart ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, 14683b0b41e6SRandall Stewart hpts->p_prev_slot, hpts->p_cur_slot)); 1469d7955cc0SRandall Stewart KASSERT(((hpts->p_lasttick == hpts->p_curtick) 147063446fd3SGleb Smirnoff || (wrap_loop_cnt >= 2) || !from_callout), 14713b0b41e6SRandall Stewart ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, 14723b0b41e6SRandall Stewart hpts->p_lasttick, hpts->p_curtick)); 1473d7955cc0SRandall Stewart if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { 14743b0b41e6SRandall Stewart hpts->p_curtick = tcp_gethptstick(&tv); 14753b0b41e6SRandall Stewart counter_u64_add(hpts_loops, 1); 14763b0b41e6SRandall Stewart hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); 14773ee9c3c4SRandall Stewart goto again; 14783ee9c3c4SRandall Stewart } 1479db0ac6deSCy Schubert 1480d7955cc0SRandall Stewart if (from_callout) { 1481d7955cc0SRandall Stewart tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt); 14823ee9c3c4SRandall Stewart } 1483d7955cc0SRandall Stewart if (seen_endpoint) 1484d7955cc0SRandall Stewart return(hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot)); 1485d7955cc0SRandall Stewart else 1486d7955cc0SRandall Stewart return (0); 14873ee9c3c4SRandall Stewart } 14883ee9c3c4SRandall Stewart 14893ee9c3c4SRandall Stewart void 1490c2a69e84SGleb Smirnoff __tcp_set_hpts(struct tcpcb *tp, int32_t line) 14913ee9c3c4SRandall Stewart { 14923ee9c3c4SRandall Stewart struct tcp_hpts_entry *hpts; 1493d7955cc0SRandall Stewart int failed; 14943ee9c3c4SRandall Stewart 1495c2a69e84SGleb Smirnoff INP_WLOCK_ASSERT(tptoinpcb(tp)); 1496c2a69e84SGleb Smirnoff 1497c2a69e84SGleb Smirnoff hpts = tcp_hpts_lock(tp); 1498c2a69e84SGleb Smirnoff if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { 1499c2a69e84SGleb Smirnoff tp->t_hpts_cpu = hpts_cpuid(tp, &failed); 1500d7955cc0SRandall Stewart if (failed == 0) 1501c2a69e84SGleb Smirnoff tp->t_flags2 |= TF2_HPTS_CPU_SET; 15023ee9c3c4SRandall Stewart } 1503b2bde8a6SGleb Smirnoff HPTS_UNLOCK(hpts); 15043ee9c3c4SRandall Stewart } 15053ee9c3c4SRandall Stewart 15062c6fc36aSGleb Smirnoff static struct tcp_hpts_entry * 15072c6fc36aSGleb Smirnoff tcp_choose_hpts_to_run(void) 1508d7955cc0SRandall Stewart { 15092c6fc36aSGleb Smirnoff int i, oldest_idx, start, end; 15102c6fc36aSGleb Smirnoff uint32_t cts, time_since_ran, calc; 15112c6fc36aSGleb Smirnoff 15122c6fc36aSGleb Smirnoff cts = tcp_get_usecs(NULL); 15132c6fc36aSGleb Smirnoff time_since_ran = 0; 15142c6fc36aSGleb Smirnoff /* Default is all one group */ 15152c6fc36aSGleb Smirnoff start = 0; 15162c6fc36aSGleb Smirnoff end = tcp_pace.rp_num_hptss; 15172c6fc36aSGleb Smirnoff /* 15182c6fc36aSGleb Smirnoff * If we have more than one L3 group figure out which one 15192c6fc36aSGleb Smirnoff * this CPU is in. 
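 *
 * (Illustrative sketch, not in the original: once start/end bracket
 * this CPU's L3 group, the loop below picks the pacer that has gone
 * longest without running.  With cts == 1000 and hypothetical
 * cts_last_ran values of { 995, 990, 998 } the deltas are
 * { 5, 10, 2 }, so index 1 is chosen; if no delta is positive we
 * fall back to curcpu % rp_num_hptss.)
 *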
15202c6fc36aSGleb Smirnoff */ 15212c6fc36aSGleb Smirnoff if (tcp_pace.grp_cnt > 1) { 15222c6fc36aSGleb Smirnoff for (i = 0; i < tcp_pace.grp_cnt; i++) { 15232c6fc36aSGleb Smirnoff if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) { 15242c6fc36aSGleb Smirnoff start = tcp_pace.grps[i]->cg_first; 15252c6fc36aSGleb Smirnoff end = (tcp_pace.grps[i]->cg_last + 1); 15262c6fc36aSGleb Smirnoff break; 15272c6fc36aSGleb Smirnoff } 15282c6fc36aSGleb Smirnoff } 15292c6fc36aSGleb Smirnoff } 15302c6fc36aSGleb Smirnoff oldest_idx = -1; 15312c6fc36aSGleb Smirnoff for (i = start; i < end; i++) { 1532175d4d69SGleb Smirnoff if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i])) 1533175d4d69SGleb Smirnoff calc = cts - tcp_pace.cts_last_ran[i]; 15342c6fc36aSGleb Smirnoff else 15352c6fc36aSGleb Smirnoff calc = 0; 15362c6fc36aSGleb Smirnoff if (calc > time_since_ran) { 15372c6fc36aSGleb Smirnoff oldest_idx = i; 15382c6fc36aSGleb Smirnoff time_since_ran = calc; 15392c6fc36aSGleb Smirnoff } 15402c6fc36aSGleb Smirnoff } 15412c6fc36aSGleb Smirnoff if (oldest_idx >= 0) 15422c6fc36aSGleb Smirnoff return(tcp_pace.rp_ent[oldest_idx]); 15432c6fc36aSGleb Smirnoff else 15442c6fc36aSGleb Smirnoff return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]); 15452c6fc36aSGleb Smirnoff } 15462c6fc36aSGleb Smirnoff 15472c6fc36aSGleb Smirnoff static void 15482c6fc36aSGleb Smirnoff __tcp_run_hpts(void) 15492c6fc36aSGleb Smirnoff { 15502c6fc36aSGleb Smirnoff struct epoch_tracker et; 15512c6fc36aSGleb Smirnoff struct tcp_hpts_entry *hpts; 1552d7955cc0SRandall Stewart int ticks_ran; 1553d7955cc0SRandall Stewart 15542c6fc36aSGleb Smirnoff hpts = tcp_choose_hpts_to_run(); 15552c6fc36aSGleb Smirnoff 1556d7955cc0SRandall Stewart if (hpts->p_hpts_active) { 1557d7955cc0SRandall Stewart /* Already active */ 1558d7955cc0SRandall Stewart return; 1559d7955cc0SRandall Stewart } 1560b2bde8a6SGleb Smirnoff if (!HPTS_TRYLOCK(hpts)) { 1561d7955cc0SRandall Stewart /* Someone else got the lock */ 1562d7955cc0SRandall Stewart return; 1563d7955cc0SRandall Stewart } 15642c6fc36aSGleb Smirnoff NET_EPOCH_ENTER(et); 1565d7955cc0SRandall Stewart if (hpts->p_hpts_active) 1566d7955cc0SRandall Stewart goto out_with_mtx; 1567d7955cc0SRandall Stewart hpts->syscall_cnt++; 1568d7955cc0SRandall Stewart counter_u64_add(hpts_direct_call, 1); 1569d7955cc0SRandall Stewart hpts->p_hpts_active = 1; 157063446fd3SGleb Smirnoff ticks_ran = tcp_hptsi(hpts, false); 1571d7955cc0SRandall Stewart /* We may want to adjust the sleep values here */ 1572d7955cc0SRandall Stewart if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { 1573d7955cc0SRandall Stewart if (ticks_ran > ticks_indicate_less_sleep) { 1574d7955cc0SRandall Stewart struct timeval tv; 1575d7955cc0SRandall Stewart sbintime_t sb; 1576d7955cc0SRandall Stewart 1577d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec /= 2; 1578d7955cc0SRandall Stewart if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) 1579d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec = dynamic_min_sleep; 1580d7955cc0SRandall Stewart /* Reschedule with new to value */ 1581d7955cc0SRandall Stewart tcp_hpts_set_max_sleep(hpts, 0); 15826a79e480SRandall Stewart tv.tv_sec = 0; 1583d7955cc0SRandall Stewart tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; 1584d7955cc0SRandall Stewart /* Validate its in the right ranges */ 1585d7955cc0SRandall Stewart if (tv.tv_usec < hpts->p_mysleep.tv_usec) { 1586d7955cc0SRandall Stewart hpts->overidden_sleep = tv.tv_usec; 1587d7955cc0SRandall Stewart tv.tv_usec = hpts->p_mysleep.tv_usec; 1588d7955cc0SRandall Stewart } else if (tv.tv_usec 
> dynamic_max_sleep) { 1589d7955cc0SRandall Stewart /* Lets not let sleep get above this value */ 1590d7955cc0SRandall Stewart hpts->overidden_sleep = tv.tv_usec; 1591d7955cc0SRandall Stewart tv.tv_usec = dynamic_max_sleep; 1592d7955cc0SRandall Stewart } 1593d7955cc0SRandall Stewart /* 1594d7955cc0SRandall Stewart * In this mode the timer is a backstop to 1595d7955cc0SRandall Stewart * all the userret/lro_flushes so we use 1596d7955cc0SRandall Stewart * the dynamic value and set the on_min_sleep 1597d7955cc0SRandall Stewart * flag so we will not be awoken. 1598d7955cc0SRandall Stewart */ 1599d7955cc0SRandall Stewart sb = tvtosbt(tv); 1600d7955cc0SRandall Stewart /* Store off to make visible the actual sleep time */ 1601d7955cc0SRandall Stewart hpts->sleeping = tv.tv_usec; 1602d7955cc0SRandall Stewart callout_reset_sbt_on(&hpts->co, sb, 0, 16036e6439b2SRandall Stewart hpts_timeout_swi, hpts, hpts->p_cpu, 1604d7955cc0SRandall Stewart (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); 1605d7955cc0SRandall Stewart } else if (ticks_ran < ticks_indicate_more_sleep) { 1606d7955cc0SRandall Stewart /* For the further sleep, don't reschedule hpts */ 1607d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec *= 2; 1608d7955cc0SRandall Stewart if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) 1609d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec = dynamic_max_sleep; 1610d7955cc0SRandall Stewart } 1611d7955cc0SRandall Stewart hpts->p_on_min_sleep = 1; 1612d7955cc0SRandall Stewart } 1613d7955cc0SRandall Stewart hpts->p_hpts_active = 0; 1614d7955cc0SRandall Stewart out_with_mtx: 1615b2bde8a6SGleb Smirnoff HPTS_UNLOCK(hpts); 1616d7955cc0SRandall Stewart NET_EPOCH_EXIT(et); 1617d7955cc0SRandall Stewart } 1618d7955cc0SRandall Stewart 1619d7955cc0SRandall Stewart static void 16203ee9c3c4SRandall Stewart tcp_hpts_thread(void *ctx) 16213ee9c3c4SRandall Stewart { 16223ee9c3c4SRandall Stewart struct tcp_hpts_entry *hpts; 162343e8b279SGleb Smirnoff struct epoch_tracker et; 16243ee9c3c4SRandall Stewart struct timeval tv; 16253ee9c3c4SRandall Stewart sbintime_t sb; 16266e6439b2SRandall Stewart int ticks_ran; 16273ee9c3c4SRandall Stewart 16283ee9c3c4SRandall Stewart hpts = (struct tcp_hpts_entry *)ctx; 1629b2bde8a6SGleb Smirnoff HPTS_LOCK(hpts); 16303ee9c3c4SRandall Stewart if (hpts->p_direct_wake) { 1631d7955cc0SRandall Stewart /* Signaled by input or output with low occupancy count. */ 16323ee9c3c4SRandall Stewart callout_stop(&hpts->co); 1633d7955cc0SRandall Stewart counter_u64_add(hpts_direct_awakening, 1); 16343ee9c3c4SRandall Stewart } else { 1635d7955cc0SRandall Stewart /* Timed out, the normal case. */ 1636d7955cc0SRandall Stewart counter_u64_add(hpts_wake_timeout, 1); 16373ee9c3c4SRandall Stewart if (callout_pending(&hpts->co) || 16383ee9c3c4SRandall Stewart !callout_active(&hpts->co)) { 1639b2bde8a6SGleb Smirnoff HPTS_UNLOCK(hpts); 16403ee9c3c4SRandall Stewart return; 16413ee9c3c4SRandall Stewart } 16423ee9c3c4SRandall Stewart } 1643d7955cc0SRandall Stewart callout_deactivate(&hpts->co); 16443b0b41e6SRandall Stewart hpts->p_hpts_wake_scheduled = 0; 164543e8b279SGleb Smirnoff NET_EPOCH_ENTER(et); 1646d7955cc0SRandall Stewart if (hpts->p_hpts_active) { 1647d7955cc0SRandall Stewart /* 1648d7955cc0SRandall Stewart * We are active already. This means that a syscall 1649d7955cc0SRandall Stewart * trap or LRO is running in behalf of hpts. 
In that case 1650d7955cc0SRandall Stewart * we need to double our timeout since there seems to be 1651d7955cc0SRandall Stewart * enough activity in the system that we don't need to 1652d7955cc0SRandall Stewart * run as often (if we were not directly woken). 1653d7955cc0SRandall Stewart */ 1654aaaa01c0SMichael Tuexen tv.tv_sec = 0; 1655d7955cc0SRandall Stewart if (hpts->p_direct_wake == 0) { 1656d7955cc0SRandall Stewart counter_u64_add(hpts_back_tosleep, 1); 1657d7955cc0SRandall Stewart if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { 1658d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec *= 2; 1659d7955cc0SRandall Stewart if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) 1660d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec = dynamic_max_sleep; 1661d7955cc0SRandall Stewart tv.tv_usec = hpts->p_mysleep.tv_usec; 1662d7955cc0SRandall Stewart hpts->p_on_min_sleep = 1; 1663d7955cc0SRandall Stewart } else { 1664d7955cc0SRandall Stewart /* 1665d7955cc0SRandall Stewart * Here we have low count on the wheel, but 1666d7955cc0SRandall Stewart * somehow we still collided with one of the 1667d7955cc0SRandall Stewart * connections. Lets go back to sleep for a 1668d7955cc0SRandall Stewart * min sleep time, but clear the flag so we 1669d7955cc0SRandall Stewart * can be awoken by insert. 1670d7955cc0SRandall Stewart */ 1671d7955cc0SRandall Stewart hpts->p_on_min_sleep = 0; 1672d7955cc0SRandall Stewart tv.tv_usec = tcp_min_hptsi_time; 1673d7955cc0SRandall Stewart } 1674d7955cc0SRandall Stewart } else { 1675d7955cc0SRandall Stewart /* 1676d7955cc0SRandall Stewart * Directly woken most likely to reset the 1677d7955cc0SRandall Stewart * callout time. 1678d7955cc0SRandall Stewart */ 1679d7955cc0SRandall Stewart tv.tv_usec = hpts->p_mysleep.tv_usec; 1680d7955cc0SRandall Stewart } 1681d7955cc0SRandall Stewart goto back_to_sleep; 1682d7955cc0SRandall Stewart } 1683d7955cc0SRandall Stewart hpts->sleeping = 0; 1684d7955cc0SRandall Stewart hpts->p_hpts_active = 1; 168563446fd3SGleb Smirnoff ticks_ran = tcp_hptsi(hpts, true); 1686d7955cc0SRandall Stewart tv.tv_sec = 0; 1687d7955cc0SRandall Stewart tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; 1688b7b78c1cSRandall Stewart if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { 1689b7b78c1cSRandall Stewart hpts->hit_callout_thresh = 1; 1690b7b78c1cSRandall Stewart atomic_add_int(&hpts_that_need_softclock, 1); 1691b7b78c1cSRandall Stewart } else if ((hpts->p_on_queue_cnt <= conn_cnt_thresh) && (hpts->hit_callout_thresh == 1)) { 1692b7b78c1cSRandall Stewart hpts->hit_callout_thresh = 0; 1693b7b78c1cSRandall Stewart atomic_subtract_int(&hpts_that_need_softclock, 1); 1694b7b78c1cSRandall Stewart } 1695d7955cc0SRandall Stewart if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { 1696d7955cc0SRandall Stewart if(hpts->p_direct_wake == 0) { 1697d7955cc0SRandall Stewart /* 1698d7955cc0SRandall Stewart * Only adjust sleep time if we were 1699d7955cc0SRandall Stewart * called from the callout i.e. direct_wake == 0. 
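 *
 * (Worked example, not in the original: if p_mysleep.tv_usec is 250
 * and this pass ran fewer than ticks_indicate_more_sleep slots, it
 * is doubled to 500, capped at dynamic_max_sleep; if it ran more
 * than ticks_indicate_less_sleep slots it is halved to 125, floored
 * at dynamic_min_sleep.)
 *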
1700d7955cc0SRandall Stewart */ 1701d7955cc0SRandall Stewart if (ticks_ran < ticks_indicate_more_sleep) { 1702d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec *= 2; 1703d7955cc0SRandall Stewart if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) 1704d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec = dynamic_max_sleep; 1705d7955cc0SRandall Stewart } else if (ticks_ran > ticks_indicate_less_sleep) { 1706d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec /= 2; 1707d7955cc0SRandall Stewart if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) 1708d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec = dynamic_min_sleep; 1709d7955cc0SRandall Stewart } 1710d7955cc0SRandall Stewart } 1711d7955cc0SRandall Stewart if (tv.tv_usec < hpts->p_mysleep.tv_usec) { 1712d7955cc0SRandall Stewart hpts->overidden_sleep = tv.tv_usec; 1713d7955cc0SRandall Stewart tv.tv_usec = hpts->p_mysleep.tv_usec; 1714d7955cc0SRandall Stewart } else if (tv.tv_usec > dynamic_max_sleep) { 1715d7955cc0SRandall Stewart /* Lets not let sleep get above this value */ 1716d7955cc0SRandall Stewart hpts->overidden_sleep = tv.tv_usec; 1717d7955cc0SRandall Stewart tv.tv_usec = dynamic_max_sleep; 1718d7955cc0SRandall Stewart } 1719d7955cc0SRandall Stewart /* 1720d7955cc0SRandall Stewart * In this mode the timer is a backstop to 1721d7955cc0SRandall Stewart * all the userret/lro_flushes so we use 1722d7955cc0SRandall Stewart * the dynamic value and set the on_min_sleep 1723d7955cc0SRandall Stewart * flag so we will not be awoken. 1724d7955cc0SRandall Stewart */ 1725d7955cc0SRandall Stewart hpts->p_on_min_sleep = 1; 1726d7955cc0SRandall Stewart } else if (hpts->p_on_queue_cnt == 0) { 1727d7955cc0SRandall Stewart /* 1728d7955cc0SRandall Stewart * No one on the wheel, please wake us up 1729d7955cc0SRandall Stewart * if you insert on the wheel. 1730d7955cc0SRandall Stewart */ 1731d7955cc0SRandall Stewart hpts->p_on_min_sleep = 0; 1732d7955cc0SRandall Stewart hpts->overidden_sleep = 0; 1733d7955cc0SRandall Stewart } else { 1734d7955cc0SRandall Stewart /* 1735d7955cc0SRandall Stewart * We hit here when we have a low number of 1736d7955cc0SRandall Stewart * clients on the wheel (our else clause). 1737d7955cc0SRandall Stewart * We may need to go on min sleep, if we set 1738d7955cc0SRandall Stewart * the flag we will not be awoken if someone 1739d7955cc0SRandall Stewart * is inserted ahead of us. Clearing the flag 1740d7955cc0SRandall Stewart * means we can be awoken. This is "old mode" 1741d7955cc0SRandall Stewart * where the timer is what runs hpts mainly. 1742d7955cc0SRandall Stewart */ 1743d7955cc0SRandall Stewart if (tv.tv_usec < tcp_min_hptsi_time) { 1744d7955cc0SRandall Stewart /* 1745d7955cc0SRandall Stewart * Yes on min sleep, which means 1746d7955cc0SRandall Stewart * we cannot be awoken. 
1747d7955cc0SRandall Stewart */ 17483b0b41e6SRandall Stewart hpts->overidden_sleep = tv.tv_usec; 17493ee9c3c4SRandall Stewart tv.tv_usec = tcp_min_hptsi_time; 17503ee9c3c4SRandall Stewart hpts->p_on_min_sleep = 1; 17513ee9c3c4SRandall Stewart } else { 17523ee9c3c4SRandall Stewart /* Clear the min sleep flag */ 17533b0b41e6SRandall Stewart hpts->overidden_sleep = 0; 17543ee9c3c4SRandall Stewart hpts->p_on_min_sleep = 0; 17553ee9c3c4SRandall Stewart } 17563ee9c3c4SRandall Stewart } 1757d7955cc0SRandall Stewart HPTS_MTX_ASSERT(hpts); 1758d7955cc0SRandall Stewart hpts->p_hpts_active = 0; 1759d7955cc0SRandall Stewart back_to_sleep: 17603ee9c3c4SRandall Stewart hpts->p_direct_wake = 0; 1761d7955cc0SRandall Stewart sb = tvtosbt(tv); 1762d7955cc0SRandall Stewart /* Store off to make visible the actual sleep time */ 1763d7955cc0SRandall Stewart hpts->sleeping = tv.tv_usec; 1764d7955cc0SRandall Stewart callout_reset_sbt_on(&hpts->co, sb, 0, 17656e6439b2SRandall Stewart hpts_timeout_swi, hpts, hpts->p_cpu, 1766d7955cc0SRandall Stewart (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); 1767d7955cc0SRandall Stewart NET_EPOCH_EXIT(et); 1768b2bde8a6SGleb Smirnoff HPTS_UNLOCK(hpts); 17693ee9c3c4SRandall Stewart } 17703ee9c3c4SRandall Stewart 17713ee9c3c4SRandall Stewart #undef timersub 17723ee9c3c4SRandall Stewart 17736e6439b2SRandall Stewart static int32_t 17746e6439b2SRandall Stewart hpts_count_level(struct cpu_group *cg) 17756e6439b2SRandall Stewart { 17766e6439b2SRandall Stewart int32_t count_l3, i; 17776e6439b2SRandall Stewart 17786e6439b2SRandall Stewart count_l3 = 0; 17796e6439b2SRandall Stewart if (cg->cg_level == CG_SHARE_L3) 17806e6439b2SRandall Stewart count_l3++; 17816e6439b2SRandall Stewart /* Walk all the children looking for L3 */ 17826e6439b2SRandall Stewart for (i = 0; i < cg->cg_children; i++) { 17836e6439b2SRandall Stewart count_l3 += hpts_count_level(&cg->cg_child[i]); 17846e6439b2SRandall Stewart } 17856e6439b2SRandall Stewart return (count_l3); 17866e6439b2SRandall Stewart } 17876e6439b2SRandall Stewart 17886e6439b2SRandall Stewart static void 17896e6439b2SRandall Stewart hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_group *cg) 17906e6439b2SRandall Stewart { 17916e6439b2SRandall Stewart int32_t idx, i; 17926e6439b2SRandall Stewart 17936e6439b2SRandall Stewart idx = *at; 17946e6439b2SRandall Stewart if (cg->cg_level == CG_SHARE_L3) { 17956e6439b2SRandall Stewart grps[idx] = cg; 17966e6439b2SRandall Stewart idx++; 17976e6439b2SRandall Stewart if (idx == max) { 17986e6439b2SRandall Stewart *at = idx; 17996e6439b2SRandall Stewart return; 18006e6439b2SRandall Stewart } 18016e6439b2SRandall Stewart } 18026e6439b2SRandall Stewart *at = idx; 18036e6439b2SRandall Stewart /* Walk all the children looking for L3 */ 18046e6439b2SRandall Stewart for (i = 0; i < cg->cg_children; i++) { 18056e6439b2SRandall Stewart hpts_gather_grps(grps, at, max, &cg->cg_child[i]); 18066e6439b2SRandall Stewart } 18076e6439b2SRandall Stewart } 18086e6439b2SRandall Stewart 18093ee9c3c4SRandall Stewart static void 181048b55a7cSGleb Smirnoff tcp_hpts_mod_load(void) 18113ee9c3c4SRandall Stewart { 18126e6439b2SRandall Stewart struct cpu_group *cpu_top; 18136e6439b2SRandall Stewart int32_t error __diagused; 18146e6439b2SRandall Stewart int32_t i, j, bound = 0, created = 0; 18153ee9c3c4SRandall Stewart size_t sz, asz; 18163ee9c3c4SRandall Stewart struct timeval tv; 18173ee9c3c4SRandall Stewart sbintime_t sb; 18183ee9c3c4SRandall Stewart struct tcp_hpts_entry *hpts; 18194e255d74SAndrew Gallatin 
struct pcpu *pc; 18203ee9c3c4SRandall Stewart char unit[16]; 18213ee9c3c4SRandall Stewart uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU; 18226e6439b2SRandall Stewart int count, domain; 18233ee9c3c4SRandall Stewart 18246e6439b2SRandall Stewart #ifdef SMP 18256e6439b2SRandall Stewart cpu_top = smp_topo(); 18266e6439b2SRandall Stewart #else 18276e6439b2SRandall Stewart cpu_top = NULL; 18286e6439b2SRandall Stewart #endif 18293ee9c3c4SRandall Stewart tcp_pace.rp_num_hptss = ncpus; 18303b0b41e6SRandall Stewart hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); 18313ee9c3c4SRandall Stewart hpts_loops = counter_u64_alloc(M_WAITOK); 18323ee9c3c4SRandall Stewart back_tosleep = counter_u64_alloc(M_WAITOK); 18333b0b41e6SRandall Stewart combined_wheel_wrap = counter_u64_alloc(M_WAITOK); 18343b0b41e6SRandall Stewart wheel_wrap = counter_u64_alloc(M_WAITOK); 1835d7955cc0SRandall Stewart hpts_wake_timeout = counter_u64_alloc(M_WAITOK); 1836d7955cc0SRandall Stewart hpts_direct_awakening = counter_u64_alloc(M_WAITOK); 1837d7955cc0SRandall Stewart hpts_back_tosleep = counter_u64_alloc(M_WAITOK); 1838d7955cc0SRandall Stewart hpts_direct_call = counter_u64_alloc(M_WAITOK); 1839d7955cc0SRandall Stewart cpu_uses_flowid = counter_u64_alloc(M_WAITOK); 1840d7955cc0SRandall Stewart cpu_uses_random = counter_u64_alloc(M_WAITOK); 1841d7955cc0SRandall Stewart 18423ee9c3c4SRandall Stewart sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); 18433ee9c3c4SRandall Stewart tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); 1844d7955cc0SRandall Stewart sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss); 1845175d4d69SGleb Smirnoff tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); 18466e6439b2SRandall Stewart tcp_pace.grp_cnt = 0; 18476e6439b2SRandall Stewart if (cpu_top == NULL) { 18486e6439b2SRandall Stewart tcp_pace.grp_cnt = 1; 18496e6439b2SRandall Stewart } else { 18506e6439b2SRandall Stewart /* Find out how many cache level 3 domains we have */ 18516e6439b2SRandall Stewart count = 0; 18526e6439b2SRandall Stewart tcp_pace.grp_cnt = hpts_count_level(cpu_top); 18536e6439b2SRandall Stewart if (tcp_pace.grp_cnt == 0) { 18546e6439b2SRandall Stewart tcp_pace.grp_cnt = 1; 18556e6439b2SRandall Stewart } 18566e6439b2SRandall Stewart sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *)); 18576e6439b2SRandall Stewart tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK); 18586e6439b2SRandall Stewart /* Now populate the groups */ 18596e6439b2SRandall Stewart if (tcp_pace.grp_cnt == 1) { 18606e6439b2SRandall Stewart /* 18616e6439b2SRandall Stewart * All we need is the top level all cpu's are in 18626e6439b2SRandall Stewart * the same cache so when we use grp[0]->cg_mask 18636e6439b2SRandall Stewart * with the cg_first <-> cg_last it will include 18646e6439b2SRandall Stewart * all cpu's in it. The level here is probably 18656e6439b2SRandall Stewart * zero which is ok. 18666e6439b2SRandall Stewart */ 18676e6439b2SRandall Stewart tcp_pace.grps[0] = cpu_top; 18686e6439b2SRandall Stewart } else { 18696e6439b2SRandall Stewart /* 18706e6439b2SRandall Stewart * Here we must find all the level three cache domains 18716e6439b2SRandall Stewart * and setup our pointers to them. 
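 *
 * (Illustrative example, not in the original: on a hypothetical
 * two-socket machine with one shared L3 cache per socket,
 * hpts_count_level() found two CG_SHARE_L3 groups above, and the
 * hpts_gather_grps() call below fills grps[0] and grps[1] with
 * those two cpu groups.)
 *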
18726e6439b2SRandall Stewart */ 18736e6439b2SRandall Stewart count = 0; 18746e6439b2SRandall Stewart hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top); 18756e6439b2SRandall Stewart } 18766e6439b2SRandall Stewart } 18773ee9c3c4SRandall Stewart asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; 18783ee9c3c4SRandall Stewart for (i = 0; i < tcp_pace.rp_num_hptss; i++) { 18793ee9c3c4SRandall Stewart tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), 18803ee9c3c4SRandall Stewart M_TCPHPTS, M_WAITOK | M_ZERO); 18816e6439b2SRandall Stewart tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); 18823ee9c3c4SRandall Stewart hpts = tcp_pace.rp_ent[i]; 18833ee9c3c4SRandall Stewart /* 18843ee9c3c4SRandall Stewart * Init all the hpts structures that are not specifically 18853ee9c3c4SRandall Stewart * zero'd by the allocations. Also lets attach them to the 18863ee9c3c4SRandall Stewart * appropriate sysctl block as well. 18873ee9c3c4SRandall Stewart */ 18883ee9c3c4SRandall Stewart mtx_init(&hpts->p_mtx, "tcp_hpts_lck", 18893ee9c3c4SRandall Stewart "hpts", MTX_DEF | MTX_DUPOK); 18903ee9c3c4SRandall Stewart for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { 1891db0ac6deSCy Schubert TAILQ_INIT(&hpts->p_hptss[j].head); 1892db0ac6deSCy Schubert hpts->p_hptss[j].count = 0; 1893db0ac6deSCy Schubert hpts->p_hptss[j].gencnt = 0; 18943ee9c3c4SRandall Stewart } 18953ee9c3c4SRandall Stewart sysctl_ctx_init(&hpts->hpts_ctx); 18963ee9c3c4SRandall Stewart sprintf(unit, "%d", i); 18973ee9c3c4SRandall Stewart hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, 18983ee9c3c4SRandall Stewart SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), 18993ee9c3c4SRandall Stewart OID_AUTO, 19003ee9c3c4SRandall Stewart unit, 19017029da5cSPawel Biernacki CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 19023ee9c3c4SRandall Stewart ""); 19033ee9c3c4SRandall Stewart SYSCTL_ADD_INT(&hpts->hpts_ctx, 19043ee9c3c4SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 19053ee9c3c4SRandall Stewart OID_AUTO, "out_qcnt", CTLFLAG_RD, 19063ee9c3c4SRandall Stewart &hpts->p_on_queue_cnt, 0, 19073ee9c3c4SRandall Stewart "Count TCB's awaiting output processing"); 19083b0b41e6SRandall Stewart SYSCTL_ADD_U16(&hpts->hpts_ctx, 19093ee9c3c4SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 19103ee9c3c4SRandall Stewart OID_AUTO, "active", CTLFLAG_RD, 19113ee9c3c4SRandall Stewart &hpts->p_hpts_active, 0, 19123ee9c3c4SRandall Stewart "Is the hpts active"); 19133ee9c3c4SRandall Stewart SYSCTL_ADD_UINT(&hpts->hpts_ctx, 19143ee9c3c4SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 19153ee9c3c4SRandall Stewart OID_AUTO, "curslot", CTLFLAG_RD, 19163ee9c3c4SRandall Stewart &hpts->p_cur_slot, 0, 19173b0b41e6SRandall Stewart "What the current running pacers goal"); 19183b0b41e6SRandall Stewart SYSCTL_ADD_UINT(&hpts->hpts_ctx, 19193b0b41e6SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 19203b0b41e6SRandall Stewart OID_AUTO, "runtick", CTLFLAG_RD, 1921d7955cc0SRandall Stewart &hpts->p_runningslot, 0, 19223b0b41e6SRandall Stewart "What the running pacers current slot is"); 19233ee9c3c4SRandall Stewart SYSCTL_ADD_UINT(&hpts->hpts_ctx, 19243ee9c3c4SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 19253ee9c3c4SRandall Stewart OID_AUTO, "curtick", CTLFLAG_RD, 19263ee9c3c4SRandall Stewart &hpts->p_curtick, 0, 19273b0b41e6SRandall Stewart "What the running pacers last tick mapped to the wheel was"); 1928d7955cc0SRandall Stewart SYSCTL_ADD_UINT(&hpts->hpts_ctx, 1929d7955cc0SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 1930d7955cc0SRandall Stewart OID_AUTO, "lastran", 
CTLFLAG_RD, 1931175d4d69SGleb Smirnoff &tcp_pace.cts_last_ran[i], 0, 1932d7955cc0SRandall Stewart "The last usec tick that this hpts ran"); 19337312e4e5SRandall Stewart SYSCTL_ADD_LONG(&hpts->hpts_ctx, 1934d7955cc0SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 1935d7955cc0SRandall Stewart OID_AUTO, "cur_min_sleep", CTLFLAG_RD, 19367312e4e5SRandall Stewart &hpts->p_mysleep.tv_usec, 1937d7955cc0SRandall Stewart "What the running pacers is using for p_mysleep.tv_usec"); 1938d7955cc0SRandall Stewart SYSCTL_ADD_U64(&hpts->hpts_ctx, 1939d7955cc0SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 1940d7955cc0SRandall Stewart OID_AUTO, "now_sleeping", CTLFLAG_RD, 1941d7955cc0SRandall Stewart &hpts->sleeping, 0, 1942d7955cc0SRandall Stewart "What the running pacers is actually sleeping for"); 1943d7955cc0SRandall Stewart SYSCTL_ADD_U64(&hpts->hpts_ctx, 1944d7955cc0SRandall Stewart SYSCTL_CHILDREN(hpts->hpts_root), 1945d7955cc0SRandall Stewart OID_AUTO, "syscall_cnt", CTLFLAG_RD, 1946d7955cc0SRandall Stewart &hpts->syscall_cnt, 0, 1947d7955cc0SRandall Stewart "How many times we had syscalls on this hpts"); 1948d7955cc0SRandall Stewart 19493b0b41e6SRandall Stewart hpts->p_hpts_sleep_time = hpts_sleep_max; 19503ee9c3c4SRandall Stewart hpts->p_num = i; 19513b0b41e6SRandall Stewart hpts->p_curtick = tcp_gethptstick(&tv); 1952175d4d69SGleb Smirnoff tcp_pace.cts_last_ran[i] = tcp_tv_to_usectick(&tv); 19533b0b41e6SRandall Stewart hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); 19543ee9c3c4SRandall Stewart hpts->p_cpu = 0xffff; 1955d7955cc0SRandall Stewart hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1); 19563ee9c3c4SRandall Stewart callout_init(&hpts->co, 1); 19573ee9c3c4SRandall Stewart } 19584e255d74SAndrew Gallatin /* Don't try to bind to NUMA domains if we don't have any */ 19594e255d74SAndrew Gallatin if (vm_ndomains == 1 && tcp_bind_threads == 2) 19604e255d74SAndrew Gallatin tcp_bind_threads = 0; 19614e255d74SAndrew Gallatin 19623ee9c3c4SRandall Stewart /* 19633ee9c3c4SRandall Stewart * Now lets start ithreads to handle the hptss. 
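 *
 * (Illustrative note, not in the original source: each pacer gets its
 * own software interrupt thread via swi_add() below; with
 * tcp_bind_threads == 1 pacer i's thread is pinned to CPU i, and with
 * tcp_bind_threads == 2 it is bound to the cpuset of that CPU's L3
 * group.  The per-pacer state set up above is visible from userland
 * through the sysctl tree, e.g. `sysctl net.inet.tcp.hpts.0.out_qcnt`.)
 *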
19643ee9c3c4SRandall Stewart */ 1965d7955cc0SRandall Stewart for (i = 0; i < tcp_pace.rp_num_hptss; i++) { 19663ee9c3c4SRandall Stewart hpts = tcp_pace.rp_ent[i]; 19673ee9c3c4SRandall Stewart hpts->p_cpu = i; 19686e6439b2SRandall Stewart 19693ee9c3c4SRandall Stewart error = swi_add(&hpts->ie, "hpts", 19703ee9c3c4SRandall Stewart tcp_hpts_thread, (void *)hpts, 19713ee9c3c4SRandall Stewart SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); 1972d7955cc0SRandall Stewart KASSERT(error == 0, 1973d7955cc0SRandall Stewart ("Can't add hpts:%p i:%d err:%d", 1974d7955cc0SRandall Stewart hpts, i, error)); 19753ee9c3c4SRandall Stewart created++; 1976d7955cc0SRandall Stewart hpts->p_mysleep.tv_sec = 0; 1977d7955cc0SRandall Stewart hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; 19784e255d74SAndrew Gallatin if (tcp_bind_threads == 1) { 19793ee9c3c4SRandall Stewart if (intr_event_bind(hpts->ie, i) == 0) 19803ee9c3c4SRandall Stewart bound++; 19814e255d74SAndrew Gallatin } else if (tcp_bind_threads == 2) { 19826e6439b2SRandall Stewart /* Find the group for this CPU (i) and bind into it */ 19836e6439b2SRandall Stewart for (j = 0; j < tcp_pace.grp_cnt; j++) { 19846e6439b2SRandall Stewart if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) { 19856e6439b2SRandall Stewart if (intr_event_bind_ithread_cpuset(hpts->ie, 19866e6439b2SRandall Stewart &tcp_pace.grps[j]->cg_mask) == 0) { 19876e6439b2SRandall Stewart bound++; 19884e255d74SAndrew Gallatin pc = pcpu_find(i); 19894e255d74SAndrew Gallatin domain = pc->pc_domain; 19904e255d74SAndrew Gallatin count = hpts_domains[domain].count; 19914e255d74SAndrew Gallatin hpts_domains[domain].cpu[count] = i; 19924e255d74SAndrew Gallatin hpts_domains[domain].count++; 19936e6439b2SRandall Stewart break; 19946e6439b2SRandall Stewart } 19956e6439b2SRandall Stewart } 19964e255d74SAndrew Gallatin } 19973ee9c3c4SRandall Stewart } 19983ee9c3c4SRandall Stewart tv.tv_sec = 0; 1999d7955cc0SRandall Stewart tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; 2000d7955cc0SRandall Stewart hpts->sleeping = tv.tv_usec; 20013ee9c3c4SRandall Stewart sb = tvtosbt(tv); 20023ee9c3c4SRandall Stewart callout_reset_sbt_on(&hpts->co, sb, 0, 20036e6439b2SRandall Stewart hpts_timeout_swi, hpts, hpts->p_cpu, 20043ee9c3c4SRandall Stewart (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); 20053ee9c3c4SRandall Stewart } 20064e255d74SAndrew Gallatin /* 20074e255d74SAndrew Gallatin * If we somehow have an empty domain, fall back to choosing 20084e255d74SAndrew Gallatin * among all htps threads. 20094e255d74SAndrew Gallatin */ 20104e255d74SAndrew Gallatin for (i = 0; i < vm_ndomains; i++) { 20114e255d74SAndrew Gallatin if (hpts_domains[i].count == 0) { 20124e255d74SAndrew Gallatin tcp_bind_threads = 0; 20134e255d74SAndrew Gallatin break; 20144e255d74SAndrew Gallatin } 20154e255d74SAndrew Gallatin } 20162c6fc36aSGleb Smirnoff tcp_hpts_softclock = __tcp_run_hpts; 20172c6fc36aSGleb Smirnoff tcp_lro_hpts_init(); 20184e255d74SAndrew Gallatin printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", 20194e255d74SAndrew Gallatin created, bound, 20204e255d74SAndrew Gallatin tcp_bind_threads == 2 ? 
"NUMA domains" : "cpus"); 20213ee9c3c4SRandall Stewart } 20223ee9c3c4SRandall Stewart 202348b55a7cSGleb Smirnoff static void 202448b55a7cSGleb Smirnoff tcp_hpts_mod_unload(void) 202548b55a7cSGleb Smirnoff { 202648b55a7cSGleb Smirnoff int rv __diagused; 202748b55a7cSGleb Smirnoff 202848b55a7cSGleb Smirnoff tcp_lro_hpts_uninit(); 202948b55a7cSGleb Smirnoff atomic_store_ptr(&tcp_hpts_softclock, NULL); 203048b55a7cSGleb Smirnoff 203148b55a7cSGleb Smirnoff for (int i = 0; i < tcp_pace.rp_num_hptss; i++) { 203248b55a7cSGleb Smirnoff struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i]; 203348b55a7cSGleb Smirnoff 203448b55a7cSGleb Smirnoff rv = callout_drain(&hpts->co); 203548b55a7cSGleb Smirnoff MPASS(rv != 0); 203648b55a7cSGleb Smirnoff 203748b55a7cSGleb Smirnoff rv = swi_remove(hpts->ie_cookie); 203848b55a7cSGleb Smirnoff MPASS(rv == 0); 203948b55a7cSGleb Smirnoff 204048b55a7cSGleb Smirnoff rv = sysctl_ctx_free(&hpts->hpts_ctx); 204148b55a7cSGleb Smirnoff MPASS(rv == 0); 204248b55a7cSGleb Smirnoff 204348b55a7cSGleb Smirnoff mtx_destroy(&hpts->p_mtx); 204448b55a7cSGleb Smirnoff free(hpts->p_hptss, M_TCPHPTS); 204548b55a7cSGleb Smirnoff free(hpts, M_TCPHPTS); 204648b55a7cSGleb Smirnoff } 204748b55a7cSGleb Smirnoff 204848b55a7cSGleb Smirnoff free(tcp_pace.rp_ent, M_TCPHPTS); 204948b55a7cSGleb Smirnoff free(tcp_pace.cts_last_ran, M_TCPHPTS); 205048b55a7cSGleb Smirnoff #ifdef SMP 205148b55a7cSGleb Smirnoff free(tcp_pace.grps, M_TCPHPTS); 205248b55a7cSGleb Smirnoff #endif 205348b55a7cSGleb Smirnoff 205448b55a7cSGleb Smirnoff counter_u64_free(hpts_hopelessly_behind); 205548b55a7cSGleb Smirnoff counter_u64_free(hpts_loops); 205648b55a7cSGleb Smirnoff counter_u64_free(back_tosleep); 205748b55a7cSGleb Smirnoff counter_u64_free(combined_wheel_wrap); 205848b55a7cSGleb Smirnoff counter_u64_free(wheel_wrap); 205948b55a7cSGleb Smirnoff counter_u64_free(hpts_wake_timeout); 206048b55a7cSGleb Smirnoff counter_u64_free(hpts_direct_awakening); 206148b55a7cSGleb Smirnoff counter_u64_free(hpts_back_tosleep); 206248b55a7cSGleb Smirnoff counter_u64_free(hpts_direct_call); 206348b55a7cSGleb Smirnoff counter_u64_free(cpu_uses_flowid); 206448b55a7cSGleb Smirnoff counter_u64_free(cpu_uses_random); 206548b55a7cSGleb Smirnoff } 206648b55a7cSGleb Smirnoff 206748b55a7cSGleb Smirnoff static int 206848b55a7cSGleb Smirnoff tcp_hpts_modevent(module_t mod, int what, void *arg) 206948b55a7cSGleb Smirnoff { 207048b55a7cSGleb Smirnoff 207148b55a7cSGleb Smirnoff switch (what) { 207248b55a7cSGleb Smirnoff case MOD_LOAD: 207348b55a7cSGleb Smirnoff tcp_hpts_mod_load(); 207448b55a7cSGleb Smirnoff return (0); 207548b55a7cSGleb Smirnoff case MOD_QUIESCE: 207648b55a7cSGleb Smirnoff /* 207748b55a7cSGleb Smirnoff * Since we are a dependency of TCP stack modules, they should 207848b55a7cSGleb Smirnoff * already be unloaded, and the HPTS ring is empty. However, 207948b55a7cSGleb Smirnoff * function pointer manipulations aren't 100% safe. Although, 208048b55a7cSGleb Smirnoff * tcp_hpts_mod_unload() use atomic(9) the userret() doesn't. 208148b55a7cSGleb Smirnoff * Thus, allow only forced unload of HPTS. 
208248b55a7cSGleb Smirnoff */ 208348b55a7cSGleb Smirnoff return (EBUSY); 208448b55a7cSGleb Smirnoff case MOD_UNLOAD: 208548b55a7cSGleb Smirnoff tcp_hpts_mod_unload(); 208648b55a7cSGleb Smirnoff return (0); 208748b55a7cSGleb Smirnoff default: 208848b55a7cSGleb Smirnoff return (EINVAL); 208948b55a7cSGleb Smirnoff }; 209048b55a7cSGleb Smirnoff } 209148b55a7cSGleb Smirnoff 209248b55a7cSGleb Smirnoff static moduledata_t tcp_hpts_module = { 209348b55a7cSGleb Smirnoff .name = "tcphpts", 209448b55a7cSGleb Smirnoff .evhand = tcp_hpts_modevent, 209548b55a7cSGleb Smirnoff }; 209648b55a7cSGleb Smirnoff 209748b55a7cSGleb Smirnoff DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY); 2098cff21e48SJonathan T. Looney MODULE_VERSION(tcphpts, 1); 2099
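
/*
 * (Editorial example, not part of the original file: a TCP stack module
 * that relies on the pacer would declare a dependency on it, roughly
 *
 *	MODULE_DEPEND(tcp_rack, tcphpts, 1, 1, 1);
 *
 * where "tcp_rack" is a hypothetical consumer; the version numbers match
 * MODULE_VERSION(tcphpts, 1) above, and the dependency keeps this module
 * loaded while the stack is in use.)
 */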