1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2024 Ericsson AB 3 */ 4 5 #define MAX_MODS 1024 6 7 #include <stdio.h> 8 9 #include <rte_bitops.h> 10 #include <rte_cycles.h> 11 #include <rte_lcore_var.h> 12 #include <rte_per_lcore.h> 13 #include <rte_random.h> 14 15 #include "test.h" 16 17 struct mod_lcore_state { 18 uint64_t a; 19 uint64_t b; 20 uint64_t sum; 21 }; 22 23 static void 24 mod_init(struct mod_lcore_state *state) 25 { 26 state->a = rte_rand(); 27 state->b = rte_rand(); 28 state->sum = 0; 29 } 30 31 static __rte_always_inline void 32 mod_update(volatile struct mod_lcore_state *state) 33 { 34 state->sum += state->a * state->b; 35 } 36 37 struct __rte_cache_aligned mod_lcore_state_aligned { 38 struct mod_lcore_state mod_state; 39 40 RTE_CACHE_GUARD; 41 }; 42 43 static struct mod_lcore_state_aligned 44 sarray_lcore_state[MAX_MODS][RTE_MAX_LCORE]; 45 46 static void 47 sarray_init(void) 48 { 49 unsigned int lcore_id = rte_lcore_id(); 50 int mod; 51 52 for (mod = 0; mod < MAX_MODS; mod++) { 53 struct mod_lcore_state *mod_state = &sarray_lcore_state[mod][lcore_id].mod_state; 54 55 mod_init(mod_state); 56 } 57 } 58 59 static __rte_noinline void 60 sarray_update(unsigned int mod) 61 { 62 unsigned int lcore_id = rte_lcore_id(); 63 struct mod_lcore_state *mod_state = &sarray_lcore_state[mod][lcore_id].mod_state; 64 65 mod_update(mod_state); 66 } 67 68 struct mod_lcore_state_lazy { 69 struct mod_lcore_state mod_state; 70 bool initialized; 71 }; 72 73 /* 74 * Note: it's usually a bad idea have this much thread-local storage 75 * allocated in a real application, since it will incur a cost on 76 * thread creation and non-lcore thread memory usage. 77 */ 78 static RTE_DEFINE_PER_LCORE(struct mod_lcore_state_lazy, tls_lcore_state)[MAX_MODS]; 79 80 static inline void 81 tls_init(struct mod_lcore_state_lazy *state) 82 { 83 mod_init(&state->mod_state); 84 85 state->initialized = true; 86 } 87 88 static __rte_noinline void 89 tls_lazy_update(unsigned int mod) 90 { 91 struct mod_lcore_state_lazy *state = 92 &RTE_PER_LCORE(tls_lcore_state[mod]); 93 94 /* With thread-local storage, initialization must usually be lazy */ 95 if (!state->initialized) 96 tls_init(state); 97 98 mod_update(&state->mod_state); 99 } 100 101 static __rte_noinline void 102 tls_update(unsigned int mod) 103 { 104 struct mod_lcore_state_lazy *state = 105 &RTE_PER_LCORE(tls_lcore_state[mod]); 106 107 mod_update(&state->mod_state); 108 } 109 110 RTE_LCORE_VAR_HANDLE(struct mod_lcore_state, lvar_lcore_state)[MAX_MODS]; 111 112 static void 113 lvar_init(void) 114 { 115 unsigned int mod; 116 117 for (mod = 0; mod < MAX_MODS; mod++) { 118 RTE_LCORE_VAR_ALLOC(lvar_lcore_state[mod]); 119 120 struct mod_lcore_state *state = RTE_LCORE_VAR(lvar_lcore_state[mod]); 121 122 mod_init(state); 123 } 124 } 125 126 static __rte_noinline void 127 lvar_update(unsigned int mod) 128 { 129 struct mod_lcore_state *state = RTE_LCORE_VAR(lvar_lcore_state[mod]); 130 131 mod_update(state); 132 } 133 134 static void 135 shuffle(unsigned int *elems, size_t len) 136 { 137 size_t i; 138 139 for (i = len - 1; i > 0; i--) { 140 unsigned int other = rte_rand_max(i + 1); 141 142 unsigned int tmp = elems[other]; 143 elems[other] = elems[i]; 144 elems[i] = tmp; 145 } 146 } 147 148 #define ITERATIONS UINT64_C(10000000) 149 150 static inline double 151 benchmark_access(const unsigned int *mods, unsigned int num_mods, 152 void (*init_fun)(void), void (*update_fun)(unsigned int)) 153 { 154 unsigned int i; 155 double start; 156 double end; 157 double latency; 158 unsigned int num_mods_mask = num_mods - 1; 159 160 RTE_VERIFY(rte_is_power_of_2(num_mods)); 161 162 if (init_fun != NULL) 163 init_fun(); 164 165 /* Warm up cache and make sure TLS variables are initialized */ 166 for (i = 0; i < num_mods; i++) 167 update_fun(i); 168 169 start = rte_rdtsc(); 170 171 for (i = 0; i < ITERATIONS; i++) 172 update_fun(mods[i & num_mods_mask]); 173 174 end = rte_rdtsc(); 175 176 latency = (end - start) / (double)ITERATIONS; 177 178 return latency; 179 } 180 181 static void 182 test_lcore_var_access_n(unsigned int num_mods) 183 { 184 double sarray_latency; 185 double tls_latency; 186 double lazy_tls_latency; 187 double lvar_latency; 188 unsigned int mods[num_mods]; 189 unsigned int i; 190 191 for (i = 0; i < num_mods; i++) 192 mods[i] = i; 193 194 shuffle(mods, num_mods); 195 196 sarray_latency = 197 benchmark_access(mods, num_mods, sarray_init, sarray_update); 198 199 tls_latency = 200 benchmark_access(mods, num_mods, NULL, tls_update); 201 202 lazy_tls_latency = 203 benchmark_access(mods, num_mods, NULL, tls_lazy_update); 204 205 lvar_latency = 206 benchmark_access(mods, num_mods, lvar_init, lvar_update); 207 208 printf("%17u %8.1f %14.1f %15.1f %10.1f\n", num_mods, sarray_latency, 209 tls_latency, lazy_tls_latency, lvar_latency); 210 } 211 212 /* 213 * The potential performance benefit of lcore variables compared to 214 * the use of statically sized, lcore id-indexed arrays is not 215 * shorter latencies in a scenario with low cache pressure, but rather 216 * fewer cache misses in a real-world scenario, with extensive cache 217 * usage. These tests are a crude simulation of such, using <N> dummy 218 * modules, each with a small, per-lcore state. Note however that 219 * these tests have very little non-lcore/thread local state, which is 220 * unrealistic. 221 */ 222 223 static int 224 test_lcore_var_access(void) 225 { 226 unsigned int num_mods = 1; 227 228 printf("- Latencies [TSC cycles/update] -\n"); 229 printf("Number of Static Thread-local Thread-local Lcore\n"); 230 printf("Modules/Variables Array Storage Storage (Lazy) Variables\n"); 231 232 for (num_mods = 1; num_mods <= MAX_MODS; num_mods *= 2) 233 test_lcore_var_access_n(num_mods); 234 235 return TEST_SUCCESS; 236 } 237 238 static struct unit_test_suite lcore_var_testsuite = { 239 .suite_name = "lcore variable perf autotest", 240 .unit_test_cases = { 241 TEST_CASE(test_lcore_var_access), 242 TEST_CASES_END() 243 }, 244 }; 245 246 static int 247 test_lcore_var_perf(void) 248 { 249 return unit_test_suite_runner(&lcore_var_testsuite); 250 } 251 252 REGISTER_PERF_TEST(lcore_var_perf_autotest, test_lcore_var_perf); 253