/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

static uint32_t lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

/* Find two lcores that are hyperthread siblings on the same physical core. */
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Find two lcores on distinct physical cores of the same socket. */
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Find two lcores on different sockets. */
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}
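
/*
 * Illustrative sketch, not part of the original benchmark: a hypothetical
 * helper for reporting which lcore pair a topology probe selected, e.g. when
 * sanity-checking the pairs the tests below will run on. It relies only on
 * the lcore_pair struct and EAL topology queries already used above, and
 * assumes the __rte_unused marker is available via the existing includes.
 */
static void __rte_unused
print_lcore_pair(const char *label, const struct lcore_pair *lcp)
{
	printf("%s: lcore %u (socket %u) and lcore %u (socket %u)\n", label,
	       lcp->c1, rte_lcore_to_socket_id(lcp->c1),
	       lcp->c2, rte_lcore_to_socket_id(lcp->c2));
}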

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}

struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

	__atomic_fetch_sub(&lcore_barrier, 1, __ATOMIC_RELAXED);
	rte_wait_until_equal_32(&lcore_barrier, 0, __ATOMIC_RELAXED);

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		__atomic_store_n(&lcore_barrier, 2, __ATOMIC_RELAXED);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

		if (cores->c1 == rte_get_main_lcore()) {
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		__atomic_store_n(&lcore_barrier, n, __ATOMIC_RELAXED);

		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}
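
/*
 * Illustrative sketch, not part of the original benchmark: converting a
 * per-object cycle count reported above into nanoseconds using the TSC
 * frequency from rte_get_tsc_hz() (declared in <rte_cycles.h>, already
 * included). For example, 20 cycles/object on a 2 GHz TSC is 10 ns per
 * push/pop pair.
 */
static double __rte_unused
cycles_to_ns(double cycles)
{
	return cycles * 1E9 / rte_get_tsc_hz();
}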

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}

static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	__atomic_store_n(&lcore_barrier, 0, __ATOMIC_RELAXED);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
	return __test_stack_perf(RTE_STACK_F_LF);
#else
	return TEST_SKIPPED;
#endif
}

REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);
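
/*
 * Usage note (not part of the test code itself): once the test binary is
 * built, these benchmarks can typically be run from the dpdk-test command
 * prompt as "stack_perf_autotest" and, where RTE_STACK_LF_SUPPORTED is
 * defined, "stack_lf_perf_autotest".
 */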