/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

static rte_atomic32_t lcore_barrier;

struct lcore_pair {
        unsigned int c1;
        unsigned int c2;
};

static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] == core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

static int
get_two_cores(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int core[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        core[0] = rte_lcore_to_cpu_id(id[0]);
                        core[1] = rte_lcore_to_cpu_id(id[1]);
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if ((core[0] != core[1]) && (socket[0] == socket[1])) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

static int
get_two_sockets(struct lcore_pair *lcp)
{
        unsigned int socket[2];
        unsigned int id[2];

        RTE_LCORE_FOREACH(id[0]) {
                RTE_LCORE_FOREACH(id[1]) {
                        if (id[0] == id[1])
                                continue;
                        socket[0] = rte_lcore_to_socket_id(id[0]);
                        socket[1] = rte_lcore_to_socket_id(id[1]);
                        if (socket[0] != socket[1]) {
                                lcp->c1 = id[0];
                                lcp->c2 = id[1];
                                return 0;
                        }
                }
        }

        return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
        unsigned int iterations = 100000000;
        void *objs[MAX_BURST];
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++)
                rte_stack_pop(s, objs, bulk_sizes[0]);

        uint64_t end = rte_rdtsc();

        printf("Stack empty pop: %.2F\n",
               (double)(end - start) / iterations);
}

struct thread_args {
        struct rte_stack *s;
        unsigned int sz;
        double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop */
static int
bulk_push_pop(void *p)
{
        unsigned int iterations = 1000000;
        struct thread_args *args = p;
        void *objs[MAX_BURST] = {0};
        unsigned int size, i;
        struct rte_stack *s;

        s = args->s;
        size = args->sz;

        rte_atomic32_sub(&lcore_barrier, 1);
        while (rte_atomic32_read(&lcore_barrier) != 0)
                rte_pause();

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, objs, size);
                rte_stack_pop(s, objs, size);
        }

        uint64_t end = rte_rdtsc();

        args->avg = ((double)(end - start)) / (iterations * size);

        return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * perf between hyperthread siblings, cores on the same socket, and cores
 * on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
                 lcore_function_t fn)
{
        struct thread_args args[2];
        unsigned int i;

        for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
                rte_atomic32_set(&lcore_barrier, 2);

                args[0].sz = args[1].sz = bulk_sizes[i];
                args[0].s = args[1].s = s;

                if (cores->c1 == rte_get_main_lcore()) {
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        fn(&args[0]);
                        rte_eal_wait_lcore(cores->c2);
                } else {
                        rte_eal_remote_launch(fn, &args[0], cores->c1);
                        rte_eal_remote_launch(fn, &args[1], cores->c2);
                        rte_eal_wait_lcore(cores->c1);
                        rte_eal_wait_lcore(cores->c2);
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
        }
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
        struct thread_args args[RTE_MAX_LCORE];
        unsigned int i;

        for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
                unsigned int lcore_id;
                int cnt = 0;
                double avg;

                rte_atomic32_set(&lcore_barrier, n);

                RTE_LCORE_FOREACH_WORKER(lcore_id) {
                        if (++cnt >= n)
                                break;

                        args[lcore_id].s = s;
                        args[lcore_id].sz = bulk_sizes[i];

                        if (rte_eal_remote_launch(fn, &args[lcore_id],
                                                  lcore_id))
                                rte_panic("Failed to launch lcore %d\n",
                                          lcore_id);
                }

                lcore_id = rte_lcore_id();

                args[lcore_id].s = s;
                args[lcore_id].sz = bulk_sizes[i];

                fn(&args[lcore_id]);

                rte_eal_mp_wait_lcore();

                avg = args[rte_lcore_id()].avg;

                cnt = 0;
                RTE_LCORE_FOREACH_WORKER(lcore_id) {
                        if (++cnt >= n)
                                break;
                        avg += args[lcore_id].avg;
                }

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[i], avg / n);
        }
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 16000000;
        void *obj = NULL;
        unsigned int i;

        uint64_t start = rte_rdtsc();

        for (i = 0; i < iterations; i++) {
                rte_stack_push(s, &obj, 1);
                rte_stack_pop(s, &obj, 1);
        }

        uint64_t end = rte_rdtsc();

        printf("Average cycles per single object push/pop: %.2F\n",
               ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
        unsigned int iterations = 8000000;
        void *objs[MAX_BURST];
        unsigned int sz, i;

        for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
                uint64_t start = rte_rdtsc();

                for (i = 0; i < iterations; i++) {
                        rte_stack_push(s, objs, bulk_sizes[sz]);
                        rte_stack_pop(s, objs, bulk_sizes[sz]);
                }

                uint64_t end = rte_rdtsc();

                double avg = ((double)(end - start) /
                              (iterations * bulk_sizes[sz]));

                printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
                       bulk_sizes[sz], avg);
        }
}

static int
__test_stack_perf(uint32_t flags)
{
        struct lcore_pair cores;
        struct rte_stack *s;

        rte_atomic32_init(&lcore_barrier);

        s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
        if (s == NULL) {
                printf("[%s():%u] failed to create a stack\n",
                       __func__, __LINE__);
                return -1;
        }

        printf("### Testing single element push/pop ###\n");
        test_single_push_pop(s);

        printf("\n### Testing empty pop ###\n");
        test_empty_pop(s);

        printf("\n### Testing using a single lcore ###\n");
        test_bulk_push_pop(s);

        if (get_two_hyperthreads(&cores) == 0) {
                printf("\n### Testing using two hyperthreads ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_cores(&cores) == 0) {
                printf("\n### Testing using two physical cores ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }
        if (get_two_sockets(&cores) == 0) {
                printf("\n### Testing using two NUMA nodes ###\n");
                run_on_core_pair(&cores, s, bulk_push_pop);
        }

        printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
        run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

        rte_stack_free(s);
        return 0;
}

static int
test_stack_perf(void)
{
        return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
        return __test_stack_perf(RTE_STACK_F_LF);
#else
        return TEST_SKIPPED;
#endif
}

REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);
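
/*
 * Usage sketch: the cases above are registered as "stack_perf_autotest" and
 * "stack_lf_perf_autotest", so they can be driven from the dpdk-test binary.
 * A minimal example, assuming dpdk-test has been built and using an
 * illustrative lcore list of 0-3 (adjust to the target machine; the
 * hyperthread, same-socket, and cross-socket pair cases only run when such
 * lcore pairs exist in the EAL core set, and the lock-free case is skipped
 * unless RTE_STACK_LF_SUPPORTED is defined for the platform):
 *
 *     ./dpdk-test -l 0-3
 *     RTE>>stack_perf_autotest
 *     RTE>>stack_lf_perf_autotest
 */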