/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Push/pop bulk sizes, marked volatile so the compiler cannot treat them as
 * compile-time constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

/* Barrier used to start all participating lcores at (nearly) the same time */
static rte_atomic32_t lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

/* Find two lcores that are hyperthread siblings: same core, same socket. */
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = lcore_config[id[0]].core_id;
			core[1] = lcore_config[id[1]].core_id;
			socket[0] = lcore_config[id[0]].socket_id;
			socket[1] = lcore_config[id[1]].socket_id;
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Find two lcores on distinct physical cores of the same socket. */
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = lcore_config[id[0]].core_id;
			core[1] = lcore_config[id[1]].core_id;
			socket[0] = lcore_config[id[0]].socket_id;
			socket[1] = lcore_config[id[1]].socket_id;
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Find two lcores on different sockets (NUMA nodes). */
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = lcore_config[id[0]].socket_id;
			socket[1] = lcore_config[id[1]].socket_id;
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}
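
/*
 * Note on the rte_stack API exercised below: rte_stack_push() and
 * rte_stack_pop() are all-or-nothing and return the number of objects
 * actually pushed or popped (either 0 or n). The empty-pop test therefore
 * measures the cost of a pop that fails and returns 0.
 */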

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}

struct thread_args {
	struct rte_stack *s;	/* stack to operate on */
	unsigned int sz;	/* bulk push/pop size */
	double avg;		/* average cycles per object (output) */
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

	/* Wait until every participating lcore has reached the barrier */
	rte_atomic32_sub(&lcore_barrier, 1);
	while (rte_atomic32_read(&lcore_barrier) != 0)
		rte_pause();

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
		rte_atomic32_set(&lcore_barrier, 2);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

		if (cores->c1 == rte_get_master_lcore()) {
			/* Run one half of the pair on the master lcore */
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on n cores (including the master). */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		rte_atomic32_set(&lcore_barrier, n);

		/* Launch on n - 1 slave lcores; the master runs fn() below */
		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %u\n",
					  lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		/* Sum the results from the same n - 1 slave lcores */
		cnt = 0;
		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}
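
/*
 * Timing note: rte_rdtsc() reads the timestamp counter without serializing
 * instruction execution. Over these multi-million-iteration loops any skew
 * at the endpoints is negligible, but rte_rdtsc_precise(), which issues a
 * memory barrier before reading the TSC, could be substituted for stricter
 * measurements.
 */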

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}

static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic32_init(&lcore_barrier);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
	return __test_stack_perf(RTE_STACK_F_LF);
}

REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);
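
/*
 * Usage sketch (assumed invocation; the binary path depends on the build
 * system): run the DPDK test application and issue the registered commands
 * at its prompt, e.g.
 *
 *   $ ./build/app/test -l 0-3
 *   RTE>>stack_perf_autotest
 *   RTE>>stack_lf_perf_autotest
 */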