1cfe6fab0SGage Eads /* SPDX-License-Identifier: BSD-3-Clause 2cfe6fab0SGage Eads * Copyright(c) 2019 Intel Corporation 3cfe6fab0SGage Eads */ 4cfe6fab0SGage Eads 5cfe6fab0SGage Eads 6cfe6fab0SGage Eads #include <stdio.h> 7cfe6fab0SGage Eads #include <inttypes.h> 8cfe6fab0SGage Eads 9cfe6fab0SGage Eads #include <rte_atomic.h> 10cfe6fab0SGage Eads #include <rte_cycles.h> 11cfe6fab0SGage Eads #include <rte_launch.h> 12cfe6fab0SGage Eads #include <rte_pause.h> 13cfe6fab0SGage Eads #include <rte_stack.h> 14cfe6fab0SGage Eads 15cfe6fab0SGage Eads #include "test.h" 16cfe6fab0SGage Eads 17cfe6fab0SGage Eads #define STACK_NAME "STACK_PERF" 18cfe6fab0SGage Eads #define MAX_BURST 32 19cfe6fab0SGage Eads #define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST) 20cfe6fab0SGage Eads 21cfe6fab0SGage Eads /* 22cfe6fab0SGage Eads * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time 23cfe6fab0SGage Eads * constants. 24cfe6fab0SGage Eads */ 25cfe6fab0SGage Eads static volatile unsigned int bulk_sizes[] = {8, MAX_BURST}; 26cfe6fab0SGage Eads 27cfe6fab0SGage Eads static rte_atomic32_t lcore_barrier; 28cfe6fab0SGage Eads 29cfe6fab0SGage Eads struct lcore_pair { 30cfe6fab0SGage Eads unsigned int c1; 31cfe6fab0SGage Eads unsigned int c2; 32cfe6fab0SGage Eads }; 33cfe6fab0SGage Eads 34cfe6fab0SGage Eads static int 35cfe6fab0SGage Eads get_two_hyperthreads(struct lcore_pair *lcp) 36cfe6fab0SGage Eads { 37cfe6fab0SGage Eads unsigned int socket[2]; 38cfe6fab0SGage Eads unsigned int core[2]; 39cfe6fab0SGage Eads unsigned int id[2]; 40cfe6fab0SGage Eads 41cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[0]) { 42cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[1]) { 43cfe6fab0SGage Eads if (id[0] == id[1]) 44cfe6fab0SGage Eads continue; 45de307f7aSStephen Hemminger core[0] = rte_lcore_to_cpu_id(id[0]); 46de307f7aSStephen Hemminger core[1] = rte_lcore_to_cpu_id(id[1]); 47de307f7aSStephen Hemminger socket[0] = rte_lcore_to_socket_id(id[0]); 48de307f7aSStephen Hemminger socket[1] = rte_lcore_to_socket_id(id[1]); 49cfe6fab0SGage Eads if ((core[0] == core[1]) && (socket[0] == socket[1])) { 50cfe6fab0SGage Eads lcp->c1 = id[0]; 51cfe6fab0SGage Eads lcp->c2 = id[1]; 52cfe6fab0SGage Eads return 0; 53cfe6fab0SGage Eads } 54cfe6fab0SGage Eads } 55cfe6fab0SGage Eads } 56cfe6fab0SGage Eads 57cfe6fab0SGage Eads return 1; 58cfe6fab0SGage Eads } 59cfe6fab0SGage Eads 60cfe6fab0SGage Eads static int 61cfe6fab0SGage Eads get_two_cores(struct lcore_pair *lcp) 62cfe6fab0SGage Eads { 63cfe6fab0SGage Eads unsigned int socket[2]; 64cfe6fab0SGage Eads unsigned int core[2]; 65cfe6fab0SGage Eads unsigned int id[2]; 66cfe6fab0SGage Eads 67cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[0]) { 68cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[1]) { 69cfe6fab0SGage Eads if (id[0] == id[1]) 70cfe6fab0SGage Eads continue; 71de307f7aSStephen Hemminger core[0] = rte_lcore_to_cpu_id(id[0]); 72de307f7aSStephen Hemminger core[1] = rte_lcore_to_cpu_id(id[1]); 73de307f7aSStephen Hemminger socket[0] = rte_lcore_to_socket_id(id[0]); 74de307f7aSStephen Hemminger socket[1] = rte_lcore_to_socket_id(id[1]); 75cfe6fab0SGage Eads if ((core[0] != core[1]) && (socket[0] == socket[1])) { 76cfe6fab0SGage Eads lcp->c1 = id[0]; 77cfe6fab0SGage Eads lcp->c2 = id[1]; 78cfe6fab0SGage Eads return 0; 79cfe6fab0SGage Eads } 80cfe6fab0SGage Eads } 81cfe6fab0SGage Eads } 82cfe6fab0SGage Eads 83cfe6fab0SGage Eads return 1; 84cfe6fab0SGage Eads } 85cfe6fab0SGage Eads 86cfe6fab0SGage Eads static int 87cfe6fab0SGage Eads get_two_sockets(struct lcore_pair *lcp) 88cfe6fab0SGage Eads { 89cfe6fab0SGage Eads unsigned int socket[2]; 90cfe6fab0SGage Eads unsigned int id[2]; 91cfe6fab0SGage Eads 92cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[0]) { 93cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[1]) { 94cfe6fab0SGage Eads if (id[0] == id[1]) 95cfe6fab0SGage Eads continue; 96de307f7aSStephen Hemminger socket[0] = rte_lcore_to_socket_id(id[0]); 97de307f7aSStephen Hemminger socket[1] = rte_lcore_to_socket_id(id[1]); 98cfe6fab0SGage Eads if (socket[0] != socket[1]) { 99cfe6fab0SGage Eads lcp->c1 = id[0]; 100cfe6fab0SGage Eads lcp->c2 = id[1]; 101cfe6fab0SGage Eads return 0; 102cfe6fab0SGage Eads } 103cfe6fab0SGage Eads } 104cfe6fab0SGage Eads } 105cfe6fab0SGage Eads 106cfe6fab0SGage Eads return 1; 107cfe6fab0SGage Eads } 108cfe6fab0SGage Eads 109cfe6fab0SGage Eads /* Measure the cycle cost of popping an empty stack. */ 110cfe6fab0SGage Eads static void 111cfe6fab0SGage Eads test_empty_pop(struct rte_stack *s) 112cfe6fab0SGage Eads { 113cfe6fab0SGage Eads unsigned int iterations = 100000000; 114cfe6fab0SGage Eads void *objs[MAX_BURST]; 115cfe6fab0SGage Eads unsigned int i; 116cfe6fab0SGage Eads 117cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 118cfe6fab0SGage Eads 119cfe6fab0SGage Eads for (i = 0; i < iterations; i++) 120cfe6fab0SGage Eads rte_stack_pop(s, objs, bulk_sizes[0]); 121cfe6fab0SGage Eads 122cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 123cfe6fab0SGage Eads 124cfe6fab0SGage Eads printf("Stack empty pop: %.2F\n", 125cfe6fab0SGage Eads (double)(end - start) / iterations); 126cfe6fab0SGage Eads } 127cfe6fab0SGage Eads 128cfe6fab0SGage Eads struct thread_args { 129cfe6fab0SGage Eads struct rte_stack *s; 130cfe6fab0SGage Eads unsigned int sz; 131cfe6fab0SGage Eads double avg; 132cfe6fab0SGage Eads }; 133cfe6fab0SGage Eads 134cfe6fab0SGage Eads /* Measure the average per-pointer cycle cost of stack push and pop */ 135cfe6fab0SGage Eads static int 136cfe6fab0SGage Eads bulk_push_pop(void *p) 137cfe6fab0SGage Eads { 138cfe6fab0SGage Eads unsigned int iterations = 1000000; 139cfe6fab0SGage Eads struct thread_args *args = p; 140cfe6fab0SGage Eads void *objs[MAX_BURST] = {0}; 141cfe6fab0SGage Eads unsigned int size, i; 142cfe6fab0SGage Eads struct rte_stack *s; 143cfe6fab0SGage Eads 144cfe6fab0SGage Eads s = args->s; 145cfe6fab0SGage Eads size = args->sz; 146cfe6fab0SGage Eads 147cfe6fab0SGage Eads rte_atomic32_sub(&lcore_barrier, 1); 148cfe6fab0SGage Eads while (rte_atomic32_read(&lcore_barrier) != 0) 149cfe6fab0SGage Eads rte_pause(); 150cfe6fab0SGage Eads 151cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 152cfe6fab0SGage Eads 153cfe6fab0SGage Eads for (i = 0; i < iterations; i++) { 154cfe6fab0SGage Eads rte_stack_push(s, objs, size); 155cfe6fab0SGage Eads rte_stack_pop(s, objs, size); 156cfe6fab0SGage Eads } 157cfe6fab0SGage Eads 158cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 159cfe6fab0SGage Eads 160cfe6fab0SGage Eads args->avg = ((double)(end - start))/(iterations * size); 161cfe6fab0SGage Eads 162cfe6fab0SGage Eads return 0; 163cfe6fab0SGage Eads } 164cfe6fab0SGage Eads 165cfe6fab0SGage Eads /* 166cfe6fab0SGage Eads * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack 167cfe6fab0SGage Eads * perf when between hyperthread siblings, cores on the same socket, and cores 168cfe6fab0SGage Eads * on different sockets. 169cfe6fab0SGage Eads */ 170cfe6fab0SGage Eads static void 171cfe6fab0SGage Eads run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s, 172cfe6fab0SGage Eads lcore_function_t fn) 173cfe6fab0SGage Eads { 174cfe6fab0SGage Eads struct thread_args args[2]; 175cfe6fab0SGage Eads unsigned int i; 176cfe6fab0SGage Eads 177*8ada5b15SPavan Nikhilesh for (i = 0; i < RTE_DIM(bulk_sizes); i++) { 178cfe6fab0SGage Eads rte_atomic32_set(&lcore_barrier, 2); 179cfe6fab0SGage Eads 180cfe6fab0SGage Eads args[0].sz = args[1].sz = bulk_sizes[i]; 181cfe6fab0SGage Eads args[0].s = args[1].s = s; 182cfe6fab0SGage Eads 183cfe6fab0SGage Eads if (cores->c1 == rte_get_master_lcore()) { 184cfe6fab0SGage Eads rte_eal_remote_launch(fn, &args[1], cores->c2); 185cfe6fab0SGage Eads fn(&args[0]); 186cfe6fab0SGage Eads rte_eal_wait_lcore(cores->c2); 187cfe6fab0SGage Eads } else { 188cfe6fab0SGage Eads rte_eal_remote_launch(fn, &args[0], cores->c1); 189cfe6fab0SGage Eads rte_eal_remote_launch(fn, &args[1], cores->c2); 190cfe6fab0SGage Eads rte_eal_wait_lcore(cores->c1); 191cfe6fab0SGage Eads rte_eal_wait_lcore(cores->c2); 192cfe6fab0SGage Eads } 193cfe6fab0SGage Eads 194cfe6fab0SGage Eads printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", 195cfe6fab0SGage Eads bulk_sizes[i], (args[0].avg + args[1].avg) / 2); 196cfe6fab0SGage Eads } 197cfe6fab0SGage Eads } 198cfe6fab0SGage Eads 199cfe6fab0SGage Eads /* Run bulk_push_pop() simultaneously on 1+ cores. */ 200cfe6fab0SGage Eads static void 201cfe6fab0SGage Eads run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n) 202cfe6fab0SGage Eads { 203cfe6fab0SGage Eads struct thread_args args[RTE_MAX_LCORE]; 204cfe6fab0SGage Eads unsigned int i; 205cfe6fab0SGage Eads 206*8ada5b15SPavan Nikhilesh for (i = 0; i < RTE_DIM(bulk_sizes); i++) { 207cfe6fab0SGage Eads unsigned int lcore_id; 208cfe6fab0SGage Eads int cnt = 0; 209cfe6fab0SGage Eads double avg; 210cfe6fab0SGage Eads 211cfe6fab0SGage Eads rte_atomic32_set(&lcore_barrier, n); 212cfe6fab0SGage Eads 213cfe6fab0SGage Eads RTE_LCORE_FOREACH_SLAVE(lcore_id) { 214cfe6fab0SGage Eads if (++cnt >= n) 215cfe6fab0SGage Eads break; 216cfe6fab0SGage Eads 217cfe6fab0SGage Eads args[lcore_id].s = s; 218cfe6fab0SGage Eads args[lcore_id].sz = bulk_sizes[i]; 219cfe6fab0SGage Eads 220cfe6fab0SGage Eads if (rte_eal_remote_launch(fn, &args[lcore_id], 221cfe6fab0SGage Eads lcore_id)) 222cfe6fab0SGage Eads rte_panic("Failed to launch lcore %d\n", 223cfe6fab0SGage Eads lcore_id); 224cfe6fab0SGage Eads } 225cfe6fab0SGage Eads 226cfe6fab0SGage Eads lcore_id = rte_lcore_id(); 227cfe6fab0SGage Eads 228cfe6fab0SGage Eads args[lcore_id].s = s; 229cfe6fab0SGage Eads args[lcore_id].sz = bulk_sizes[i]; 230cfe6fab0SGage Eads 231cfe6fab0SGage Eads fn(&args[lcore_id]); 232cfe6fab0SGage Eads 233cfe6fab0SGage Eads rte_eal_mp_wait_lcore(); 234cfe6fab0SGage Eads 235cfe6fab0SGage Eads avg = args[rte_lcore_id()].avg; 236cfe6fab0SGage Eads 237cfe6fab0SGage Eads cnt = 0; 238cfe6fab0SGage Eads RTE_LCORE_FOREACH_SLAVE(lcore_id) { 239cfe6fab0SGage Eads if (++cnt >= n) 240cfe6fab0SGage Eads break; 241cfe6fab0SGage Eads avg += args[lcore_id].avg; 242cfe6fab0SGage Eads } 243cfe6fab0SGage Eads 244cfe6fab0SGage Eads printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", 245cfe6fab0SGage Eads bulk_sizes[i], avg / n); 246cfe6fab0SGage Eads } 247cfe6fab0SGage Eads } 248cfe6fab0SGage Eads 249cfe6fab0SGage Eads /* 250cfe6fab0SGage Eads * Measure the cycle cost of pushing and popping a single pointer on a single 251cfe6fab0SGage Eads * lcore. 252cfe6fab0SGage Eads */ 253cfe6fab0SGage Eads static void 254cfe6fab0SGage Eads test_single_push_pop(struct rte_stack *s) 255cfe6fab0SGage Eads { 256cfe6fab0SGage Eads unsigned int iterations = 16000000; 257cfe6fab0SGage Eads void *obj = NULL; 258cfe6fab0SGage Eads unsigned int i; 259cfe6fab0SGage Eads 260cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 261cfe6fab0SGage Eads 262cfe6fab0SGage Eads for (i = 0; i < iterations; i++) { 263cfe6fab0SGage Eads rte_stack_push(s, &obj, 1); 264cfe6fab0SGage Eads rte_stack_pop(s, &obj, 1); 265cfe6fab0SGage Eads } 266cfe6fab0SGage Eads 267cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 268cfe6fab0SGage Eads 269cfe6fab0SGage Eads printf("Average cycles per single object push/pop: %.2F\n", 270cfe6fab0SGage Eads ((double)(end - start)) / iterations); 271cfe6fab0SGage Eads } 272cfe6fab0SGage Eads 273cfe6fab0SGage Eads /* Measure the cycle cost of bulk pushing and popping on a single lcore. */ 274cfe6fab0SGage Eads static void 275cfe6fab0SGage Eads test_bulk_push_pop(struct rte_stack *s) 276cfe6fab0SGage Eads { 277cfe6fab0SGage Eads unsigned int iterations = 8000000; 278cfe6fab0SGage Eads void *objs[MAX_BURST]; 279cfe6fab0SGage Eads unsigned int sz, i; 280cfe6fab0SGage Eads 281*8ada5b15SPavan Nikhilesh for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) { 282cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 283cfe6fab0SGage Eads 284cfe6fab0SGage Eads for (i = 0; i < iterations; i++) { 285cfe6fab0SGage Eads rte_stack_push(s, objs, bulk_sizes[sz]); 286cfe6fab0SGage Eads rte_stack_pop(s, objs, bulk_sizes[sz]); 287cfe6fab0SGage Eads } 288cfe6fab0SGage Eads 289cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 290cfe6fab0SGage Eads 291cfe6fab0SGage Eads double avg = ((double)(end - start) / 292cfe6fab0SGage Eads (iterations * bulk_sizes[sz])); 293cfe6fab0SGage Eads 294cfe6fab0SGage Eads printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", 295cfe6fab0SGage Eads bulk_sizes[sz], avg); 296cfe6fab0SGage Eads } 297cfe6fab0SGage Eads } 298cfe6fab0SGage Eads 299cfe6fab0SGage Eads static int 3000420378bSGage Eads __test_stack_perf(uint32_t flags) 301cfe6fab0SGage Eads { 302cfe6fab0SGage Eads struct lcore_pair cores; 303cfe6fab0SGage Eads struct rte_stack *s; 304cfe6fab0SGage Eads 305cfe6fab0SGage Eads rte_atomic32_init(&lcore_barrier); 306cfe6fab0SGage Eads 3070420378bSGage Eads s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags); 308cfe6fab0SGage Eads if (s == NULL) { 309cfe6fab0SGage Eads printf("[%s():%u] failed to create a stack\n", 310cfe6fab0SGage Eads __func__, __LINE__); 311cfe6fab0SGage Eads return -1; 312cfe6fab0SGage Eads } 313cfe6fab0SGage Eads 314cfe6fab0SGage Eads printf("### Testing single element push/pop ###\n"); 315cfe6fab0SGage Eads test_single_push_pop(s); 316cfe6fab0SGage Eads 317cfe6fab0SGage Eads printf("\n### Testing empty pop ###\n"); 318cfe6fab0SGage Eads test_empty_pop(s); 319cfe6fab0SGage Eads 320cfe6fab0SGage Eads printf("\n### Testing using a single lcore ###\n"); 321cfe6fab0SGage Eads test_bulk_push_pop(s); 322cfe6fab0SGage Eads 323cfe6fab0SGage Eads if (get_two_hyperthreads(&cores) == 0) { 324cfe6fab0SGage Eads printf("\n### Testing using two hyperthreads ###\n"); 325cfe6fab0SGage Eads run_on_core_pair(&cores, s, bulk_push_pop); 326cfe6fab0SGage Eads } 327cfe6fab0SGage Eads if (get_two_cores(&cores) == 0) { 328cfe6fab0SGage Eads printf("\n### Testing using two physical cores ###\n"); 329cfe6fab0SGage Eads run_on_core_pair(&cores, s, bulk_push_pop); 330cfe6fab0SGage Eads } 331cfe6fab0SGage Eads if (get_two_sockets(&cores) == 0) { 332cfe6fab0SGage Eads printf("\n### Testing using two NUMA nodes ###\n"); 333cfe6fab0SGage Eads run_on_core_pair(&cores, s, bulk_push_pop); 334cfe6fab0SGage Eads } 335cfe6fab0SGage Eads 336cfe6fab0SGage Eads printf("\n### Testing on all %u lcores ###\n", rte_lcore_count()); 337cfe6fab0SGage Eads run_on_n_cores(s, bulk_push_pop, rte_lcore_count()); 338cfe6fab0SGage Eads 339cfe6fab0SGage Eads rte_stack_free(s); 340cfe6fab0SGage Eads return 0; 341cfe6fab0SGage Eads } 342cfe6fab0SGage Eads 3430420378bSGage Eads static int 3440420378bSGage Eads test_stack_perf(void) 3450420378bSGage Eads { 3460420378bSGage Eads return __test_stack_perf(0); 3470420378bSGage Eads } 3480420378bSGage Eads 3490420378bSGage Eads static int 3500420378bSGage Eads test_lf_stack_perf(void) 3510420378bSGage Eads { 3520420378bSGage Eads return __test_stack_perf(RTE_STACK_F_LF); 3530420378bSGage Eads } 3540420378bSGage Eads 355cfe6fab0SGage Eads REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf); 3560420378bSGage Eads REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf); 357