1cfe6fab0SGage Eads /* SPDX-License-Identifier: BSD-3-Clause 2cfe6fab0SGage Eads * Copyright(c) 2019 Intel Corporation 3cfe6fab0SGage Eads */ 4cfe6fab0SGage Eads 5cfe6fab0SGage Eads 6cfe6fab0SGage Eads #include <stdio.h> 7cfe6fab0SGage Eads #include <inttypes.h> 8cfe6fab0SGage Eads 9cfe6fab0SGage Eads #include <rte_cycles.h> 10cfe6fab0SGage Eads #include <rte_launch.h> 11cfe6fab0SGage Eads #include <rte_pause.h> 12cfe6fab0SGage Eads #include <rte_stack.h> 13cfe6fab0SGage Eads 14cfe6fab0SGage Eads #include "test.h" 15cfe6fab0SGage Eads 16cfe6fab0SGage Eads #define STACK_NAME "STACK_PERF" 17cfe6fab0SGage Eads #define MAX_BURST 32 18cfe6fab0SGage Eads #define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST) 19cfe6fab0SGage Eads 20cfe6fab0SGage Eads /* 21cfe6fab0SGage Eads * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time 22cfe6fab0SGage Eads * constants. 23cfe6fab0SGage Eads */ 24cfe6fab0SGage Eads static volatile unsigned int bulk_sizes[] = {8, MAX_BURST}; 25cfe6fab0SGage Eads 26e64f423bSJoyce Kong static uint32_t lcore_barrier; 27cfe6fab0SGage Eads 28cfe6fab0SGage Eads struct lcore_pair { 29cfe6fab0SGage Eads unsigned int c1; 30cfe6fab0SGage Eads unsigned int c2; 31cfe6fab0SGage Eads }; 32cfe6fab0SGage Eads 33cfe6fab0SGage Eads static int 34cfe6fab0SGage Eads get_two_hyperthreads(struct lcore_pair *lcp) 35cfe6fab0SGage Eads { 36cfe6fab0SGage Eads unsigned int socket[2]; 37cfe6fab0SGage Eads unsigned int core[2]; 38cfe6fab0SGage Eads unsigned int id[2]; 39cfe6fab0SGage Eads 40cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[0]) { 41cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[1]) { 42cfe6fab0SGage Eads if (id[0] == id[1]) 43cfe6fab0SGage Eads continue; 44de307f7aSStephen Hemminger core[0] = rte_lcore_to_cpu_id(id[0]); 45de307f7aSStephen Hemminger core[1] = rte_lcore_to_cpu_id(id[1]); 46de307f7aSStephen Hemminger socket[0] = rte_lcore_to_socket_id(id[0]); 47de307f7aSStephen Hemminger socket[1] = rte_lcore_to_socket_id(id[1]); 48cfe6fab0SGage Eads if ((core[0] == core[1]) && (socket[0] == socket[1])) { 49cfe6fab0SGage Eads lcp->c1 = id[0]; 50cfe6fab0SGage Eads lcp->c2 = id[1]; 51cfe6fab0SGage Eads return 0; 52cfe6fab0SGage Eads } 53cfe6fab0SGage Eads } 54cfe6fab0SGage Eads } 55cfe6fab0SGage Eads 56cfe6fab0SGage Eads return 1; 57cfe6fab0SGage Eads } 58cfe6fab0SGage Eads 59cfe6fab0SGage Eads static int 60cfe6fab0SGage Eads get_two_cores(struct lcore_pair *lcp) 61cfe6fab0SGage Eads { 62cfe6fab0SGage Eads unsigned int socket[2]; 63cfe6fab0SGage Eads unsigned int core[2]; 64cfe6fab0SGage Eads unsigned int id[2]; 65cfe6fab0SGage Eads 66cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[0]) { 67cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[1]) { 68cfe6fab0SGage Eads if (id[0] == id[1]) 69cfe6fab0SGage Eads continue; 70de307f7aSStephen Hemminger core[0] = rte_lcore_to_cpu_id(id[0]); 71de307f7aSStephen Hemminger core[1] = rte_lcore_to_cpu_id(id[1]); 72de307f7aSStephen Hemminger socket[0] = rte_lcore_to_socket_id(id[0]); 73de307f7aSStephen Hemminger socket[1] = rte_lcore_to_socket_id(id[1]); 74cfe6fab0SGage Eads if ((core[0] != core[1]) && (socket[0] == socket[1])) { 75cfe6fab0SGage Eads lcp->c1 = id[0]; 76cfe6fab0SGage Eads lcp->c2 = id[1]; 77cfe6fab0SGage Eads return 0; 78cfe6fab0SGage Eads } 79cfe6fab0SGage Eads } 80cfe6fab0SGage Eads } 81cfe6fab0SGage Eads 82cfe6fab0SGage Eads return 1; 83cfe6fab0SGage Eads } 84cfe6fab0SGage Eads 85cfe6fab0SGage Eads static int 86cfe6fab0SGage Eads get_two_sockets(struct lcore_pair *lcp) 87cfe6fab0SGage Eads { 88cfe6fab0SGage Eads unsigned int socket[2]; 89cfe6fab0SGage Eads unsigned int id[2]; 90cfe6fab0SGage Eads 91cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[0]) { 92cfe6fab0SGage Eads RTE_LCORE_FOREACH(id[1]) { 93cfe6fab0SGage Eads if (id[0] == id[1]) 94cfe6fab0SGage Eads continue; 95de307f7aSStephen Hemminger socket[0] = rte_lcore_to_socket_id(id[0]); 96de307f7aSStephen Hemminger socket[1] = rte_lcore_to_socket_id(id[1]); 97cfe6fab0SGage Eads if (socket[0] != socket[1]) { 98cfe6fab0SGage Eads lcp->c1 = id[0]; 99cfe6fab0SGage Eads lcp->c2 = id[1]; 100cfe6fab0SGage Eads return 0; 101cfe6fab0SGage Eads } 102cfe6fab0SGage Eads } 103cfe6fab0SGage Eads } 104cfe6fab0SGage Eads 105cfe6fab0SGage Eads return 1; 106cfe6fab0SGage Eads } 107cfe6fab0SGage Eads 108cfe6fab0SGage Eads /* Measure the cycle cost of popping an empty stack. */ 109cfe6fab0SGage Eads static void 110cfe6fab0SGage Eads test_empty_pop(struct rte_stack *s) 111cfe6fab0SGage Eads { 112cfe6fab0SGage Eads unsigned int iterations = 100000000; 113cfe6fab0SGage Eads void *objs[MAX_BURST]; 114cfe6fab0SGage Eads unsigned int i; 115cfe6fab0SGage Eads 116cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 117cfe6fab0SGage Eads 118cfe6fab0SGage Eads for (i = 0; i < iterations; i++) 119cfe6fab0SGage Eads rte_stack_pop(s, objs, bulk_sizes[0]); 120cfe6fab0SGage Eads 121cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 122cfe6fab0SGage Eads 123cfe6fab0SGage Eads printf("Stack empty pop: %.2F\n", 124cfe6fab0SGage Eads (double)(end - start) / iterations); 125cfe6fab0SGage Eads } 126cfe6fab0SGage Eads 127cfe6fab0SGage Eads struct thread_args { 128cfe6fab0SGage Eads struct rte_stack *s; 129cfe6fab0SGage Eads unsigned int sz; 130cfe6fab0SGage Eads double avg; 131cfe6fab0SGage Eads }; 132cfe6fab0SGage Eads 133cfe6fab0SGage Eads /* Measure the average per-pointer cycle cost of stack push and pop */ 134cfe6fab0SGage Eads static int 135cfe6fab0SGage Eads bulk_push_pop(void *p) 136cfe6fab0SGage Eads { 137cfe6fab0SGage Eads unsigned int iterations = 1000000; 138cfe6fab0SGage Eads struct thread_args *args = p; 139cfe6fab0SGage Eads void *objs[MAX_BURST] = {0}; 140cfe6fab0SGage Eads unsigned int size, i; 141cfe6fab0SGage Eads struct rte_stack *s; 142cfe6fab0SGage Eads 143cfe6fab0SGage Eads s = args->s; 144cfe6fab0SGage Eads size = args->sz; 145cfe6fab0SGage Eads 146e64f423bSJoyce Kong __atomic_fetch_sub(&lcore_barrier, 1, __ATOMIC_RELAXED); 147e64f423bSJoyce Kong rte_wait_until_equal_32(&lcore_barrier, 0, __ATOMIC_RELAXED); 148cfe6fab0SGage Eads 149cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 150cfe6fab0SGage Eads 151cfe6fab0SGage Eads for (i = 0; i < iterations; i++) { 152cfe6fab0SGage Eads rte_stack_push(s, objs, size); 153cfe6fab0SGage Eads rte_stack_pop(s, objs, size); 154cfe6fab0SGage Eads } 155cfe6fab0SGage Eads 156cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 157cfe6fab0SGage Eads 158cfe6fab0SGage Eads args->avg = ((double)(end - start))/(iterations * size); 159cfe6fab0SGage Eads 160cfe6fab0SGage Eads return 0; 161cfe6fab0SGage Eads } 162cfe6fab0SGage Eads 163cfe6fab0SGage Eads /* 164cfe6fab0SGage Eads * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack 165cfe6fab0SGage Eads * perf when between hyperthread siblings, cores on the same socket, and cores 166cfe6fab0SGage Eads * on different sockets. 167cfe6fab0SGage Eads */ 168cfe6fab0SGage Eads static void 169cfe6fab0SGage Eads run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s, 170cfe6fab0SGage Eads lcore_function_t fn) 171cfe6fab0SGage Eads { 172cfe6fab0SGage Eads struct thread_args args[2]; 173cfe6fab0SGage Eads unsigned int i; 174cfe6fab0SGage Eads 1758ada5b15SPavan Nikhilesh for (i = 0; i < RTE_DIM(bulk_sizes); i++) { 176e64f423bSJoyce Kong __atomic_store_n(&lcore_barrier, 2, __ATOMIC_RELAXED); 177cfe6fab0SGage Eads 178cfe6fab0SGage Eads args[0].sz = args[1].sz = bulk_sizes[i]; 179cfe6fab0SGage Eads args[0].s = args[1].s = s; 180cfe6fab0SGage Eads 181cb056611SStephen Hemminger if (cores->c1 == rte_get_main_lcore()) { 182cfe6fab0SGage Eads rte_eal_remote_launch(fn, &args[1], cores->c2); 183cfe6fab0SGage Eads fn(&args[0]); 184cfe6fab0SGage Eads rte_eal_wait_lcore(cores->c2); 185cfe6fab0SGage Eads } else { 186cfe6fab0SGage Eads rte_eal_remote_launch(fn, &args[0], cores->c1); 187cfe6fab0SGage Eads rte_eal_remote_launch(fn, &args[1], cores->c2); 188cfe6fab0SGage Eads rte_eal_wait_lcore(cores->c1); 189cfe6fab0SGage Eads rte_eal_wait_lcore(cores->c2); 190cfe6fab0SGage Eads } 191cfe6fab0SGage Eads 192cfe6fab0SGage Eads printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", 193cfe6fab0SGage Eads bulk_sizes[i], (args[0].avg + args[1].avg) / 2); 194cfe6fab0SGage Eads } 195cfe6fab0SGage Eads } 196cfe6fab0SGage Eads 197cfe6fab0SGage Eads /* Run bulk_push_pop() simultaneously on 1+ cores. */ 198cfe6fab0SGage Eads static void 199cfe6fab0SGage Eads run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n) 200cfe6fab0SGage Eads { 201cfe6fab0SGage Eads struct thread_args args[RTE_MAX_LCORE]; 202cfe6fab0SGage Eads unsigned int i; 203cfe6fab0SGage Eads 2048ada5b15SPavan Nikhilesh for (i = 0; i < RTE_DIM(bulk_sizes); i++) { 205cfe6fab0SGage Eads unsigned int lcore_id; 206cfe6fab0SGage Eads int cnt = 0; 207cfe6fab0SGage Eads double avg; 208cfe6fab0SGage Eads 209e64f423bSJoyce Kong __atomic_store_n(&lcore_barrier, n, __ATOMIC_RELAXED); 210cfe6fab0SGage Eads 211cb056611SStephen Hemminger RTE_LCORE_FOREACH_WORKER(lcore_id) { 212cfe6fab0SGage Eads if (++cnt >= n) 213cfe6fab0SGage Eads break; 214cfe6fab0SGage Eads 215cfe6fab0SGage Eads args[lcore_id].s = s; 216cfe6fab0SGage Eads args[lcore_id].sz = bulk_sizes[i]; 217cfe6fab0SGage Eads 218cfe6fab0SGage Eads if (rte_eal_remote_launch(fn, &args[lcore_id], 219cfe6fab0SGage Eads lcore_id)) 220cfe6fab0SGage Eads rte_panic("Failed to launch lcore %d\n", 221cfe6fab0SGage Eads lcore_id); 222cfe6fab0SGage Eads } 223cfe6fab0SGage Eads 224cfe6fab0SGage Eads lcore_id = rte_lcore_id(); 225cfe6fab0SGage Eads 226cfe6fab0SGage Eads args[lcore_id].s = s; 227cfe6fab0SGage Eads args[lcore_id].sz = bulk_sizes[i]; 228cfe6fab0SGage Eads 229cfe6fab0SGage Eads fn(&args[lcore_id]); 230cfe6fab0SGage Eads 231cfe6fab0SGage Eads rte_eal_mp_wait_lcore(); 232cfe6fab0SGage Eads 233cfe6fab0SGage Eads avg = args[rte_lcore_id()].avg; 234cfe6fab0SGage Eads 235cfe6fab0SGage Eads cnt = 0; 236cb056611SStephen Hemminger RTE_LCORE_FOREACH_WORKER(lcore_id) { 237cfe6fab0SGage Eads if (++cnt >= n) 238cfe6fab0SGage Eads break; 239cfe6fab0SGage Eads avg += args[lcore_id].avg; 240cfe6fab0SGage Eads } 241cfe6fab0SGage Eads 242cfe6fab0SGage Eads printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", 243cfe6fab0SGage Eads bulk_sizes[i], avg / n); 244cfe6fab0SGage Eads } 245cfe6fab0SGage Eads } 246cfe6fab0SGage Eads 247cfe6fab0SGage Eads /* 248cfe6fab0SGage Eads * Measure the cycle cost of pushing and popping a single pointer on a single 249cfe6fab0SGage Eads * lcore. 250cfe6fab0SGage Eads */ 251cfe6fab0SGage Eads static void 252cfe6fab0SGage Eads test_single_push_pop(struct rte_stack *s) 253cfe6fab0SGage Eads { 254cfe6fab0SGage Eads unsigned int iterations = 16000000; 255cfe6fab0SGage Eads void *obj = NULL; 256cfe6fab0SGage Eads unsigned int i; 257cfe6fab0SGage Eads 258cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 259cfe6fab0SGage Eads 260cfe6fab0SGage Eads for (i = 0; i < iterations; i++) { 261cfe6fab0SGage Eads rte_stack_push(s, &obj, 1); 262cfe6fab0SGage Eads rte_stack_pop(s, &obj, 1); 263cfe6fab0SGage Eads } 264cfe6fab0SGage Eads 265cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 266cfe6fab0SGage Eads 267cfe6fab0SGage Eads printf("Average cycles per single object push/pop: %.2F\n", 268cfe6fab0SGage Eads ((double)(end - start)) / iterations); 269cfe6fab0SGage Eads } 270cfe6fab0SGage Eads 271cfe6fab0SGage Eads /* Measure the cycle cost of bulk pushing and popping on a single lcore. */ 272cfe6fab0SGage Eads static void 273cfe6fab0SGage Eads test_bulk_push_pop(struct rte_stack *s) 274cfe6fab0SGage Eads { 275cfe6fab0SGage Eads unsigned int iterations = 8000000; 276cfe6fab0SGage Eads void *objs[MAX_BURST]; 277cfe6fab0SGage Eads unsigned int sz, i; 278cfe6fab0SGage Eads 2798ada5b15SPavan Nikhilesh for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) { 280cfe6fab0SGage Eads uint64_t start = rte_rdtsc(); 281cfe6fab0SGage Eads 282cfe6fab0SGage Eads for (i = 0; i < iterations; i++) { 283cfe6fab0SGage Eads rte_stack_push(s, objs, bulk_sizes[sz]); 284cfe6fab0SGage Eads rte_stack_pop(s, objs, bulk_sizes[sz]); 285cfe6fab0SGage Eads } 286cfe6fab0SGage Eads 287cfe6fab0SGage Eads uint64_t end = rte_rdtsc(); 288cfe6fab0SGage Eads 289cfe6fab0SGage Eads double avg = ((double)(end - start) / 290cfe6fab0SGage Eads (iterations * bulk_sizes[sz])); 291cfe6fab0SGage Eads 292cfe6fab0SGage Eads printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", 293cfe6fab0SGage Eads bulk_sizes[sz], avg); 294cfe6fab0SGage Eads } 295cfe6fab0SGage Eads } 296cfe6fab0SGage Eads 297cfe6fab0SGage Eads static int 2980420378bSGage Eads __test_stack_perf(uint32_t flags) 299cfe6fab0SGage Eads { 300cfe6fab0SGage Eads struct lcore_pair cores; 301cfe6fab0SGage Eads struct rte_stack *s; 302cfe6fab0SGage Eads 303e64f423bSJoyce Kong __atomic_store_n(&lcore_barrier, 0, __ATOMIC_RELAXED); 304cfe6fab0SGage Eads 3050420378bSGage Eads s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags); 306cfe6fab0SGage Eads if (s == NULL) { 307cfe6fab0SGage Eads printf("[%s():%u] failed to create a stack\n", 308cfe6fab0SGage Eads __func__, __LINE__); 309cfe6fab0SGage Eads return -1; 310cfe6fab0SGage Eads } 311cfe6fab0SGage Eads 312cfe6fab0SGage Eads printf("### Testing single element push/pop ###\n"); 313cfe6fab0SGage Eads test_single_push_pop(s); 314cfe6fab0SGage Eads 315cfe6fab0SGage Eads printf("\n### Testing empty pop ###\n"); 316cfe6fab0SGage Eads test_empty_pop(s); 317cfe6fab0SGage Eads 318cfe6fab0SGage Eads printf("\n### Testing using a single lcore ###\n"); 319cfe6fab0SGage Eads test_bulk_push_pop(s); 320cfe6fab0SGage Eads 321cfe6fab0SGage Eads if (get_two_hyperthreads(&cores) == 0) { 322cfe6fab0SGage Eads printf("\n### Testing using two hyperthreads ###\n"); 323cfe6fab0SGage Eads run_on_core_pair(&cores, s, bulk_push_pop); 324cfe6fab0SGage Eads } 325cfe6fab0SGage Eads if (get_two_cores(&cores) == 0) { 326cfe6fab0SGage Eads printf("\n### Testing using two physical cores ###\n"); 327cfe6fab0SGage Eads run_on_core_pair(&cores, s, bulk_push_pop); 328cfe6fab0SGage Eads } 329cfe6fab0SGage Eads if (get_two_sockets(&cores) == 0) { 330cfe6fab0SGage Eads printf("\n### Testing using two NUMA nodes ###\n"); 331cfe6fab0SGage Eads run_on_core_pair(&cores, s, bulk_push_pop); 332cfe6fab0SGage Eads } 333cfe6fab0SGage Eads 334cfe6fab0SGage Eads printf("\n### Testing on all %u lcores ###\n", rte_lcore_count()); 335cfe6fab0SGage Eads run_on_n_cores(s, bulk_push_pop, rte_lcore_count()); 336cfe6fab0SGage Eads 337cfe6fab0SGage Eads rte_stack_free(s); 338cfe6fab0SGage Eads return 0; 339cfe6fab0SGage Eads } 340cfe6fab0SGage Eads 3410420378bSGage Eads static int 3420420378bSGage Eads test_stack_perf(void) 3430420378bSGage Eads { 3440420378bSGage Eads return __test_stack_perf(0); 3450420378bSGage Eads } 3460420378bSGage Eads 3470420378bSGage Eads static int 3480420378bSGage Eads test_lf_stack_perf(void) 3490420378bSGage Eads { 3501abb185dSStanislaw Kardach #if defined(RTE_STACK_LF_SUPPORTED) 3510420378bSGage Eads return __test_stack_perf(RTE_STACK_F_LF); 3521abb185dSStanislaw Kardach #else 3531abb185dSStanislaw Kardach return TEST_SKIPPED; 3541abb185dSStanislaw Kardach #endif 3550420378bSGage Eads } 3560420378bSGage Eads 357*e0a8442cSBruce Richardson REGISTER_PERF_TEST(stack_perf_autotest, test_stack_perf); 358*e0a8442cSBruce Richardson REGISTER_PERF_TEST(stack_lf_perf_autotest, test_lf_stack_perf); 359