/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */


#include <stdio.h>
#include <inttypes.h>

#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
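/* Room for one full burst per possible lcore. */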
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

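/*
 * Countdown barrier: armed with the number of participating lcores, then
 * decremented by each one in bulk_push_pop() so the timed loops start
 * together.
 */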
static RTE_ATOMIC(uint32_t) lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

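/*
 * Find two lcores that are hyperthread siblings (same physical core on the
 * same socket). Returns 0 and fills *lcp on success, 1 if no pair exists.
 */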
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

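/*
 * Find two lcores on distinct physical cores of the same socket. Returns 0
 * and fills *lcp on success, 1 if no pair exists.
 */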
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

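/*
 * Find two lcores on different sockets (NUMA nodes). Returns 0 and fills
 * *lcp on success, 1 if no pair exists.
 */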
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}

struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

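	/*
	 * Check in and wait until all participants have arrived, so every
	 * lcore enters its timed loop at (roughly) the same time.
	 */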
	rte_atomic_fetch_sub_explicit(&lcore_barrier, 1, rte_memory_order_relaxed);
	rte_wait_until_equal_32((uint32_t *)(uintptr_t)&lcore_barrier, 0, rte_memory_order_relaxed);

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * perf between hyperthread siblings, between cores on the same socket, and
 * between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		rte_atomic_store_explicit(&lcore_barrier, 2, rte_memory_order_relaxed);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

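		/*
		 * rte_eal_remote_launch() only targets worker lcores, so if
		 * the main lcore is one of the pair, run fn() on it directly.
		 */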
		if (cores->c1 == rte_get_main_lcore()) {
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

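		/* Arm the barrier with the number of participating lcores. */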
		rte_atomic_store_explicit(&lcore_barrier, n, rte_memory_order_relaxed);

		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}

static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic_store_explicit(&lcore_barrier, 0, rte_memory_order_relaxed);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
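	/* Lock-free stack support is platform-dependent; skip if unavailable. */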
#if defined(RTE_STACK_LF_SUPPORTED)
	return __test_stack_perf(RTE_STACK_F_LF);
#else
	return TEST_SKIPPED;
#endif
}

REGISTER_PERF_TEST(stack_perf_autotest, test_stack_perf);
REGISTER_PERF_TEST(stack_lf_perf_autotest, test_lf_stack_perf);
359