xref: /dpdk/app/test/test_stack_perf.c (revision 8ada5b15a94e1b356f45d4232a2f8fddca234044)
1cfe6fab0SGage Eads /* SPDX-License-Identifier: BSD-3-Clause
2cfe6fab0SGage Eads  * Copyright(c) 2019 Intel Corporation
3cfe6fab0SGage Eads  */
4cfe6fab0SGage Eads 
5cfe6fab0SGage Eads 
6cfe6fab0SGage Eads #include <stdio.h>
7cfe6fab0SGage Eads #include <inttypes.h>
8cfe6fab0SGage Eads 
9cfe6fab0SGage Eads #include <rte_atomic.h>
10cfe6fab0SGage Eads #include <rte_cycles.h>
11cfe6fab0SGage Eads #include <rte_launch.h>
12cfe6fab0SGage Eads #include <rte_pause.h>
13cfe6fab0SGage Eads #include <rte_stack.h>
14cfe6fab0SGage Eads 
15cfe6fab0SGage Eads #include "test.h"
16cfe6fab0SGage Eads 
/* Name used to register/look up the perf-test stack. */
#define STACK_NAME "STACK_PERF"
/* Largest burst size tested; also sizes the per-thread object arrays. */
#define MAX_BURST 32
/* Stack capacity: one max-size burst per possible lcore. */
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

/* Countdown barrier: each worker decrements it, then spins until it hits 0
 * so all participants start their timed loop together. */
static rte_atomic32_t lcore_barrier;

/* A pair of lcore IDs selected for a two-core measurement. */
struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};
33cfe6fab0SGage Eads 
34cfe6fab0SGage Eads static int
35cfe6fab0SGage Eads get_two_hyperthreads(struct lcore_pair *lcp)
36cfe6fab0SGage Eads {
37cfe6fab0SGage Eads 	unsigned int socket[2];
38cfe6fab0SGage Eads 	unsigned int core[2];
39cfe6fab0SGage Eads 	unsigned int id[2];
40cfe6fab0SGage Eads 
41cfe6fab0SGage Eads 	RTE_LCORE_FOREACH(id[0]) {
42cfe6fab0SGage Eads 		RTE_LCORE_FOREACH(id[1]) {
43cfe6fab0SGage Eads 			if (id[0] == id[1])
44cfe6fab0SGage Eads 				continue;
45de307f7aSStephen Hemminger 			core[0] = rte_lcore_to_cpu_id(id[0]);
46de307f7aSStephen Hemminger 			core[1] = rte_lcore_to_cpu_id(id[1]);
47de307f7aSStephen Hemminger 			socket[0] = rte_lcore_to_socket_id(id[0]);
48de307f7aSStephen Hemminger 			socket[1] = rte_lcore_to_socket_id(id[1]);
49cfe6fab0SGage Eads 			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
50cfe6fab0SGage Eads 				lcp->c1 = id[0];
51cfe6fab0SGage Eads 				lcp->c2 = id[1];
52cfe6fab0SGage Eads 				return 0;
53cfe6fab0SGage Eads 			}
54cfe6fab0SGage Eads 		}
55cfe6fab0SGage Eads 	}
56cfe6fab0SGage Eads 
57cfe6fab0SGage Eads 	return 1;
58cfe6fab0SGage Eads }
59cfe6fab0SGage Eads 
60cfe6fab0SGage Eads static int
61cfe6fab0SGage Eads get_two_cores(struct lcore_pair *lcp)
62cfe6fab0SGage Eads {
63cfe6fab0SGage Eads 	unsigned int socket[2];
64cfe6fab0SGage Eads 	unsigned int core[2];
65cfe6fab0SGage Eads 	unsigned int id[2];
66cfe6fab0SGage Eads 
67cfe6fab0SGage Eads 	RTE_LCORE_FOREACH(id[0]) {
68cfe6fab0SGage Eads 		RTE_LCORE_FOREACH(id[1]) {
69cfe6fab0SGage Eads 			if (id[0] == id[1])
70cfe6fab0SGage Eads 				continue;
71de307f7aSStephen Hemminger 			core[0] = rte_lcore_to_cpu_id(id[0]);
72de307f7aSStephen Hemminger 			core[1] = rte_lcore_to_cpu_id(id[1]);
73de307f7aSStephen Hemminger 			socket[0] = rte_lcore_to_socket_id(id[0]);
74de307f7aSStephen Hemminger 			socket[1] = rte_lcore_to_socket_id(id[1]);
75cfe6fab0SGage Eads 			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
76cfe6fab0SGage Eads 				lcp->c1 = id[0];
77cfe6fab0SGage Eads 				lcp->c2 = id[1];
78cfe6fab0SGage Eads 				return 0;
79cfe6fab0SGage Eads 			}
80cfe6fab0SGage Eads 		}
81cfe6fab0SGage Eads 	}
82cfe6fab0SGage Eads 
83cfe6fab0SGage Eads 	return 1;
84cfe6fab0SGage Eads }
85cfe6fab0SGage Eads 
86cfe6fab0SGage Eads static int
87cfe6fab0SGage Eads get_two_sockets(struct lcore_pair *lcp)
88cfe6fab0SGage Eads {
89cfe6fab0SGage Eads 	unsigned int socket[2];
90cfe6fab0SGage Eads 	unsigned int id[2];
91cfe6fab0SGage Eads 
92cfe6fab0SGage Eads 	RTE_LCORE_FOREACH(id[0]) {
93cfe6fab0SGage Eads 		RTE_LCORE_FOREACH(id[1]) {
94cfe6fab0SGage Eads 			if (id[0] == id[1])
95cfe6fab0SGage Eads 				continue;
96de307f7aSStephen Hemminger 			socket[0] = rte_lcore_to_socket_id(id[0]);
97de307f7aSStephen Hemminger 			socket[1] = rte_lcore_to_socket_id(id[1]);
98cfe6fab0SGage Eads 			if (socket[0] != socket[1]) {
99cfe6fab0SGage Eads 				lcp->c1 = id[0];
100cfe6fab0SGage Eads 				lcp->c2 = id[1];
101cfe6fab0SGage Eads 				return 0;
102cfe6fab0SGage Eads 			}
103cfe6fab0SGage Eads 		}
104cfe6fab0SGage Eads 	}
105cfe6fab0SGage Eads 
106cfe6fab0SGage Eads 	return 1;
107cfe6fab0SGage Eads }
108cfe6fab0SGage Eads 
109cfe6fab0SGage Eads /* Measure the cycle cost of popping an empty stack. */
110cfe6fab0SGage Eads static void
111cfe6fab0SGage Eads test_empty_pop(struct rte_stack *s)
112cfe6fab0SGage Eads {
113cfe6fab0SGage Eads 	unsigned int iterations = 100000000;
114cfe6fab0SGage Eads 	void *objs[MAX_BURST];
115cfe6fab0SGage Eads 	unsigned int i;
116cfe6fab0SGage Eads 
117cfe6fab0SGage Eads 	uint64_t start = rte_rdtsc();
118cfe6fab0SGage Eads 
119cfe6fab0SGage Eads 	for (i = 0; i < iterations; i++)
120cfe6fab0SGage Eads 		rte_stack_pop(s, objs, bulk_sizes[0]);
121cfe6fab0SGage Eads 
122cfe6fab0SGage Eads 	uint64_t end = rte_rdtsc();
123cfe6fab0SGage Eads 
124cfe6fab0SGage Eads 	printf("Stack empty pop: %.2F\n",
125cfe6fab0SGage Eads 	       (double)(end - start) / iterations);
126cfe6fab0SGage Eads }
127cfe6fab0SGage Eads 
/* Per-worker parameters and result slot for bulk_push_pop(). */
struct thread_args {
	struct rte_stack *s;	/* stack under test (shared by workers) */
	unsigned int sz;	/* burst size for each push/pop */
	double avg;		/* out: average cycles per object */
};
133cfe6fab0SGage Eads 
134cfe6fab0SGage Eads /* Measure the average per-pointer cycle cost of stack push and pop */
135cfe6fab0SGage Eads static int
136cfe6fab0SGage Eads bulk_push_pop(void *p)
137cfe6fab0SGage Eads {
138cfe6fab0SGage Eads 	unsigned int iterations = 1000000;
139cfe6fab0SGage Eads 	struct thread_args *args = p;
140cfe6fab0SGage Eads 	void *objs[MAX_BURST] = {0};
141cfe6fab0SGage Eads 	unsigned int size, i;
142cfe6fab0SGage Eads 	struct rte_stack *s;
143cfe6fab0SGage Eads 
144cfe6fab0SGage Eads 	s = args->s;
145cfe6fab0SGage Eads 	size = args->sz;
146cfe6fab0SGage Eads 
147cfe6fab0SGage Eads 	rte_atomic32_sub(&lcore_barrier, 1);
148cfe6fab0SGage Eads 	while (rte_atomic32_read(&lcore_barrier) != 0)
149cfe6fab0SGage Eads 		rte_pause();
150cfe6fab0SGage Eads 
151cfe6fab0SGage Eads 	uint64_t start = rte_rdtsc();
152cfe6fab0SGage Eads 
153cfe6fab0SGage Eads 	for (i = 0; i < iterations; i++) {
154cfe6fab0SGage Eads 		rte_stack_push(s, objs, size);
155cfe6fab0SGage Eads 		rte_stack_pop(s, objs, size);
156cfe6fab0SGage Eads 	}
157cfe6fab0SGage Eads 
158cfe6fab0SGage Eads 	uint64_t end = rte_rdtsc();
159cfe6fab0SGage Eads 
160cfe6fab0SGage Eads 	args->avg = ((double)(end - start))/(iterations * size);
161cfe6fab0SGage Eads 
162cfe6fab0SGage Eads 	return 0;
163cfe6fab0SGage Eads }
164cfe6fab0SGage Eads 
165cfe6fab0SGage Eads /*
166cfe6fab0SGage Eads  * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
167cfe6fab0SGage Eads  * perf when between hyperthread siblings, cores on the same socket, and cores
168cfe6fab0SGage Eads  * on different sockets.
169cfe6fab0SGage Eads  */
170cfe6fab0SGage Eads static void
171cfe6fab0SGage Eads run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
172cfe6fab0SGage Eads 		 lcore_function_t fn)
173cfe6fab0SGage Eads {
174cfe6fab0SGage Eads 	struct thread_args args[2];
175cfe6fab0SGage Eads 	unsigned int i;
176cfe6fab0SGage Eads 
177*8ada5b15SPavan Nikhilesh 	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
178cfe6fab0SGage Eads 		rte_atomic32_set(&lcore_barrier, 2);
179cfe6fab0SGage Eads 
180cfe6fab0SGage Eads 		args[0].sz = args[1].sz = bulk_sizes[i];
181cfe6fab0SGage Eads 		args[0].s = args[1].s = s;
182cfe6fab0SGage Eads 
183cfe6fab0SGage Eads 		if (cores->c1 == rte_get_master_lcore()) {
184cfe6fab0SGage Eads 			rte_eal_remote_launch(fn, &args[1], cores->c2);
185cfe6fab0SGage Eads 			fn(&args[0]);
186cfe6fab0SGage Eads 			rte_eal_wait_lcore(cores->c2);
187cfe6fab0SGage Eads 		} else {
188cfe6fab0SGage Eads 			rte_eal_remote_launch(fn, &args[0], cores->c1);
189cfe6fab0SGage Eads 			rte_eal_remote_launch(fn, &args[1], cores->c2);
190cfe6fab0SGage Eads 			rte_eal_wait_lcore(cores->c1);
191cfe6fab0SGage Eads 			rte_eal_wait_lcore(cores->c2);
192cfe6fab0SGage Eads 		}
193cfe6fab0SGage Eads 
194cfe6fab0SGage Eads 		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
195cfe6fab0SGage Eads 		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
196cfe6fab0SGage Eads 	}
197cfe6fab0SGage Eads }
198cfe6fab0SGage Eads 
199cfe6fab0SGage Eads /* Run bulk_push_pop() simultaneously on 1+ cores. */
200cfe6fab0SGage Eads static void
201cfe6fab0SGage Eads run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
202cfe6fab0SGage Eads {
203cfe6fab0SGage Eads 	struct thread_args args[RTE_MAX_LCORE];
204cfe6fab0SGage Eads 	unsigned int i;
205cfe6fab0SGage Eads 
206*8ada5b15SPavan Nikhilesh 	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
207cfe6fab0SGage Eads 		unsigned int lcore_id;
208cfe6fab0SGage Eads 		int cnt = 0;
209cfe6fab0SGage Eads 		double avg;
210cfe6fab0SGage Eads 
211cfe6fab0SGage Eads 		rte_atomic32_set(&lcore_barrier, n);
212cfe6fab0SGage Eads 
213cfe6fab0SGage Eads 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
214cfe6fab0SGage Eads 			if (++cnt >= n)
215cfe6fab0SGage Eads 				break;
216cfe6fab0SGage Eads 
217cfe6fab0SGage Eads 			args[lcore_id].s = s;
218cfe6fab0SGage Eads 			args[lcore_id].sz = bulk_sizes[i];
219cfe6fab0SGage Eads 
220cfe6fab0SGage Eads 			if (rte_eal_remote_launch(fn, &args[lcore_id],
221cfe6fab0SGage Eads 						  lcore_id))
222cfe6fab0SGage Eads 				rte_panic("Failed to launch lcore %d\n",
223cfe6fab0SGage Eads 					  lcore_id);
224cfe6fab0SGage Eads 		}
225cfe6fab0SGage Eads 
226cfe6fab0SGage Eads 		lcore_id = rte_lcore_id();
227cfe6fab0SGage Eads 
228cfe6fab0SGage Eads 		args[lcore_id].s = s;
229cfe6fab0SGage Eads 		args[lcore_id].sz = bulk_sizes[i];
230cfe6fab0SGage Eads 
231cfe6fab0SGage Eads 		fn(&args[lcore_id]);
232cfe6fab0SGage Eads 
233cfe6fab0SGage Eads 		rte_eal_mp_wait_lcore();
234cfe6fab0SGage Eads 
235cfe6fab0SGage Eads 		avg = args[rte_lcore_id()].avg;
236cfe6fab0SGage Eads 
237cfe6fab0SGage Eads 		cnt = 0;
238cfe6fab0SGage Eads 		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
239cfe6fab0SGage Eads 			if (++cnt >= n)
240cfe6fab0SGage Eads 				break;
241cfe6fab0SGage Eads 			avg += args[lcore_id].avg;
242cfe6fab0SGage Eads 		}
243cfe6fab0SGage Eads 
244cfe6fab0SGage Eads 		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
245cfe6fab0SGage Eads 		       bulk_sizes[i], avg / n);
246cfe6fab0SGage Eads 	}
247cfe6fab0SGage Eads }
248cfe6fab0SGage Eads 
/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	const unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int n;
	uint64_t start, end;

	start = rte_rdtsc();
	for (n = 0; n < iterations; n++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}
	end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       (double)(end - start) / iterations);
}
272cfe6fab0SGage Eads 
273cfe6fab0SGage Eads /* Measure the cycle cost of bulk pushing and popping on a single lcore. */
274cfe6fab0SGage Eads static void
275cfe6fab0SGage Eads test_bulk_push_pop(struct rte_stack *s)
276cfe6fab0SGage Eads {
277cfe6fab0SGage Eads 	unsigned int iterations = 8000000;
278cfe6fab0SGage Eads 	void *objs[MAX_BURST];
279cfe6fab0SGage Eads 	unsigned int sz, i;
280cfe6fab0SGage Eads 
281*8ada5b15SPavan Nikhilesh 	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
282cfe6fab0SGage Eads 		uint64_t start = rte_rdtsc();
283cfe6fab0SGage Eads 
284cfe6fab0SGage Eads 		for (i = 0; i < iterations; i++) {
285cfe6fab0SGage Eads 			rte_stack_push(s, objs, bulk_sizes[sz]);
286cfe6fab0SGage Eads 			rte_stack_pop(s, objs, bulk_sizes[sz]);
287cfe6fab0SGage Eads 		}
288cfe6fab0SGage Eads 
289cfe6fab0SGage Eads 		uint64_t end = rte_rdtsc();
290cfe6fab0SGage Eads 
291cfe6fab0SGage Eads 		double avg = ((double)(end - start) /
292cfe6fab0SGage Eads 			      (iterations * bulk_sizes[sz]));
293cfe6fab0SGage Eads 
294cfe6fab0SGage Eads 		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
295cfe6fab0SGage Eads 		       bulk_sizes[sz], avg);
296cfe6fab0SGage Eads 	}
297cfe6fab0SGage Eads }
298cfe6fab0SGage Eads 
299cfe6fab0SGage Eads static int
3000420378bSGage Eads __test_stack_perf(uint32_t flags)
301cfe6fab0SGage Eads {
302cfe6fab0SGage Eads 	struct lcore_pair cores;
303cfe6fab0SGage Eads 	struct rte_stack *s;
304cfe6fab0SGage Eads 
305cfe6fab0SGage Eads 	rte_atomic32_init(&lcore_barrier);
306cfe6fab0SGage Eads 
3070420378bSGage Eads 	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
308cfe6fab0SGage Eads 	if (s == NULL) {
309cfe6fab0SGage Eads 		printf("[%s():%u] failed to create a stack\n",
310cfe6fab0SGage Eads 		       __func__, __LINE__);
311cfe6fab0SGage Eads 		return -1;
312cfe6fab0SGage Eads 	}
313cfe6fab0SGage Eads 
314cfe6fab0SGage Eads 	printf("### Testing single element push/pop ###\n");
315cfe6fab0SGage Eads 	test_single_push_pop(s);
316cfe6fab0SGage Eads 
317cfe6fab0SGage Eads 	printf("\n### Testing empty pop ###\n");
318cfe6fab0SGage Eads 	test_empty_pop(s);
319cfe6fab0SGage Eads 
320cfe6fab0SGage Eads 	printf("\n### Testing using a single lcore ###\n");
321cfe6fab0SGage Eads 	test_bulk_push_pop(s);
322cfe6fab0SGage Eads 
323cfe6fab0SGage Eads 	if (get_two_hyperthreads(&cores) == 0) {
324cfe6fab0SGage Eads 		printf("\n### Testing using two hyperthreads ###\n");
325cfe6fab0SGage Eads 		run_on_core_pair(&cores, s, bulk_push_pop);
326cfe6fab0SGage Eads 	}
327cfe6fab0SGage Eads 	if (get_two_cores(&cores) == 0) {
328cfe6fab0SGage Eads 		printf("\n### Testing using two physical cores ###\n");
329cfe6fab0SGage Eads 		run_on_core_pair(&cores, s, bulk_push_pop);
330cfe6fab0SGage Eads 	}
331cfe6fab0SGage Eads 	if (get_two_sockets(&cores) == 0) {
332cfe6fab0SGage Eads 		printf("\n### Testing using two NUMA nodes ###\n");
333cfe6fab0SGage Eads 		run_on_core_pair(&cores, s, bulk_push_pop);
334cfe6fab0SGage Eads 	}
335cfe6fab0SGage Eads 
336cfe6fab0SGage Eads 	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
337cfe6fab0SGage Eads 	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());
338cfe6fab0SGage Eads 
339cfe6fab0SGage Eads 	rte_stack_free(s);
340cfe6fab0SGage Eads 	return 0;
341cfe6fab0SGage Eads }
342cfe6fab0SGage Eads 
/* Entry point for the standard (lock-based) stack perf test. */
static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}
3480420378bSGage Eads 
/* Entry point for the lock-free stack perf test. */
static int
test_lf_stack_perf(void)
{
	return __test_stack_perf(RTE_STACK_F_LF);
}
3540420378bSGage Eads 
/* Register both variants with the DPDK test harness. */
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);
357