/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */


#include <stdio.h>
#include <inttypes.h>

#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

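/*
 * Start barrier: each measurement thread decrements it once and then spins
 * until it reaches zero, so all participants enter their timed loops together.
 */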
static RTE_ATOMIC(uint32_t) lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

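/*
 * Find two lcores that are hyperthread siblings: same physical core on the
 * same socket. Returns 0 on success, 1 if no such pair exists.
 */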
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

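/*
 * Find two lcores on distinct physical cores of the same socket. Returns 0 on
 * success, 1 if no such pair exists.
 */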
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

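/*
 * Find two lcores on different sockets (NUMA nodes). Returns 0 on success, 1
 * if no such pair exists.
 */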
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}

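/* Per-lcore arguments and result slot for bulk_push_pop(). */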
struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

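	/*
	 * Signal readiness and wait for all participating lcores, so the
	 * timed loop starts at (roughly) the same time on every thread.
	 */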
	rte_atomic_fetch_sub_explicit(&lcore_barrier, 1, rte_memory_order_relaxed);
	rte_wait_until_equal_32((uint32_t *)(uintptr_t)&lcore_barrier, 0, rte_memory_order_relaxed);

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on a pair of cores, to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		rte_atomic_store_explicit(&lcore_barrier, 2, rte_memory_order_relaxed);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

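		/*
		 * The main lcore cannot be remote-launched, so if it is part
		 * of the pair, run fn() on it directly.
		 */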
		if (cores->c1 == rte_get_main_lcore()) {
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on n lcores: the main lcore plus up to n - 1 workers. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		rte_atomic_store_explicit(&lcore_barrier, n, rte_memory_order_relaxed);

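		/* Launch up to n - 1 workers; the main lcore runs fn() below. */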
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

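		/* Combine the main lcore's result with those of the workers that ran. */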
		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}

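/*
 * Run the full benchmark suite on a stack created with the given flags
 * (0 for the default stack, RTE_STACK_F_LF for the lock-free variant).
 */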
static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic_store_explicit(&lcore_barrier, 0, rte_memory_order_relaxed);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

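/* Lock-free variant; skipped on platforms without lock-free stack support. */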
static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
	return __test_stack_perf(RTE_STACK_F_LF);
#else
	return TEST_SKIPPED;
#endif
}

REGISTER_PERF_TEST(stack_perf_autotest, test_stack_perf);
REGISTER_PERF_TEST(stack_lf_perf_autotest, test_lf_stack_perf);