/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

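/* Barrier decremented and spun on by each lcore so timed loops start together. */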
static rte_atomic32_t lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

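/* Find two lcores that are hyperthread siblings on the same physical core. */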
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

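/* Find two lcores on different physical cores within the same socket. */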
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

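/* Find two lcores located on different NUMA sockets. */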
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}

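/* Per-lcore arguments and result slot for bulk_push_pop(). */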
struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

	rte_atomic32_sub(&lcore_barrier, 1);
	while (rte_atomic32_read(&lcore_barrier) != 0)
		rte_pause();

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start))/(iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		rte_atomic32_set(&lcore_barrier, 2);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

		if (cores->c1 == rte_get_main_lcore()) {
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		rte_atomic32_set(&lcore_barrier, n);

		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}

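/* Create a stack and run all single-lcore and multi-lcore measurements. */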
static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic32_init(&lcore_barrier);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
	return __test_stack_perf(RTE_STACK_F_LF);
#else
	return TEST_SKIPPED;
#endif
}

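/*
 * Register the tests with the unit test framework; they can then be run from
 * the dpdk-test application, e.g. by issuing "stack_perf_autotest" or
 * "stack_lf_perf_autotest" at its prompt.
 */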
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);