/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

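/*
 * Count of lcores yet to arrive at the start barrier; each measuring thread
 * decrements it and spins until it reaches zero, so all loops start together.
 */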
static rte_atomic32_t lcore_barrier;

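/* A pair of lcore IDs on which the two-core performance tests are run. */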
struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

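/*
 * Find two lcores that are hyperthread siblings (same physical core on the
 * same socket). Returns 0 on success, 1 if no such pair exists.
 */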
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = lcore_config[id[0]].core_id;
			core[1] = lcore_config[id[1]].core_id;
			socket[0] = lcore_config[id[0]].socket_id;
			socket[1] = lcore_config[id[1]].socket_id;
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

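/*
 * Find two lcores on distinct physical cores of the same socket. Returns 0
 * on success, 1 if no such pair exists.
 */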
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = lcore_config[id[0]].core_id;
			core[1] = lcore_config[id[1]].core_id;
			socket[0] = lcore_config[id[0]].socket_id;
			socket[1] = lcore_config[id[1]].socket_id;
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

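/*
 * Find two lcores on different sockets (NUMA nodes). Returns 0 on success,
 * 1 if no such pair exists.
 */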
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = lcore_config[id[0]].socket_id;
			socket[1] = lcore_config[id[1]].socket_id;
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}

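/* Per-thread arguments and result (average cycles) for bulk_push_pop(). */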
struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop. */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

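	/* Signal arrival, then wait for all participating lcores. */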
	rte_atomic32_sub(&lcore_barrier, 1);
	while (rte_atomic32_read(&lcore_barrier) != 0)
		rte_pause();

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on a pair of cores, to measure stack
 * performance between hyperthread siblings, between cores on the same
 * socket, and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
		rte_atomic32_set(&lcore_barrier, 2);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

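		/*
		 * rte_eal_remote_launch() cannot target the master lcore, so
		 * if the master is one of the pair it calls fn() directly.
		 */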
		if (cores->c1 == rte_get_master_lcore()) {
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		rte_atomic32_set(&lcore_barrier, n);

		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

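		/* The master lcore runs the measurement loop as well. */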
		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}

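/*
 * Create a stack with the given flags and run the full set of performance
 * tests on it.
 */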
static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic32_init(&lcore_barrier);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
	return __test_stack_perf(RTE_STACK_F_LF);
}

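/*
 * Register the standard and lock-free variants with the test framework;
 * they can be invoked from the test application as "stack_perf_autotest"
 * and "stack_lf_perf_autotest".
 */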
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);