/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

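/*
 * Countdown barrier: each participating lcore decrements it on entry and
 * spins until it reaches zero, so all measurement loops start together.
 */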
static RTE_ATOMIC(uint32_t) lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

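/* Find two lcores that are hyperthread siblings on the same physical core. */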
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

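/* Find two lcores on different physical cores of the same socket. */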
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

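/* Find two lcores on different sockets (NUMA nodes). */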
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
			(double)(end - start) / iterations);
}

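/* Per-lcore arguments and result slot for the multi-core measurements. */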
struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

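	/* Signal arrival, then wait until every participant has arrived. */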
	rte_atomic_fetch_sub_explicit(&lcore_barrier, 1, rte_memory_order_relaxed);
	rte_wait_until_equal_32((uint32_t *)(uintptr_t)&lcore_barrier, 0, rte_memory_order_relaxed);

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		rte_atomic_store_explicit(&lcore_barrier, 2, rte_memory_order_relaxed);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

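		/*
		 * The main lcore cannot remote-launch onto itself, so when it
		 * is one of the pair it runs fn() directly.
		 */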
		if (cores->c1 == rte_get_main_lcore()) {
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
				bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		rte_atomic_store_explicit(&lcore_barrier, n, rte_memory_order_relaxed);

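		/*
		 * Launch fn() on at most n - 1 worker lcores; the main lcore
		 * runs it as well, for n participants in total.
		 */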
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
					lcore_id))
				rte_panic("Failed to launch lcore %d\n",
						lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
				bulk_sizes[i], avg / n);
	}
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
			((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
				(iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
				bulk_sizes[sz], avg);
	}
}

static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic_store_explicit(&lcore_barrier, 0, rte_memory_order_relaxed);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
				__func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

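/*
 * Same performance test using the lock-free stack implementation; skipped on
 * platforms where RTE_STACK_LF_SUPPORTED is not defined.
 */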
static int
test_lf_stack_perf(void)
{
#if defined(RTE_STACK_LF_SUPPORTED)
	return __test_stack_perf(RTE_STACK_F_LF);
#else
	return TEST_SKIPPED;
#endif
}

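/*
 * Register both variants as perf tests; they can then be run from the
 * dpdk-test binary, e.g. via the "stack_perf_autotest" and
 * "stack_lf_perf_autotest" commands (exact invocation may vary by DPDK
 * version).
 */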
REGISTER_PERF_TEST(stack_perf_autotest, test_stack_perf);
REGISTER_PERF_TEST(stack_lf_perf_autotest, test_lf_stack_perf);