xref: /dpdk/app/test/test_lcore_var_perf.c (revision 2cd441bd17fc43768755162bfb218395c795b82d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2024 Ericsson AB
3  */
4 
/*
 * Upper bound on the number of simulated modules. It sizes every
 * per-module state array in this file and is the largest module count
 * the benchmark is run with.
 */
#define MAX_MODS 1024
6 
7 #include <stdio.h>
8 
9 #include <rte_bitops.h>
10 #include <rte_cycles.h>
11 #include <rte_lcore_var.h>
12 #include <rte_per_lcore.h>
13 #include <rte_random.h>
14 
15 #include "test.h"
16 
/*
 * Per-lcore state of one dummy module. The update operation reads the
 * two operands and accumulates their product into 'sum'.
 */
struct mod_lcore_state {
	/* First operand; set to a random value at init time */
	uint64_t a;
	/* Second operand; set to a random value at init time */
	uint64_t b;
	/* Running total of a * b, one term added per update */
	uint64_t sum;
};
22 
23 static void
24 mod_init(struct mod_lcore_state *state)
25 {
26 	state->a = rte_rand();
27 	state->b = rte_rand();
28 	state->sum = 0;
29 }
30 
31 static __rte_always_inline void
32 mod_update(volatile struct mod_lcore_state *state)
33 {
34 	state->sum += state->a * state->b;
35 }
36 
/*
 * Module state wrapped for use in the static, lcore id-indexed array.
 * NOTE(review): the alignment attribute and RTE_CACHE_GUARD presumably
 * give every element its own cache line(s) so that neighbouring
 * lcores do not share lines -- confirm against the EAL headers.
 */
struct __rte_cache_aligned mod_lcore_state_aligned {
	struct mod_lcore_state mod_state;

	RTE_CACHE_GUARD;
};
42 
43 static struct mod_lcore_state_aligned
44 sarray_lcore_state[MAX_MODS][RTE_MAX_LCORE];
45 
46 static void
47 sarray_init(void)
48 {
49 	unsigned int lcore_id = rte_lcore_id();
50 	int mod;
51 
52 	for (mod = 0; mod < MAX_MODS; mod++) {
53 		struct mod_lcore_state *mod_state = &sarray_lcore_state[mod][lcore_id].mod_state;
54 
55 		mod_init(mod_state);
56 	}
57 }
58 
59 static __rte_noinline void
60 sarray_update(unsigned int mod)
61 {
62 	unsigned int lcore_id = rte_lcore_id();
63 	struct mod_lcore_state *mod_state = &sarray_lcore_state[mod][lcore_id].mod_state;
64 
65 	mod_update(mod_state);
66 }
67 
/*
 * Module state plus a flag recording whether it has been initialized,
 * used by the thread-local storage variants.
 */
struct mod_lcore_state_lazy {
	struct mod_lcore_state mod_state;
	/* Set by tls_init(); tested by tls_lazy_update() before each update */
	bool initialized;
};
72 
/*
 * Thread-local storage variant of the per-module state, one element
 * per module.
 *
 * Note: it's usually a bad idea to have this much thread-local storage
 * allocated in a real application, since it will incur a cost on
 * thread creation and non-lcore thread memory usage.
 */
static RTE_DEFINE_PER_LCORE(struct mod_lcore_state_lazy, tls_lcore_state)[MAX_MODS];
79 
80 static inline void
81 tls_init(struct mod_lcore_state_lazy *state)
82 {
83 	mod_init(&state->mod_state);
84 
85 	state->initialized = true;
86 }
87 
88 static __rte_noinline void
89 tls_lazy_update(unsigned int mod)
90 {
91 	struct mod_lcore_state_lazy *state =
92 		&RTE_PER_LCORE(tls_lcore_state[mod]);
93 
94 	/* With thread-local storage, initialization must usually be lazy */
95 	if (!state->initialized)
96 		tls_init(state);
97 
98 	mod_update(&state->mod_state);
99 }
100 
101 static __rte_noinline void
102 tls_update(unsigned int mod)
103 {
104 	struct mod_lcore_state_lazy *state =
105 		&RTE_PER_LCORE(tls_lcore_state[mod]);
106 
107 	mod_update(&state->mod_state);
108 }
109 
110 RTE_LCORE_VAR_HANDLE(struct mod_lcore_state, lvar_lcore_state)[MAX_MODS];
111 
112 static void
113 lvar_init(void)
114 {
115 	unsigned int mod;
116 
117 	for (mod = 0; mod < MAX_MODS; mod++) {
118 		RTE_LCORE_VAR_ALLOC(lvar_lcore_state[mod]);
119 
120 		struct mod_lcore_state *state = RTE_LCORE_VAR(lvar_lcore_state[mod]);
121 
122 		mod_init(state);
123 	}
124 }
125 
126 static __rte_noinline void
127 lvar_update(unsigned int mod)
128 {
129 	struct mod_lcore_state *state =	RTE_LCORE_VAR(lvar_lcore_state[mod]);
130 
131 	mod_update(state);
132 }
133 
/*
 * Randomly permute 'elems' in place, walking from the last element
 * towards the first and swapping each element with a randomly chosen
 * one at the same or a lower index.
 *
 * elems: array to permute
 * len:   number of elements; 0 and 1 are valid and leave the array
 *        unchanged
 */
static void
shuffle(unsigned int *elems, size_t len)
{
	size_t remaining;

	/*
	 * Count the remaining prefix length instead of the last index, so
	 * that len == 0 does not wrap around to SIZE_MAX.
	 */
	for (remaining = len; remaining > 1; remaining--) {
		const size_t last = remaining - 1;
		/* Pick in [0, last]; picking 'last' itself is a no-op swap */
		const size_t other = (size_t)rte_rand_max(remaining);
		const unsigned int tmp = elems[other];

		elems[other] = elems[last];
		elems[last] = tmp;
	}
}
147 
/* Number of timed update calls per measurement */
#define ITERATIONS UINT64_C(10000000)

/*
 * Measure the average cost of one update call.
 *
 * mods:       module indices, visited cyclically during the timed loop
 * num_mods:   number of entries in 'mods'; must be a power of two
 * init_fun:   optional one-time setup, skipped when NULL
 * update_fun: the update variant to time
 *
 * Returns the mean number of TSC cycles per update call.
 */
static inline double
benchmark_access(const unsigned int *mods, unsigned int num_mods,
		 void (*init_fun)(void), void (*update_fun)(unsigned int))
{
	unsigned int i;
	unsigned int num_mods_mask;
	uint64_t start;
	uint64_t end;

	RTE_VERIFY(rte_is_power_of_2(num_mods));

	/* Valid as a wrap-around mask only because num_mods is a power of two */
	num_mods_mask = num_mods - 1;

	if (init_fun != NULL)
		init_fun();

	/* Warm up cache and make sure TLS variables are initialized */
	for (i = 0; i < num_mods; i++)
		update_fun(i);

	start = rte_rdtsc();

	for (i = 0; i < ITERATIONS; i++)
		update_fun(mods[i & num_mods_mask]);

	end = rte_rdtsc();

	/*
	 * Subtract as integers before converting: a double cannot represent
	 * every 64-bit counter value exactly.
	 */
	return (double)(end - start) / (double)ITERATIONS;
}
180 
181 static void
182 test_lcore_var_access_n(unsigned int num_mods)
183 {
184 	double sarray_latency;
185 	double tls_latency;
186 	double lazy_tls_latency;
187 	double lvar_latency;
188 	unsigned int mods[num_mods];
189 	unsigned int i;
190 
191 	for (i = 0; i < num_mods; i++)
192 		mods[i] = i;
193 
194 	shuffle(mods, num_mods);
195 
196 	sarray_latency =
197 		benchmark_access(mods, num_mods, sarray_init, sarray_update);
198 
199 	tls_latency =
200 		benchmark_access(mods, num_mods, NULL, tls_update);
201 
202 	lazy_tls_latency =
203 		benchmark_access(mods, num_mods, NULL, tls_lazy_update);
204 
205 	lvar_latency =
206 		benchmark_access(mods, num_mods, lvar_init, lvar_update);
207 
208 	printf("%17u %8.1f %14.1f %15.1f %10.1f\n", num_mods, sarray_latency,
209 			tls_latency, lazy_tls_latency, lvar_latency);
210 }
211 
212 /*
213  * The potential performance benefit of lcore variables compared to
214  * the use of statically sized, lcore id-indexed arrays is not
215  * shorter latencies in a scenario with low cache pressure, but rather
216  * fewer cache misses in a real-world scenario, with extensive cache
217  * usage. These tests are a crude simulation of such, using <N> dummy
218  * modules, each with a small, per-lcore state. Note however that
219  * these tests have very little non-lcore/thread local state, which is
220  * unrealistic.
221  */
222 
223 static int
224 test_lcore_var_access(void)
225 {
226 	unsigned int num_mods = 1;
227 
228 	printf("- Latencies [TSC cycles/update] -\n");
229 	printf("Number of           Static   Thread-local    Thread-local      Lcore\n");
230 	printf("Modules/Variables    Array        Storage  Storage (Lazy)  Variables\n");
231 
232 	for (num_mods = 1; num_mods <= MAX_MODS; num_mods *= 2)
233 		test_lcore_var_access_n(num_mods);
234 
235 	return TEST_SUCCESS;
236 }
237 
/* Test suite definition: a single test case, the access latency comparison */
static struct unit_test_suite lcore_var_testsuite = {
	.suite_name = "lcore variable perf autotest",
	.unit_test_cases = {
		TEST_CASE(test_lcore_var_access),
		TEST_CASES_END()
	},
};
245 
246 static int
247 test_lcore_var_perf(void)
248 {
249 	return unit_test_suite_runner(&lcore_var_testsuite);
250 }
251 
252 REGISTER_PERF_TEST(lcore_var_perf_autotest, test_lcore_var_perf);
253