/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 * Copyright(c) 2022 SmartShare Systems
 */

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdarg.h>
#include <errno.h>
#include <sys/queue.h>

#include <rte_common.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_mempool.h>
#include <rte_spinlock.h>
#include <rte_malloc.h>
#include <rte_mbuf_pool_ops.h>

#include "test.h"

/*
 * Mempool performance
 * ===================
 *
 *    Each core gets *n_keep* objects per bulk of *n_get_bulk*. Then,
 *    objects are put back in the pool per bulk of *n_put_bulk*.
 *
 *    This sequence is repeated for TIME_S seconds.
 *
 *    This test is done on the following configurations:
 *
 *    - Core configuration (*cores*)
 *
 *      - One core with cache
 *      - Two cores with cache
 *      - Max. cores with cache
 *      - One core without cache
 *      - Two cores without cache
 *      - Max. cores without cache
 *      - One core with user-owned cache
 *      - Two cores with user-owned cache
 *      - Max. cores with user-owned cache
 *
 *    - Bulk size (*n_get_bulk*, *n_put_bulk*)
 *
 *      - Bulk get from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
 *      - Bulk put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
 *      - Bulk get and put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE,
 *        as compile-time constants
 *
 *    - Number of kept objects (*n_keep*)
 *
 *      - 32
 *      - 128
 *      - 512
 *      - 2048
 *      - 8192
 *      - 32768
 */
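
/*
 * The tests register below as "mempool_perf_autotest" and its 1core/
 * 2cores/allcores variants; with the standard dpdk-test harness they
 * can be run e.g. as:
 *   DPDK_TEST=mempool_perf_autotest ./dpdk-test
 */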

/* duration of each test case, in seconds */
#define TIME_S 1
/* size of one mempool object, in bytes */
#define MEMPOOL_ELT_SIZE 2048
/* max. number of objects kept at a time by one core */
#define MAX_KEEP 32768
/* total number of objects got and put back per call to test_loop() */
#define N (128 * MAX_KEEP)
/* pool size: every lcore may keep MAX_KEEP objects plus two full caches */
#define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE*2))-1)

/* Number of pointers fitting into one cache line. */
#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))

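/* Log the failing function and line, then return or jump to the error path. */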
#define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
#define RET_ERR() do {							\
		LOG_ERR();						\
		return -1;						\
	} while (0)
#define GOTO_ERR(var, label) do {					\
		LOG_ERR();						\
		var = -1;						\
		goto label;						\
	} while (0)

/* use a user-owned (external) cache instead of the per-lcore caches */
static int use_external_cache;
static unsigned external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;

/* flag set by the main lcore to release the worker lcores */
static RTE_ATOMIC(uint32_t) synchro;

/* number of objects in one bulk operation (get or put) */
static unsigned n_get_bulk;
static unsigned n_put_bulk;

/* number of objects retrieved from mempool before putting them back */
static unsigned n_keep;

/* true if we want to test with constant n_get_bulk and n_put_bulk */
static int use_constant_values;

/* number of enqueues / dequeues, and time used */
struct __rte_cache_aligned mempool_test_stats {
	uint64_t enq_count;
	uint64_t duration_cycles;
	RTE_CACHE_GUARD; /* prevent false sharing with the next lcore's stats */
};

static struct mempool_test_stats stats[RTE_MAX_LCORE];

/*
 * save the object number in the first 4 bytes of object data. All
 * other bytes are set to 0.
 */
static void
my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
	    void *obj, unsigned i)
{
	uint32_t *objnum = obj;
	memset(obj, 0, mp->elt_size);
	*objnum = i;
}

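/*
 * One test round: repeatedly get x_keep objects from the pool in bulks
 * of x_get_bulk and put them back in bulks of x_put_bulk, until N
 * objects have been handled. Force-inlined so that constant bulk sizes
 * propagate into the rte_mempool inline functions.
 */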
static __rte_always_inline int
test_loop(struct rte_mempool *mp, struct rte_mempool_cache *cache,
	  unsigned int x_keep, unsigned int x_get_bulk, unsigned int x_put_bulk)
{
	alignas(RTE_CACHE_LINE_SIZE) void *obj_table[MAX_KEEP];
	unsigned int idx;
	unsigned int i;
	int ret;

	for (i = 0; likely(i < (N / x_keep)); i++) {
		/* get x_keep objects by bulk of x_get_bulk */
		for (idx = 0; idx < x_keep; idx += x_get_bulk) {
			ret = rte_mempool_generic_get(mp,
						      &obj_table[idx],
						      x_get_bulk,
						      cache);
			if (unlikely(ret < 0)) {
				rte_mempool_dump(stdout, mp);
				return ret;
			}
		}

		/* put the objects back by bulk of x_put_bulk */
		for (idx = 0; idx < x_keep; idx += x_put_bulk) {
			rte_mempool_generic_put(mp,
						&obj_table[idx],
						x_put_bulk,
						cache);
		}
	}

	return 0;
}

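/*
 * Per-lcore test function: runs test_loop() repeatedly for TIME_S
 * seconds, recording the number of operations and the elapsed cycles.
 */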
static int
per_lcore_mempool_test(void *arg)
{
	struct rte_mempool *mp = arg;
	unsigned lcore_id = rte_lcore_id();
	int ret = 0;
	uint64_t start_cycles, end_cycles;
	uint64_t time_diff = 0, hz = rte_get_timer_hz();
	struct rte_mempool_cache *cache;

	if (use_external_cache) {
		/* Create a user-owned mempool cache. */
		cache = rte_mempool_cache_create(external_cache_size,
						 SOCKET_ID_ANY);
		if (cache == NULL)
			RET_ERR();
	} else {
		/* May be NULL if cache is disabled. */
		cache = rte_mempool_default_cache(mp, lcore_id);
	}

	/* n_get_bulk and n_put_bulk must be divisors of n_keep */
	if (((n_keep / n_get_bulk) * n_get_bulk) != n_keep)
		GOTO_ERR(ret, out);
	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
		GOTO_ERR(ret, out);
	/* for constant n, n_get_bulk and n_put_bulk must be the same */
	if (use_constant_values && n_put_bulk != n_get_bulk)
		GOTO_ERR(ret, out);

	stats[lcore_id].enq_count = 0;
	stats[lcore_id].duration_cycles = 0;

	/* workers wait for the main lcore to release the synchro flag */
	if (lcore_id != rte_get_main_lcore())
		rte_wait_until_equal_32((uint32_t *)(uintptr_t)&synchro, 1,
				rte_memory_order_relaxed);

	start_cycles = rte_get_timer_cycles();

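	/*
	 * Run test rounds until TIME_S has elapsed. With
	 * use_constant_values, dispatch to a test_loop() call whose bulk
	 * sizes are compile-time constants, so the compiler can
	 * specialize the inlined mempool get/put paths.
	 */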
	while (time_diff/hz < TIME_S) {
		if (!use_constant_values)
			ret = test_loop(mp, cache, n_keep, n_get_bulk, n_put_bulk);
		else if (n_get_bulk == 1)
			ret = test_loop(mp, cache, n_keep, 1, 1);
		else if (n_get_bulk == 4)
			ret = test_loop(mp, cache, n_keep, 4, 4);
		else if (n_get_bulk == CACHE_LINE_BURST)
			ret = test_loop(mp, cache, n_keep,
					CACHE_LINE_BURST, CACHE_LINE_BURST);
		else if (n_get_bulk == 32)
			ret = test_loop(mp, cache, n_keep, 32, 32);
		else if (n_get_bulk == 64)
			ret = test_loop(mp, cache, n_keep, 64, 64);
		else if (n_get_bulk == 128)
			ret = test_loop(mp, cache, n_keep, 128, 128);
		else if (n_get_bulk == 256)
			ret = test_loop(mp, cache, n_keep, 256, 256);
		else if (n_get_bulk == RTE_MEMPOOL_CACHE_MAX_SIZE)
			ret = test_loop(mp, cache, n_keep,
					RTE_MEMPOOL_CACHE_MAX_SIZE, RTE_MEMPOOL_CACHE_MAX_SIZE);
		else
			ret = -1;

		if (ret < 0)
			GOTO_ERR(ret, out);

		end_cycles = rte_get_timer_cycles();
		time_diff = end_cycles - start_cycles;
		stats[lcore_id].enq_count += N;
	}

	stats[lcore_id].duration_cycles = time_diff;

out:
	if (use_external_cache) {
		rte_mempool_cache_flush(cache, mp);
		rte_mempool_cache_free(cache);
	}

	return ret;
}

/* launch the per-lcore tests on all requested cores, and display the result */
static int
launch_cores(struct rte_mempool *mp, unsigned int cores)
{
	unsigned lcore_id;
	uint64_t rate;
	int ret;
	unsigned cores_save = cores;
	double hz = rte_get_timer_hz();

	rte_atomic_store_explicit(&synchro, 0, rte_memory_order_relaxed);

	/* reset stats */
	memset(stats, 0, sizeof(stats));

	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
	       "n_put_bulk=%u n_keep=%u constant_n=%u ",
	       use_external_cache ?
		   external_cache_size : (unsigned) mp->cache_size,
	       cores, n_get_bulk, n_put_bulk, n_keep, use_constant_values);

	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
		printf("mempool is not full\n");
		return -1;
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id) {
		if (cores == 1)
			break;
		cores--;
		rte_eal_remote_launch(per_lcore_mempool_test,
				      mp, lcore_id);
	}

	/* start synchro and launch test on main */
	rte_atomic_store_explicit(&synchro, 1, rte_memory_order_relaxed);

	ret = per_lcore_mempool_test(mp);

	cores = cores_save;
	RTE_LCORE_FOREACH_WORKER(lcore_id) {
		if (cores == 1)
			break;
		cores--;
		if (rte_eal_wait_lcore(lcore_id) < 0)
			ret = -1;
	}

	if (ret < 0) {
		printf("per-lcore test returned -1\n");
		return -1;
	}

	/* sum the per-lcore enqueue rates (objects per second) */
	rate = 0;
	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (stats[lcore_id].duration_cycles != 0)
			rate += (double)stats[lcore_id].enq_count * hz /
					(double)stats[lcore_id].duration_cycles;

	printf("rate_persec=%" PRIu64 "\n", rate);

	return 0;
}

/* for a given number of cores, launch all test cases */
static int
do_one_mempool_test(struct rte_mempool *mp, unsigned int cores, int external_cache)
{
	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
	unsigned int keep_tab[] = { 32, 128, 512, 2048, 8192, 32768, 0 };
	unsigned *get_bulk_ptr;
	unsigned *put_bulk_ptr;
	unsigned *keep_ptr;
	int ret;

	for (get_bulk_ptr = bulk_tab_get; *get_bulk_ptr; get_bulk_ptr++) {
		for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++) {
			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {

				if (*keep_ptr < *get_bulk_ptr || *keep_ptr < *put_bulk_ptr)
					continue;

				use_external_cache = external_cache;
				use_constant_values = 0;
				n_get_bulk = *get_bulk_ptr;
				n_put_bulk = *put_bulk_ptr;
				n_keep = *keep_ptr;
				ret = launch_cores(mp, cores);
				if (ret < 0)
					return -1;

				/* replay test with constant values */
				if (n_get_bulk == n_put_bulk) {
					use_constant_values = 1;
					ret = launch_cores(mp, cores);
					if (ret < 0)
						return -1;
				}
			}
		}
	}
	return 0;
}

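/*
 * Run the full test matrix on three pools: one without cache, one with
 * the maximum per-lcore cache, and one using the best mbuf pool ops
 * handler. The user-owned cache test reuses the uncached pool.
 */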
static int
do_all_mempool_perf_tests(unsigned int cores)
{
	struct rte_mempool *mp_cache = NULL;
	struct rte_mempool *mp_nocache = NULL;
	struct rte_mempool *default_pool = NULL;
	const char *default_pool_ops;
	int ret = -1;

	/* create a mempool (without cache) */
	mp_nocache = rte_mempool_create("perf_test_nocache", MEMPOOL_SIZE,
					MEMPOOL_ELT_SIZE, 0, 0,
					NULL, NULL,
					my_obj_init, NULL,
					SOCKET_ID_ANY, 0);
	if (mp_nocache == NULL) {
		printf("cannot allocate mempool (without cache)\n");
		goto err;
	}

	/* create a mempool (with cache) */
	mp_cache = rte_mempool_create("perf_test_cache", MEMPOOL_SIZE,
				      MEMPOOL_ELT_SIZE,
				      RTE_MEMPOOL_CACHE_MAX_SIZE, 0,
				      NULL, NULL,
				      my_obj_init, NULL,
				      SOCKET_ID_ANY, 0);
	if (mp_cache == NULL) {
		printf("cannot allocate mempool (with cache)\n");
		goto err;
	}

	default_pool_ops = rte_mbuf_best_mempool_ops();
	/* Create a mempool based on Default handler */
	default_pool = rte_mempool_create_empty("default_pool",
						MEMPOOL_SIZE,
						MEMPOOL_ELT_SIZE,
						0, 0,
						SOCKET_ID_ANY, 0);

	if (default_pool == NULL) {
		printf("cannot allocate %s mempool\n", default_pool_ops);
		goto err;
	}

	if (rte_mempool_set_ops_byname(default_pool, default_pool_ops, NULL)
				       < 0) {
		printf("cannot set %s handler\n", default_pool_ops);
		goto err;
	}

	if (rte_mempool_populate_default(default_pool) < 0) {
		printf("cannot populate %s mempool\n", default_pool_ops);
		goto err;
	}

	rte_mempool_obj_iter(default_pool, my_obj_init, NULL);

	printf("start performance test (without cache)\n");
	if (do_one_mempool_test(mp_nocache, cores, 0) < 0)
		goto err;

	printf("start performance test for %s (without cache)\n",
	       default_pool_ops);
	if (do_one_mempool_test(default_pool, cores, 0) < 0)
		goto err;

	printf("start performance test (with cache)\n");
	if (do_one_mempool_test(mp_cache, cores, 0) < 0)
		goto err;

	printf("start performance test (with user-owned cache)\n");
	if (do_one_mempool_test(mp_nocache, cores, 1) < 0)
		goto err;

	rte_mempool_list_dump(stdout);

	ret = 0;

err:
	rte_mempool_free(mp_cache);
	rte_mempool_free(mp_nocache);
	rte_mempool_free(default_pool);
	return ret;
}

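/*
 * Test entry points: run the full matrix on one core, on two cores,
 * or on all available lcores.
 */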
static int
test_mempool_perf_1core(void)
{
	return do_all_mempool_perf_tests(1);
}

static int
test_mempool_perf_2cores(void)
{
	if (rte_lcore_count() < 2) {
		printf("not enough lcores\n");
		return -1;
	}
	return do_all_mempool_perf_tests(2);
}

static int
test_mempool_perf_allcores(void)
{
	return do_all_mempool_perf_tests(rte_lcore_count());
}

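/* combined entry point: 1 core, then 2 cores, then all cores */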
static int
test_mempool_perf(void)
{
	int ret = -1;

	/* performance test with 1, 2 and max cores */
	if (do_all_mempool_perf_tests(1) < 0)
		goto err;
	if (rte_lcore_count() == 1)
		goto done;

	if (do_all_mempool_perf_tests(2) < 0)
		goto err;
	if (rte_lcore_count() == 2)
		goto done;

	if (do_all_mempool_perf_tests(rte_lcore_count()) < 0)
		goto err;

done:
	ret = 0;

err:
	return ret;
}

REGISTER_PERF_TEST(mempool_perf_autotest, test_mempool_perf);
REGISTER_PERF_TEST(mempool_perf_autotest_1core, test_mempool_perf_1core);
REGISTER_PERF_TEST(mempool_perf_autotest_2cores, test_mempool_perf_2cores);
REGISTER_PERF_TEST(mempool_perf_autotest_allcores, test_mempool_perf_allcores);