/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation
 * Copyright(c) 2022 SmartShare Systems
 */

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdarg.h>
#include <errno.h>
#include <sys/queue.h>

#include <rte_common.h>
#include <rte_log.h>
#include <rte_debug.h>
#include <rte_memory.h>
#include <rte_launch.h>
#include <rte_cycles.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_mempool.h>
#include <rte_spinlock.h>
#include <rte_malloc.h>
#include <rte_mbuf_pool_ops.h>

#include "test.h"

/*
 * Mempool performance
 * ===================
 *
 * Each core gets *n_keep* objects per bulk of *n_get_bulk*. Then,
 * the objects are put back in the pool per bulk of *n_put_bulk*.
 *
 * This sequence is repeated for TIME_S seconds.
 *
 * This test is done on the following configurations:
 *
 * - Cores configuration (*cores*)
 *
 *   - One core with cache
 *   - Two cores with cache
 *   - Max. cores with cache
 *   - One core without cache
 *   - Two cores without cache
 *   - Max. cores without cache
 *   - One core with user-owned cache
 *   - Two cores with user-owned cache
 *   - Max. cores with user-owned cache
 *
 * - Bulk size (*n_get_bulk*, *n_put_bulk*)
 *
 *   - Bulk get from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
 *   - Bulk put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE
 *   - Bulk get and put from 1 to 256, and RTE_MEMPOOL_CACHE_MAX_SIZE,
 *     as compile time constants
 *
 * - Number of kept objects (*n_keep*)
 *
 *   - 32
 *   - 128
 *   - 512
 *   - 2048
 *   - 8192
 *   - 32768
 */

#define TIME_S 1
#define MEMPOOL_ELT_SIZE 2048
#define MAX_KEEP 32768
#define N (128 * MAX_KEEP)
#define MEMPOOL_SIZE ((rte_lcore_count()*(MAX_KEEP+RTE_MEMPOOL_CACHE_MAX_SIZE*2))-1)
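/*
 * Worked example of the sizing above, assuming RTE_MEMPOOL_CACHE_MAX_SIZE
 * is 512 (its current definition): with 4 lcores the pool holds
 * 4 * (32768 + 2 * 512) - 1 = 135167 objects, i.e. enough for every lcore
 * to keep MAX_KEEP objects while leaving headroom for two full caches per
 * lcore (the per-lcore default cache and a user-owned cache).
 */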
/* Number of pointers fitting into one cache line. */
#define CACHE_LINE_BURST (RTE_CACHE_LINE_SIZE / sizeof(uintptr_t))

#define LOG_ERR() printf("test failed at %s():%d\n", __func__, __LINE__)
#define RET_ERR() do {						\
		LOG_ERR();					\
		return -1;					\
	} while (0)
#define GOTO_ERR(var, label) do {				\
		LOG_ERR();					\
		var = -1;					\
		goto label;					\
	} while (0)

static int use_external_cache;
static unsigned external_cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;

static RTE_ATOMIC(uint32_t) synchro;

/* number of objects in one bulk operation (get or put) */
static unsigned n_get_bulk;
static unsigned n_put_bulk;

/* number of objects retrieved from mempool before putting them back */
static unsigned n_keep;

/* true if we want to test with constant n_get_bulk and n_put_bulk */
static int use_constant_values;

/* number of enqueues / dequeues, and time used */
struct __rte_cache_aligned mempool_test_stats {
	uint64_t enq_count;
	uint64_t duration_cycles;
	RTE_CACHE_GUARD;
};

static struct mempool_test_stats stats[RTE_MAX_LCORE];

/*
 * save the object number in the first 4 bytes of object data. All
 * other bytes are set to 0.
 */
static void
my_obj_init(struct rte_mempool *mp, __rte_unused void *arg,
	    void *obj, unsigned i)
{
	uint32_t *objnum = obj;
	memset(obj, 0, mp->elt_size);
	*objnum = i;
}

static __rte_always_inline int
test_loop(struct rte_mempool *mp, struct rte_mempool_cache *cache,
	  unsigned int x_keep, unsigned int x_get_bulk, unsigned int x_put_bulk)
{
	alignas(RTE_CACHE_LINE_SIZE) void *obj_table[MAX_KEEP];
	unsigned int idx;
	unsigned int i;
	int ret;

	for (i = 0; likely(i < (N / x_keep)); i++) {
		/* get x_keep objects by bulk of x_get_bulk */
		for (idx = 0; idx < x_keep; idx += x_get_bulk) {
			ret = rte_mempool_generic_get(mp,
						      &obj_table[idx],
						      x_get_bulk,
						      cache);
			if (unlikely(ret < 0)) {
				rte_mempool_dump(stdout, mp);
				return ret;
			}
		}

		/* put the objects back by bulk of x_put_bulk */
		for (idx = 0; idx < x_keep; idx += x_put_bulk) {
			rte_mempool_generic_put(mp,
						&obj_table[idx],
						x_put_bulk,
						cache);
		}
	}

	return 0;
}

static int
per_lcore_mempool_test(void *arg)
{
	struct rte_mempool *mp = arg;
	unsigned lcore_id = rte_lcore_id();
	int ret = 0;
	uint64_t start_cycles, end_cycles;
	uint64_t time_diff = 0, hz = rte_get_timer_hz();
	struct rte_mempool_cache *cache;

	if (use_external_cache) {
		/* Create a user-owned mempool cache. */
		cache = rte_mempool_cache_create(external_cache_size,
						 SOCKET_ID_ANY);
		if (cache == NULL)
			RET_ERR();
	} else {
		/* May be NULL if cache is disabled. */
		cache = rte_mempool_default_cache(mp, lcore_id);
	}

	/* n_get_bulk and n_put_bulk must be divisors of n_keep */
	if (((n_keep / n_get_bulk) * n_get_bulk) != n_keep)
		GOTO_ERR(ret, out);
	if (((n_keep / n_put_bulk) * n_put_bulk) != n_keep)
		GOTO_ERR(ret, out);
	/* for constant n, n_get_bulk and n_put_bulk must be the same */
	if (use_constant_values && n_put_bulk != n_get_bulk)
		GOTO_ERR(ret, out);

	stats[lcore_id].enq_count = 0;
	stats[lcore_id].duration_cycles = 0;

	/* worker lcores wait for the synchro signal from the main lcore */
	if (lcore_id != rte_get_main_lcore())
		rte_wait_until_equal_32((uint32_t *)(uintptr_t)&synchro, 1,
				rte_memory_order_relaxed);

	start_cycles = rte_get_timer_cycles();

	while (time_diff/hz < TIME_S) {
		if (!use_constant_values)
			ret = test_loop(mp, cache, n_keep, n_get_bulk, n_put_bulk);
		else if (n_get_bulk == 1)
			ret = test_loop(mp, cache, n_keep, 1, 1);
		else if (n_get_bulk == 4)
			ret = test_loop(mp, cache, n_keep, 4, 4);
		else if (n_get_bulk == CACHE_LINE_BURST)
			ret = test_loop(mp, cache, n_keep,
					CACHE_LINE_BURST, CACHE_LINE_BURST);
		else if (n_get_bulk == 32)
			ret = test_loop(mp, cache, n_keep, 32, 32);
		else if (n_get_bulk == 64)
			ret = test_loop(mp, cache, n_keep, 64, 64);
		else if (n_get_bulk == 128)
			ret = test_loop(mp, cache, n_keep, 128, 128);
		else if (n_get_bulk == 256)
			ret = test_loop(mp, cache, n_keep, 256, 256);
		else if (n_get_bulk == RTE_MEMPOOL_CACHE_MAX_SIZE)
			ret = test_loop(mp, cache, n_keep,
					RTE_MEMPOOL_CACHE_MAX_SIZE, RTE_MEMPOOL_CACHE_MAX_SIZE);
		else
			ret = -1;

		if (ret < 0)
			GOTO_ERR(ret, out);

		end_cycles = rte_get_timer_cycles();
		time_diff = end_cycles - start_cycles;
		stats[lcore_id].enq_count += N;
	}

	stats[lcore_id].duration_cycles = time_diff;

out:
	if (use_external_cache) {
		rte_mempool_cache_flush(cache, mp);
		rte_mempool_cache_free(cache);
	}

	return ret;
}

/* launch all the per-lcore tests, and display the result */
static int
launch_cores(struct rte_mempool *mp, unsigned int cores)
{
	unsigned lcore_id;
	uint64_t rate;
	int ret;
	unsigned cores_save = cores;
	double hz = rte_get_timer_hz();

	rte_atomic_store_explicit(&synchro, 0, rte_memory_order_relaxed);

	/* reset stats */
	memset(stats, 0, sizeof(stats));

	printf("mempool_autotest cache=%u cores=%u n_get_bulk=%u "
	       "n_put_bulk=%u n_keep=%u constant_n=%u ",
	       use_external_cache ?
		   external_cache_size : (unsigned) mp->cache_size,
	       cores, n_get_bulk, n_put_bulk, n_keep, use_constant_values);

	if (rte_mempool_avail_count(mp) != MEMPOOL_SIZE) {
		printf("mempool is not full\n");
		return -1;
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id) {
		if (cores == 1)
			break;
		cores--;
		rte_eal_remote_launch(per_lcore_mempool_test,
				      mp, lcore_id);
	}

	/* start synchro and launch test on main */
	rte_atomic_store_explicit(&synchro, 1, rte_memory_order_relaxed);

	ret = per_lcore_mempool_test(mp);

	cores = cores_save;
	RTE_LCORE_FOREACH_WORKER(lcore_id) {
		if (cores == 1)
			break;
		cores--;
		if (rte_eal_wait_lcore(lcore_id) < 0)
			ret = -1;
	}

	if (ret < 0) {
		printf("per-lcore test returned -1\n");
		return -1;
	}

	rate = 0;
	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (stats[lcore_id].duration_cycles != 0)
			rate += (double)stats[lcore_id].enq_count * hz /
					(double)stats[lcore_id].duration_cycles;

	printf("rate_persec=%" PRIu64 "\n", rate);

	return 0;
}

/* for a given number of cores, launch all test cases */
static int
do_one_mempool_test(struct rte_mempool *mp, unsigned int cores, int external_cache)
{
	unsigned int bulk_tab_get[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
	unsigned int bulk_tab_put[] = { 1, 4, CACHE_LINE_BURST, 32, 64, 128, 256,
			RTE_MEMPOOL_CACHE_MAX_SIZE, 0 };
	unsigned int keep_tab[] = { 32, 128, 512, 2048, 8192, 32768, 0 };
	unsigned *get_bulk_ptr;
	unsigned *put_bulk_ptr;
	unsigned *keep_ptr;
	int ret;

	for (get_bulk_ptr = bulk_tab_get; *get_bulk_ptr; get_bulk_ptr++) {
		for (put_bulk_ptr = bulk_tab_put; *put_bulk_ptr; put_bulk_ptr++) {
			for (keep_ptr = keep_tab; *keep_ptr; keep_ptr++) {

				if (*keep_ptr < *get_bulk_ptr || *keep_ptr < *put_bulk_ptr)
					continue;

				use_external_cache = external_cache;
				use_constant_values = 0;
				n_get_bulk = *get_bulk_ptr;
				n_put_bulk = *put_bulk_ptr;
				n_keep = *keep_ptr;
				ret = launch_cores(mp, cores);
				if (ret < 0)
					return -1;

				/* replay test with constant values */
				if (n_get_bulk == n_put_bulk) {
					use_constant_values = 1;
					ret = launch_cores(mp, cores);
					if (ret < 0)
						return -1;
				}
			}
		}
	}
	return 0;
}

static int
do_all_mempool_perf_tests(unsigned int cores)
{
	struct rte_mempool *mp_cache = NULL;
	struct rte_mempool *mp_nocache = NULL;
	struct rte_mempool *default_pool = NULL;
	const char *default_pool_ops;
	int ret = -1;

	/* create a mempool (without cache) */
	mp_nocache = rte_mempool_create("perf_test_nocache", MEMPOOL_SIZE,
					MEMPOOL_ELT_SIZE, 0, 0,
					NULL, NULL,
					my_obj_init, NULL,
					SOCKET_ID_ANY, 0);
	if (mp_nocache == NULL) {
		printf("cannot allocate mempool (without cache)\n");
		goto err;
	}

	/* create a mempool (with cache) */
	mp_cache = rte_mempool_create("perf_test_cache", MEMPOOL_SIZE,
				      MEMPOOL_ELT_SIZE,
				      RTE_MEMPOOL_CACHE_MAX_SIZE, 0,
				      NULL, NULL,
				      my_obj_init, NULL,
				      SOCKET_ID_ANY, 0);
	if (mp_cache == NULL) {
		printf("cannot allocate mempool (with cache)\n");
		goto err;
	}

	default_pool_ops = rte_mbuf_best_mempool_ops();
	/* Create a mempool based on Default handler */
	default_pool = rte_mempool_create_empty("default_pool",
						MEMPOOL_SIZE,
						MEMPOOL_ELT_SIZE,
						0, 0,
						SOCKET_ID_ANY, 0);

	if (default_pool == NULL) {
		printf("cannot allocate %s mempool\n", default_pool_ops);
		goto err;
	}

	if (rte_mempool_set_ops_byname(default_pool, default_pool_ops, NULL)
	    < 0) {
		printf("cannot set %s handler\n", default_pool_ops);
		goto err;
	}

	if (rte_mempool_populate_default(default_pool) < 0) {
		printf("cannot populate %s mempool\n", default_pool_ops);
		goto err;
	}

	rte_mempool_obj_iter(default_pool, my_obj_init, NULL);

	printf("start performance test (without cache)\n");
	if (do_one_mempool_test(mp_nocache, cores, 0) < 0)
		goto err;

	printf("start performance test for %s (without cache)\n",
	       default_pool_ops);
	if (do_one_mempool_test(default_pool, cores, 0) < 0)
		goto err;

	printf("start performance test (with cache)\n");
	if (do_one_mempool_test(mp_cache, cores, 0) < 0)
		goto err;

	printf("start performance test (with user-owned cache)\n");
	if (do_one_mempool_test(mp_nocache, cores, 1) < 0)
		goto err;

	rte_mempool_list_dump(stdout);

	ret = 0;

err:
	rte_mempool_free(mp_cache);
	rte_mempool_free(mp_nocache);
	rte_mempool_free(default_pool);
	return ret;
}

static int
test_mempool_perf_1core(void)
{
	return do_all_mempool_perf_tests(1);
}

static int
test_mempool_perf_2cores(void)
{
	if (rte_lcore_count() < 2) {
		printf("not enough lcores\n");
		return -1;
	}
	return do_all_mempool_perf_tests(2);
}

static int
test_mempool_perf_allcores(void)
{
	return do_all_mempool_perf_tests(rte_lcore_count());
}

static int
test_mempool_perf(void)
{
	int ret = -1;

	/* performance test with 1, 2 and max cores */
	if (do_all_mempool_perf_tests(1) < 0)
		goto err;
	if (rte_lcore_count() == 1)
		goto done;

	if (do_all_mempool_perf_tests(2) < 0)
		goto err;
	if (rte_lcore_count() == 2)
		goto done;

	if (do_all_mempool_perf_tests(rte_lcore_count()) < 0)
		goto err;

done:
	ret = 0;

err:
	return ret;
}

REGISTER_PERF_TEST(mempool_perf_autotest, test_mempool_perf);
REGISTER_PERF_TEST(mempool_perf_autotest_1core, test_mempool_perf_1core);
REGISTER_PERF_TEST(mempool_perf_autotest_2cores, test_mempool_perf_2cores);
REGISTER_PERF_TEST(mempool_perf_autotest_allcores, test_mempool_perf_allcores);
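/*
 * The names registered above ("mempool_perf_autotest",
 * "mempool_perf_autotest_1core", "mempool_perf_autotest_2cores",
 * "mempool_perf_autotest_allcores") are the commands used to run these
 * tests from the dpdk-test application, typically by entering the command
 * name at its interactive prompt.
 */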