/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#include <stdbool.h>
#include <stdlib.h>

#include <rte_crypto.h>
#include <rte_cryptodev.h>
#include <rte_cycles.h>
#include <rte_malloc.h>

#include "cperf_ops.h"
#include "cperf_test_pmd_cyclecount.h"
#include "cperf_test_common.h"

#define PRETTY_HDR_FMT "%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s\n\n"
#define PRETTY_LINE_FMT "%12u%12u%12u%12u%12u%12u%12u%12.0f%12.0f%12.0f\n"
#define CSV_HDR_FMT "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"
#define CSV_LINE_FMT "%10u,%10u,%u,%u,%u,%u,%u,%.3f,%.3f,%.3f\n"

struct cperf_pmd_cyclecount_ctx {
	uint8_t dev_id;
	uint16_t qp_id;
	uint8_t lcore_id;

	struct rte_mempool *pool;
	struct rte_crypto_op **ops;
	struct rte_crypto_op **ops_processed;

	void *sess;
	uint8_t sess_owner;

	cperf_populate_ops_t populate_ops;

	uint32_t src_buf_offset;
	uint32_t dst_buf_offset;

	const struct cperf_options *options;
	const struct cperf_test_vector *test_vector;
};

struct pmd_cyclecount_state {
	struct cperf_pmd_cyclecount_ctx *ctx;
	const struct cperf_options *opts;
	uint32_t lcore;
	uint64_t delay;
	int linearize;
	uint32_t ops_enqd;
	uint32_t ops_deqd;
	uint32_t ops_enq_retries;
	uint32_t ops_deq_retries;
	double cycles_per_build;
	double cycles_per_enq;
	double cycles_per_deq;
};

static const uint16_t iv_offset =
		sizeof(struct rte_crypto_op) + sizeof(struct rte_crypto_sym_op);

static void
cperf_pmd_cyclecount_test_free(struct cperf_pmd_cyclecount_ctx *ctx)
{
	if (!ctx)
		return;

	if (ctx->sess != NULL && ctx->sess_owner) {
#ifdef RTE_LIB_SECURITY
		if (ctx->options->op_type == CPERF_PDCP ||
				ctx->options->op_type == CPERF_DOCSIS) {
			void *sec_ctx = rte_cryptodev_get_sec_ctx(ctx->dev_id);

			rte_security_session_destroy(sec_ctx, (void *)ctx->sess);
		} else
#endif
			rte_cryptodev_sym_session_free(ctx->dev_id, ctx->sess);
	}

	rte_mempool_free(ctx->pool);

	rte_free(ctx->ops);

	rte_free(ctx->ops_processed);

	rte_free(ctx);
}

void *
cperf_pmd_cyclecount_test_constructor(struct rte_mempool *sess_mp,
		uint8_t dev_id, uint16_t qp_id,
		const struct cperf_options *options,
		const struct cperf_test_vector *test_vector,
		const struct cperf_op_fns *op_fns,
		void **sess)
{
	struct cperf_pmd_cyclecount_ctx *ctx = NULL;

	/* preallocate buffers for crypto ops as they can get quite big */
	size_t alloc_sz = sizeof(struct rte_crypto_op *) *
			options->nb_descriptors;

	ctx = rte_malloc(NULL, sizeof(struct cperf_pmd_cyclecount_ctx), 0);
	if (ctx == NULL)
		goto err;

	ctx->dev_id = dev_id;
	ctx->qp_id = qp_id;

	ctx->populate_ops = op_fns->populate_ops;
	ctx->options = options;
	ctx->test_vector = test_vector;

	/* IV goes at the end of the crypto operation */
	uint16_t iv_offset = sizeof(struct rte_crypto_op) +
			sizeof(struct rte_crypto_sym_op);

	if (*sess != NULL) {
		ctx->sess = *sess;
		ctx->sess_owner = false;
	} else {
		ctx->sess = op_fns->sess_create(sess_mp, dev_id, options,
				test_vector, iv_offset);
		if (ctx->sess == NULL)
			goto err;
		*sess = ctx->sess;
		ctx->sess_owner = true;
	}

	if (cperf_alloc_common_memory(options, test_vector, dev_id, qp_id, 0,
			&ctx->src_buf_offset, &ctx->dst_buf_offset,
			&ctx->pool) < 0)
		goto err;

	ctx->ops = rte_malloc("ops", alloc_sz, 0);
	if (!ctx->ops)
		goto err;

	ctx->ops_processed = rte_malloc("ops_processed", alloc_sz, 0);
	if (!ctx->ops_processed)
		goto err;

	return ctx;

err:
	cperf_pmd_cyclecount_test_free(ctx);

	return NULL;
}

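/*
 * The constructor either adopts a session supplied by the caller (when *sess
 * is non-NULL) or creates one of its own; sess_owner records which case
 * applies, so a session shared between queue pairs is destroyed exactly once,
 * by the context that created it, in cperf_pmd_cyclecount_test_free().
 */
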
/* benchmark alloc-build-free of ops */
static inline int
pmd_cyclecount_bench_ops(struct pmd_cyclecount_state *state, uint32_t cur_op,
		uint16_t test_burst_size)
{
	uint32_t iter_ops_left = state->opts->total_ops - cur_op;
	uint32_t iter_ops_needed =
			RTE_MIN(state->opts->nb_descriptors, iter_ops_left);
	uint32_t cur_iter_op;
	uint32_t imix_idx = 0;

	for (cur_iter_op = 0; cur_iter_op < iter_ops_needed;
			cur_iter_op += test_burst_size) {
		uint32_t burst_size = RTE_MIN(iter_ops_needed - cur_iter_op,
				test_burst_size);
		struct rte_crypto_op **ops = &state->ctx->ops[cur_iter_op];

		/* Allocate objects containing crypto operations and mbufs */
		if (rte_mempool_get_bulk(state->ctx->pool, (void **)ops,
				burst_size) != 0) {
			RTE_LOG(ERR, USER1,
					"Failed to allocate more crypto operations "
					"from the crypto operation pool.\n"
					"Consider increasing the pool size "
					"with --pool-sz\n");
			return -1;
		}

		/* Setup crypto op, attach mbuf etc */
		(state->ctx->populate_ops)(ops,
				state->ctx->src_buf_offset,
				state->ctx->dst_buf_offset,
				burst_size,
				state->ctx->sess, state->opts,
				state->ctx->test_vector, iv_offset,
				&imix_idx, NULL);

#ifdef CPERF_LINEARIZATION_ENABLE
		/* Check if source mbufs require coalescing */
		if (state->linearize) {
			uint8_t i;
			for (i = 0; i < burst_size; i++) {
				struct rte_mbuf *src = ops[i]->sym->m_src;
				rte_pktmbuf_linearize(src);
			}
		}
#endif /* CPERF_LINEARIZATION_ENABLE */
		rte_mempool_put_bulk(state->ctx->pool, (void **)ops,
				burst_size);
	}

	return 0;
}

/* allocate and build ops (no free) */
static int
pmd_cyclecount_build_ops(struct pmd_cyclecount_state *state,
		uint32_t iter_ops_needed, uint16_t test_burst_size)
{
	uint32_t cur_iter_op;
	uint32_t imix_idx = 0;

	for (cur_iter_op = 0; cur_iter_op < iter_ops_needed;
			cur_iter_op += test_burst_size) {
		uint32_t burst_size = RTE_MIN(
				iter_ops_needed - cur_iter_op, test_burst_size);
		struct rte_crypto_op **ops = &state->ctx->ops[cur_iter_op];

		/* Allocate objects containing crypto operations and mbufs */
		if (rte_mempool_get_bulk(state->ctx->pool, (void **)ops,
				burst_size) != 0) {
			RTE_LOG(ERR, USER1,
					"Failed to allocate more crypto operations "
					"from the crypto operation pool.\n"
					"Consider increasing the pool size "
					"with --pool-sz\n");
			return -1;
		}

		/* Setup crypto op, attach mbuf etc */
		(state->ctx->populate_ops)(ops,
				state->ctx->src_buf_offset,
				state->ctx->dst_buf_offset,
				burst_size,
				state->ctx->sess, state->opts,
				state->ctx->test_vector, iv_offset,
				&imix_idx, NULL);
	}
	return 0;
}

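/*
 * pmd_cyclecount_bench_ops() above allocates, builds and immediately frees
 * its ops, so its timing covers only op construction.
 * pmd_cyclecount_build_ops() leaves the built ops in ctx->ops; they are
 * released only after the enqueue/dequeue phases have been timed, so that
 * construction cost does not leak into the enqueue and dequeue figures.
 */
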
/* benchmark enqueue, returns number of ops enqueued */
static uint32_t
pmd_cyclecount_bench_enq(struct pmd_cyclecount_state *state,
		uint32_t iter_ops_needed, uint16_t test_burst_size)
{
	/* Enqueue full descriptor ring of ops on crypto device */
	uint32_t cur_iter_op = 0;
	while (cur_iter_op < iter_ops_needed) {
		uint32_t burst_size = RTE_MIN(iter_ops_needed - cur_iter_op,
				test_burst_size);
		struct rte_crypto_op **ops = &state->ctx->ops[cur_iter_op];
		uint32_t burst_enqd;

		burst_enqd = rte_cryptodev_enqueue_burst(state->ctx->dev_id,
				state->ctx->qp_id, ops, burst_size);

		/* if we couldn't enqueue anything, the queue is full */
		if (!burst_enqd) {
			/* don't try to dequeue anything we didn't enqueue */
			return cur_iter_op;
		}

		if (burst_enqd < burst_size)
			state->ops_enq_retries++;
		state->ops_enqd += burst_enqd;
		cur_iter_op += burst_enqd;
	}
	return iter_ops_needed;
}

/* benchmark dequeue */
static void
pmd_cyclecount_bench_deq(struct pmd_cyclecount_state *state,
		uint32_t iter_ops_needed, uint16_t test_burst_size)
{
	/* Dequeue full descriptor ring of ops on crypto device */
	uint32_t cur_iter_op = 0;
	while (cur_iter_op < iter_ops_needed) {
		uint32_t burst_size = RTE_MIN(iter_ops_needed - cur_iter_op,
				test_burst_size);
		/*
		 * Dequeue into the separate ops_processed array so the
		 * original pointers in ctx->ops stay intact for the final
		 * bulk free; dequeuing into ctx->ops could duplicate or drop
		 * pointers on a partial or reordered dequeue.
		 */
		struct rte_crypto_op **ops_processed =
				&state->ctx->ops_processed[cur_iter_op];
		uint32_t burst_deqd;

		burst_deqd = rte_cryptodev_dequeue_burst(state->ctx->dev_id,
				state->ctx->qp_id, ops_processed, burst_size);

		if (burst_deqd < burst_size)
			state->ops_deq_retries++;
		state->ops_deqd += burst_deqd;
		cur_iter_op += burst_deqd;
	}
}

/* run benchmark per burst size */
static inline int
pmd_cyclecount_bench_burst_sz(
		struct pmd_cyclecount_state *state, uint16_t test_burst_size)
{
	uint64_t tsc_start;
	uint64_t tsc_end;
	uint64_t tsc_op;
	uint64_t tsc_enq;
	uint64_t tsc_deq;
	uint32_t cur_op;

	/* reset all counters */
	tsc_enq = 0;
	tsc_deq = 0;
	state->ops_enqd = 0;
	state->ops_enq_retries = 0;
	state->ops_deqd = 0;
	state->ops_deq_retries = 0;

	/*
	 * Benchmark crypto op alloc-build-free separately.
	 */
	tsc_start = rte_rdtsc_precise();

	for (cur_op = 0; cur_op < state->opts->total_ops;
			cur_op += state->opts->nb_descriptors) {
		if (unlikely(pmd_cyclecount_bench_ops(
				state, cur_op, test_burst_size)))
			return -1;
	}

	tsc_end = rte_rdtsc_precise();
	tsc_op = tsc_end - tsc_start;

	/*
	 * Hardware acceleration cyclecount benchmarking loop.
	 *
	 * We're benchmarking raw enq/deq performance by filling up the device
	 * queue, so we never get any failed enqs unless the driver won't
	 * accept the exact number of descriptors we requested, or the driver
	 * won't wrap around the end of the TX ring. However, since we're only
	 * dequeuing once we've filled up the queue, we have to benchmark it
	 * piecemeal and then average out the results.
	 */
	cur_op = 0;
	while (cur_op < state->opts->total_ops) {
		uint32_t iter_ops_left = state->opts->total_ops - cur_op;
		uint32_t iter_ops_needed = RTE_MIN(
				state->opts->nb_descriptors, iter_ops_left);
		uint32_t iter_ops_allocd = iter_ops_needed;

		/* allocate and build ops */
		if (unlikely(pmd_cyclecount_build_ops(state, iter_ops_needed,
				test_burst_size)))
			return -1;

		tsc_start = rte_rdtsc_precise();

		/* fill up TX ring */
		iter_ops_needed = pmd_cyclecount_bench_enq(state,
				iter_ops_needed, test_burst_size);

		tsc_end = rte_rdtsc_precise();

		tsc_enq += tsc_end - tsc_start;

		/* allow for HW to catch up */
		if (state->delay)
			rte_delay_us_block(state->delay);

		tsc_start = rte_rdtsc_precise();

		/* drain RX ring */
		pmd_cyclecount_bench_deq(state, iter_ops_needed,
				test_burst_size);

		tsc_end = rte_rdtsc_precise();

		tsc_deq += tsc_end - tsc_start;

		cur_op += iter_ops_needed;

		/*
		 * we may not have processed all ops that we allocated, so
		 * free everything we've allocated.
		 */
		rte_mempool_put_bulk(state->ctx->pool,
				(void **)state->ctx->ops, iter_ops_allocd);
	}

	state->cycles_per_build = (double)tsc_op / state->opts->total_ops;
	state->cycles_per_enq = (double)tsc_enq / state->ops_enqd;
	state->cycles_per_deq = (double)tsc_deq / state->ops_deqd;

	return 0;
}

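/*
 * The three figures reported per burst size are plain averages over the whole
 * run: cycles_per_build divides the alloc-build-free time by total_ops, while
 * cycles_per_enq and cycles_per_deq divide the accumulated enqueue/dequeue TSC
 * deltas by the number of ops actually enqueued/dequeued. As an illustrative
 * (not measured) example, 50000 enqueue cycles spread over 1000 enqueued ops
 * would be reported as 50 Cycles/Enq.
 */
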
int
cperf_pmd_cyclecount_test_runner(void *test_ctx)
{
	struct pmd_cyclecount_state state = {0};
	const struct cperf_options *opts;
	uint16_t test_burst_size;
	uint8_t burst_size_idx = 0;

	state.ctx = test_ctx;
	opts = state.ctx->options;
	state.opts = opts;
	state.lcore = rte_lcore_id();
	state.linearize = 0;

	static RTE_ATOMIC(uint16_t) display_once;
	static bool warmup = true;

	/*
	 * We need a small delay to allow for hardware to process all the
	 * crypto operations. We can't automatically figure out what the delay
	 * should be, so we leave it up to the user (by default it's 0).
	 */
	state.delay = 1000 * opts->pmdcc_delay;

#ifdef CPERF_LINEARIZATION_ENABLE
	struct rte_cryptodev_info dev_info;

	/* Check if source mbufs require coalescing */
	if (opts->segments_sz < opts->max_buffer_size) {
		rte_cryptodev_info_get(state.ctx->dev_id, &dev_info);
		if ((dev_info.feature_flags &
				RTE_CRYPTODEV_FF_MBUF_SCATTER_GATHER) == 0) {
			state.linearize = 1;
		}
	}
#endif /* CPERF_LINEARIZATION_ENABLE */

	state.ctx->lcore_id = state.lcore;

	/* Get first size from range or list */
	if (opts->inc_burst_size != 0)
		test_burst_size = opts->min_burst_size;
	else
		test_burst_size = opts->burst_size_list[0];

	while (test_burst_size <= opts->max_burst_size) {
		/* do a benchmark run */
		if (pmd_cyclecount_bench_burst_sz(&state, test_burst_size))
			return -1;

		/*
		 * First run is always a warm up run.
		 */
		if (warmup) {
			warmup = false;
			continue;
		}

		uint16_t exp = 0;
		if (!opts->csv) {
			if (rte_atomic_compare_exchange_strong_explicit(
					&display_once, &exp, 1,
					rte_memory_order_relaxed,
					rte_memory_order_relaxed))
				printf(PRETTY_HDR_FMT, "lcore id", "Buf Size",
						"Burst Size", "Enqueued",
						"Dequeued", "Enq Retries",
						"Deq Retries", "Cycles/Op",
						"Cycles/Enq", "Cycles/Deq");

			printf(PRETTY_LINE_FMT, state.ctx->lcore_id,
					opts->test_buffer_size, test_burst_size,
					state.ops_enqd, state.ops_deqd,
					state.ops_enq_retries,
					state.ops_deq_retries,
					state.cycles_per_build,
					state.cycles_per_enq,
					state.cycles_per_deq);
		} else {
			if (rte_atomic_compare_exchange_strong_explicit(
					&display_once, &exp, 1,
					rte_memory_order_relaxed,
					rte_memory_order_relaxed))
				printf(CSV_HDR_FMT, "# lcore id", "Buf Size",
						"Burst Size", "Enqueued",
						"Dequeued", "Enq Retries",
						"Deq Retries", "Cycles/Op",
						"Cycles/Enq", "Cycles/Deq");

			printf(CSV_LINE_FMT, state.ctx->lcore_id,
					opts->test_buffer_size, test_burst_size,
					state.ops_enqd, state.ops_deqd,
					state.ops_enq_retries,
					state.ops_deq_retries,
					state.cycles_per_build,
					state.cycles_per_enq,
					state.cycles_per_deq);
		}

		/* Get next size from range or list */
		if (opts->inc_burst_size != 0)
			test_burst_size += opts->inc_burst_size;
		else {
			if (++burst_size_idx == opts->burst_size_count)
				break;
			test_burst_size = opts->burst_size_list[burst_size_idx];
		}
	}

	return 0;
}

void
cperf_pmd_cyclecount_test_destructor(void *arg)
{
	struct cperf_pmd_cyclecount_ctx *ctx = arg;

	if (ctx == NULL)
		return;

	cperf_pmd_cyclecount_test_free(ctx);
}