/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Intel Corporation
 */

#include <stdbool.h>

#include <rte_crypto.h>
#include <rte_cryptodev.h>
#include <rte_cycles.h>
#include <rte_malloc.h>

#include "cperf_ops.h"
#include "cperf_test_pmd_cyclecount.h"
#include "cperf_test_common.h"

#define PRETTY_HDR_FMT "%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s\n\n"
#define PRETTY_LINE_FMT "%12u%12u%12u%12u%12u%12u%12u%12.0f%12.0f%12.0f\n"
#define CSV_HDR_FMT "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"
#define CSV_LINE_FMT "%10u,%10u,%u,%u,%u,%u,%u,%.3f,%.3f,%.3f\n"

struct cperf_pmd_cyclecount_ctx {
	uint8_t dev_id;
	uint16_t qp_id;
	uint8_t lcore_id;

	struct rte_mempool *pool;
	struct rte_crypto_op **ops;
	struct rte_crypto_op **ops_processed;

	struct rte_cryptodev_sym_session *sess;

	cperf_populate_ops_t populate_ops;

	uint32_t src_buf_offset;
	uint32_t dst_buf_offset;

	const struct cperf_options *options;
	const struct cperf_test_vector *test_vector;
};

struct pmd_cyclecount_state {
	struct cperf_pmd_cyclecount_ctx *ctx;
	const struct cperf_options *opts;
	uint32_t lcore;
	uint64_t delay;
	int linearize;
	uint32_t ops_enqd;
	uint32_t ops_deqd;
	uint32_t ops_enq_retries;
	uint32_t ops_deq_retries;
	double cycles_per_build;
	double cycles_per_enq;
	double cycles_per_deq;
};

static const uint16_t iv_offset =
		sizeof(struct rte_crypto_op) + sizeof(struct rte_crypto_sym_op);

static void
cperf_pmd_cyclecount_test_free(struct cperf_pmd_cyclecount_ctx *ctx)
{
	if (!ctx)
		return;

	if (ctx->sess) {
#ifdef RTE_LIB_SECURITY
		if (ctx->options->op_type == CPERF_PDCP ||
				ctx->options->op_type == CPERF_DOCSIS) {
			struct rte_security_ctx *sec_ctx =
					(struct rte_security_ctx *)
					rte_cryptodev_get_sec_ctx(ctx->dev_id);
			rte_security_session_destroy(sec_ctx,
					(struct rte_security_session *)ctx->sess);
		} else
#endif
		{
			rte_cryptodev_sym_session_clear(ctx->dev_id, ctx->sess);
			rte_cryptodev_sym_session_free(ctx->sess);
		}
	}

	rte_mempool_free(ctx->pool);

	rte_free(ctx->ops);

	rte_free(ctx->ops_processed);

	rte_free(ctx);
}

void *
cperf_pmd_cyclecount_test_constructor(struct rte_mempool *sess_mp,
		struct rte_mempool *sess_priv_mp,
		uint8_t dev_id, uint16_t qp_id,
		const struct cperf_options *options,
		const struct cperf_test_vector *test_vector,
		const struct cperf_op_fns *op_fns)
{
	struct cperf_pmd_cyclecount_ctx *ctx = NULL;

	/* preallocate buffers for crypto ops as they can get quite big */
	size_t alloc_sz = sizeof(struct rte_crypto_op *) *
			options->nb_descriptors;

	ctx = rte_malloc(NULL, sizeof(struct cperf_pmd_cyclecount_ctx), 0);
	if (ctx == NULL)
		goto err;

	ctx->dev_id = dev_id;
	ctx->qp_id = qp_id;

	ctx->populate_ops = op_fns->populate_ops;
	ctx->options = options;
	ctx->test_vector = test_vector;

	/* IV goes at the end of the crypto operation */
	uint16_t iv_offset = sizeof(struct rte_crypto_op) +
			sizeof(struct rte_crypto_sym_op);

	ctx->sess = op_fns->sess_create(sess_mp, sess_priv_mp, dev_id, options,
			test_vector, iv_offset);
	if (ctx->sess == NULL)
		goto err;

	if (cperf_alloc_common_memory(options, test_vector, dev_id, qp_id, 0,
			&ctx->src_buf_offset, &ctx->dst_buf_offset,
			&ctx->pool) < 0)
		goto err;

	ctx->ops = rte_malloc("ops", alloc_sz, 0);
	if (!ctx->ops)
		goto err;
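
	/*
	 * Second array of crypto op pointers, sized to the full descriptor
	 * ring just like ctx->ops above.
	 */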
	ctx->ops_processed = rte_malloc("ops_processed", alloc_sz, 0);
	if (!ctx->ops_processed)
		goto err;

	return ctx;

err:
	cperf_pmd_cyclecount_test_free(ctx);

	return NULL;
}

/* benchmark alloc-build-free of ops */
static inline int
pmd_cyclecount_bench_ops(struct pmd_cyclecount_state *state, uint32_t cur_op,
		uint16_t test_burst_size)
{
	uint32_t iter_ops_left = state->opts->total_ops - cur_op;
	uint32_t iter_ops_needed =
			RTE_MIN(state->opts->nb_descriptors, iter_ops_left);
	uint32_t cur_iter_op;
	uint32_t imix_idx = 0;

	for (cur_iter_op = 0; cur_iter_op < iter_ops_needed;
			cur_iter_op += test_burst_size) {
		uint32_t burst_size = RTE_MIN(iter_ops_needed - cur_iter_op,
				test_burst_size);
		struct rte_crypto_op **ops = &state->ctx->ops[cur_iter_op];

		/* Allocate objects containing crypto operations and mbufs */
		if (rte_mempool_get_bulk(state->ctx->pool, (void **)ops,
				burst_size) != 0) {
			RTE_LOG(ERR, USER1,
					"Failed to allocate more crypto operations "
					"from the crypto operation pool.\n"
					"Consider increasing the pool size "
					"with --pool-sz\n");
			return -1;
		}

		/* Setup crypto op, attach mbuf etc */
		(state->ctx->populate_ops)(ops,
				state->ctx->src_buf_offset,
				state->ctx->dst_buf_offset,
				burst_size,
				state->ctx->sess, state->opts,
				state->ctx->test_vector, iv_offset,
				&imix_idx, NULL);

#ifdef CPERF_LINEARIZATION_ENABLE
		/* Check if source mbufs require coalescing */
		if (state->linearize) {
			uint8_t i;
			for (i = 0; i < burst_size; i++) {
				struct rte_mbuf *src = ops[i]->sym->m_src;
				rte_pktmbuf_linearize(src);
			}
		}
#endif /* CPERF_LINEARIZATION_ENABLE */
		rte_mempool_put_bulk(state->ctx->pool, (void **)ops,
				burst_size);
	}

	return 0;
}

/* allocate and build ops (no free) */
static int
pmd_cyclecount_build_ops(struct pmd_cyclecount_state *state,
		uint32_t iter_ops_needed, uint16_t test_burst_size)
{
	uint32_t cur_iter_op;
	uint32_t imix_idx = 0;

	for (cur_iter_op = 0; cur_iter_op < iter_ops_needed;
			cur_iter_op += test_burst_size) {
		uint32_t burst_size = RTE_MIN(
				iter_ops_needed - cur_iter_op, test_burst_size);
		struct rte_crypto_op **ops = &state->ctx->ops[cur_iter_op];

		/* Allocate objects containing crypto operations and mbufs */
		if (rte_mempool_get_bulk(state->ctx->pool, (void **)ops,
				burst_size) != 0) {
			RTE_LOG(ERR, USER1,
					"Failed to allocate more crypto operations "
					"from the crypto operation pool.\n"
					"Consider increasing the pool size "
					"with --pool-sz\n");
			return -1;
		}

		/* Setup crypto op, attach mbuf etc */
		(state->ctx->populate_ops)(ops,
				state->ctx->src_buf_offset,
				state->ctx->dst_buf_offset,
				burst_size,
				state->ctx->sess, state->opts,
				state->ctx->test_vector, iv_offset,
				&imix_idx, NULL);
	}
	return 0;
}

/* benchmark enqueue, returns number of ops enqueued */
static uint32_t
pmd_cyclecount_bench_enq(struct pmd_cyclecount_state *state,
		uint32_t iter_ops_needed, uint16_t test_burst_size)
{
	/* Enqueue full descriptor ring of ops on crypto device */
	uint32_t cur_iter_op = 0;

	while (cur_iter_op < iter_ops_needed) {
		uint32_t burst_size = RTE_MIN(iter_ops_needed - cur_iter_op,
				test_burst_size);
		struct rte_crypto_op **ops = &state->ctx->ops[cur_iter_op];
		uint32_t burst_enqd;
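
		/*
		 * rte_cryptodev_enqueue_burst() may accept fewer ops than
		 * requested (e.g. when the queue pair is full) and returns
		 * the number actually enqueued.
		 */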
		burst_enqd = rte_cryptodev_enqueue_burst(state->ctx->dev_id,
				state->ctx->qp_id, ops, burst_size);

		/* if we couldn't enqueue anything, the queue is full */
		if (!burst_enqd) {
			/* don't try to dequeue anything we didn't enqueue */
			return cur_iter_op;
		}

		if (burst_enqd < burst_size)
			state->ops_enq_retries++;
		state->ops_enqd += burst_enqd;
		cur_iter_op += burst_enqd;
	}
	return iter_ops_needed;
}

/* benchmark dequeue */
static void
pmd_cyclecount_bench_deq(struct pmd_cyclecount_state *state,
		uint32_t iter_ops_needed, uint16_t test_burst_size)
{
	/* Dequeue full descriptor ring of ops on crypto device */
	uint32_t cur_iter_op = 0;

	while (cur_iter_op < iter_ops_needed) {
		uint32_t burst_size = RTE_MIN(iter_ops_needed - cur_iter_op,
				test_burst_size);
		struct rte_crypto_op **ops_processed =
				&state->ctx->ops[cur_iter_op];
		uint32_t burst_deqd;

		burst_deqd = rte_cryptodev_dequeue_burst(state->ctx->dev_id,
				state->ctx->qp_id, ops_processed, burst_size);

		if (burst_deqd < burst_size)
			state->ops_deq_retries++;
		state->ops_deqd += burst_deqd;
		cur_iter_op += burst_deqd;
	}
}

/* run benchmark per burst size */
static inline int
pmd_cyclecount_bench_burst_sz(
		struct pmd_cyclecount_state *state, uint16_t test_burst_size)
{
	uint64_t tsc_start;
	uint64_t tsc_end;
	uint64_t tsc_op;
	uint64_t tsc_enq;
	uint64_t tsc_deq;
	uint32_t cur_op;

	/* reset all counters */
	tsc_enq = 0;
	tsc_deq = 0;
	state->ops_enqd = 0;
	state->ops_enq_retries = 0;
	state->ops_deqd = 0;
	state->ops_deq_retries = 0;

	/*
	 * Benchmark crypto op alloc-build-free separately.
	 */
	tsc_start = rte_rdtsc_precise();

	for (cur_op = 0; cur_op < state->opts->total_ops;
			cur_op += state->opts->nb_descriptors) {
		if (unlikely(pmd_cyclecount_bench_ops(
				state, cur_op, test_burst_size)))
			return -1;
	}

	tsc_end = rte_rdtsc_precise();
	tsc_op = tsc_end - tsc_start;

	/*
	 * Hardware acceleration cyclecount benchmarking loop.
	 *
	 * We're benchmarking raw enq/deq performance by filling up the device
	 * queue, so we never get any failed enqs unless the driver won't accept
	 * the exact number of descriptors we requested, or the driver won't
	 * wrap around the end of the TX ring. However, since we're only
	 * dequeuing once we've filled up the queue, we have to benchmark it
	 * piecemeal and then average out the results.
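	 *
	 * The per-op figures reported at the end are therefore averages:
	 * tsc_enq over the ops actually enqueued and tsc_deq over the ops
	 * actually dequeued, rather than over total_ops.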
	 */
	cur_op = 0;
	while (cur_op < state->opts->total_ops) {
		uint32_t iter_ops_left = state->opts->total_ops - cur_op;
		uint32_t iter_ops_needed = RTE_MIN(
				state->opts->nb_descriptors, iter_ops_left);
		uint32_t iter_ops_allocd = iter_ops_needed;

		/* allocate and build ops */
		if (unlikely(pmd_cyclecount_build_ops(state, iter_ops_needed,
				test_burst_size)))
			return -1;

		tsc_start = rte_rdtsc_precise();

		/* fill up TX ring */
		iter_ops_needed = pmd_cyclecount_bench_enq(state,
				iter_ops_needed, test_burst_size);

		tsc_end = rte_rdtsc_precise();

		tsc_enq += tsc_end - tsc_start;

		/* allow for HW to catch up */
		if (state->delay)
			rte_delay_us_block(state->delay);

		tsc_start = rte_rdtsc_precise();

		/* drain RX ring */
		pmd_cyclecount_bench_deq(state, iter_ops_needed,
				test_burst_size);

		tsc_end = rte_rdtsc_precise();

		tsc_deq += tsc_end - tsc_start;

		cur_op += iter_ops_needed;

		/*
		 * we may not have processed all ops that we allocated, so
		 * free everything we've allocated.
		 */
		rte_mempool_put_bulk(state->ctx->pool,
				(void **)state->ctx->ops, iter_ops_allocd);
	}

	state->cycles_per_build = (double)tsc_op / state->opts->total_ops;
	state->cycles_per_enq = (double)tsc_enq / state->ops_enqd;
	state->cycles_per_deq = (double)tsc_deq / state->ops_deqd;

	return 0;
}

int
cperf_pmd_cyclecount_test_runner(void *test_ctx)
{
	struct pmd_cyclecount_state state = {0};
	const struct cperf_options *opts;
	uint16_t test_burst_size;
	uint8_t burst_size_idx = 0;

	state.ctx = test_ctx;
	opts = state.ctx->options;
	state.opts = opts;
	state.lcore = rte_lcore_id();
	state.linearize = 0;

	static uint16_t display_once;
	static bool warmup = true;

	/*
	 * We need a small delay to allow for hardware to process all the crypto
	 * operations. We can't automatically figure out what the delay should
	 * be, so we leave it up to the user (by default it's 0).
	 */
	state.delay = 1000 * opts->pmdcc_delay;

#ifdef CPERF_LINEARIZATION_ENABLE
	struct rte_cryptodev_info dev_info;

	/* Check if source mbufs require coalescing */
	if (opts->segments_sz < opts->max_buffer_size) {
		rte_cryptodev_info_get(state.ctx->dev_id, &dev_info);
		if ((dev_info.feature_flags &
				RTE_CRYPTODEV_FF_MBUF_SCATTER_GATHER) == 0) {
			state.linearize = 1;
		}
	}
#endif /* CPERF_LINEARIZATION_ENABLE */

	state.ctx->lcore_id = state.lcore;

	/* Get first size from range or list */
	if (opts->inc_burst_size != 0)
		test_burst_size = opts->min_burst_size;
	else
		test_burst_size = opts->burst_size_list[0];

	while (test_burst_size <= opts->max_burst_size) {
		/* do a benchmark run */
		if (pmd_cyclecount_bench_burst_sz(&state, test_burst_size))
			return -1;

		/*
		 * First run is always a warm up run.
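		 * Its results are discarded and the same burst size is
		 * measured again before anything is printed.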
		 */
		if (warmup) {
			warmup = false;
			continue;
		}

		uint16_t exp = 0;
		if (!opts->csv) {
			if (__atomic_compare_exchange_n(&display_once, &exp, 1, 0,
					__ATOMIC_RELAXED, __ATOMIC_RELAXED))
				printf(PRETTY_HDR_FMT, "lcore id", "Buf Size",
						"Burst Size", "Enqueued",
						"Dequeued", "Enq Retries",
						"Deq Retries", "Cycles/Op",
						"Cycles/Enq", "Cycles/Deq");

			printf(PRETTY_LINE_FMT, state.ctx->lcore_id,
					opts->test_buffer_size, test_burst_size,
					state.ops_enqd, state.ops_deqd,
					state.ops_enq_retries,
					state.ops_deq_retries,
					state.cycles_per_build,
					state.cycles_per_enq,
					state.cycles_per_deq);
		} else {
			if (__atomic_compare_exchange_n(&display_once, &exp, 1, 0,
					__ATOMIC_RELAXED, __ATOMIC_RELAXED))
				printf(CSV_HDR_FMT, "# lcore id", "Buf Size",
						"Burst Size", "Enqueued",
						"Dequeued", "Enq Retries",
						"Deq Retries", "Cycles/Op",
						"Cycles/Enq", "Cycles/Deq");

			printf(CSV_LINE_FMT, state.ctx->lcore_id,
					opts->test_buffer_size, test_burst_size,
					state.ops_enqd, state.ops_deqd,
					state.ops_enq_retries,
					state.ops_deq_retries,
					state.cycles_per_build,
					state.cycles_per_enq,
					state.cycles_per_deq);
		}

		/* Get next size from range or list */
		if (opts->inc_burst_size != 0)
			test_burst_size += opts->inc_burst_size;
		else {
			if (++burst_size_idx == opts->burst_size_count)
				break;
			test_burst_size = opts->burst_size_list[burst_size_idx];
		}
	}

	return 0;
}

void
cperf_pmd_cyclecount_test_destructor(void *arg)
{
	struct cperf_pmd_cyclecount_ctx *ctx = arg;

	if (ctx == NULL)
		return;

	cperf_pmd_cyclecount_test_free(ctx);
}