/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdlib.h>

#include <rte_malloc.h>
#include <rte_eal.h>
#include <rte_log.h>
#include <rte_cycles.h>
#include <rte_spinlock.h>
#include <rte_compressdev.h>

#include "comp_perf_test_cyclecount.h"

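/*
 * Per-queue-pair context for the cycle-count test. It embeds the verify-test
 * context (options, buffers, device/queue ids) and accumulates retry counters
 * and TSC cycle totals for the op-setup, enqueue and dequeue phases.
 */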
struct cperf_cyclecount_ctx {
	struct cperf_verify_ctx ver;

	uint32_t ops_enq_retries;
	uint32_t ops_deq_retries;

	uint64_t duration_op;
	uint64_t duration_enq;
	uint64_t duration_deq;
};

void
cperf_cyclecount_test_destructor(void *arg)
{
	struct cperf_cyclecount_ctx *ctx = arg;

	if (arg) {
		comp_perf_free_memory(ctx->ver.options, &ctx->ver.mem);
		rte_free(arg);
	}
}

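/*
 * Allocate the test context. The embedded verify-test state is reused for
 * memory allocation and buffer preparation, with its output silenced so that
 * only the cycle-count report is printed. The harness is expected to call
 * this constructor once per queue pair, the runner on the worker lcore and
 * the destructor to release the resources.
 */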
void *
cperf_cyclecount_test_constructor(uint8_t dev_id, uint16_t qp_id,
		struct comp_test_data *options)
{
	struct cperf_cyclecount_ctx *ctx = NULL;

	ctx = rte_malloc(NULL, sizeof(struct cperf_cyclecount_ctx), 0);

	if (ctx == NULL)
		return NULL;

	ctx->ver.mem.dev_id = dev_id;
	ctx->ver.mem.qp_id = qp_id;
	ctx->ver.options = options;
	ctx->ver.silent = 1; /* verification part will be silent */

	if (!comp_perf_allocate_memory(ctx->ver.options, &ctx->ver.mem)
			&& !prepare_bufs(ctx->ver.options, &ctx->ver.mem))
		return ctx;

	cperf_cyclecount_test_destructor(ctx);
	return NULL;
}

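/*
 * Dry run of the operation setup path: allocate the burst of ops, attach the
 * source/destination mbufs and fill in the op fields exactly as the measured
 * loop does, but skip the real enqueue/dequeue calls. The caller times this
 * function to isolate the per-op setup cost.
 */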
static int
cperf_cyclecount_op_setup(struct rte_comp_op **ops,
		struct cperf_cyclecount_ctx *ctx,
		struct rte_mbuf **input_bufs,
		struct rte_mbuf **output_bufs,
		void *priv_xform,
		uint32_t out_seg_sz)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;

	uint32_t i, iter, num_iter;
	int res = 0;
	uint16_t ops_needed;

	num_iter = test_data->num_iter;

	for (iter = 0; iter < num_iter; iter++) {
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_enq_ops = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
					test_data->burst_sz);
			ops_needed = num_ops;

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)ops,
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
					"Cyclecount: could not allocate enough operations\n");
				res = -1;
				goto end;
			}

			for (i = 0; i < ops_needed; i++) {

				/* Calculate next buffer to attach to operation */
				uint32_t buf_id = total_enq_ops + i;
				uint16_t op_id = i;

				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			/*
			 * E N Q U E U I N G
			 * assuming that all ops are enqueued,
			 * instead of the real enqueue operation
			 */
			num_enq = num_ops;

			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			/*
			 * D E Q U E U I N G
			 * assuming that all ops are dequeued,
			 * instead of the real dequeue operation
			 */
			num_deq = num_ops;

			rte_mempool_put_bulk(mem->op_pool,
					(void **)ops, num_deq);
		}
	}
	return res;
end:
	/*
	 * The failed rte_mempool_get_bulk() call did not take any ops from
	 * the pool, and the ops array is owned (and freed) by the caller,
	 * so there is nothing to release here.
	 */
	return res;
}

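/*
 * Run one cycle-count measurement for either compression or decompression:
 * create the private xform, time cperf_cyclecount_op_setup() to get the
 * per-op setup cost, then run the real enqueue/dequeue loop while adding up
 * TSC cycles and retry counts in the context.
 */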
static int
main_loop(struct cperf_cyclecount_ctx *ctx, enum rte_comp_xform_type type)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;
	uint8_t dev_id = mem->dev_id;
	uint32_t i, iter, num_iter;
	struct rte_comp_op **ops, **deq_ops;
	void *priv_xform = NULL;
	struct rte_comp_xform xform;
	struct rte_mbuf **input_bufs, **output_bufs;
	int ret, res = 0;
	int allocated = 0;
	uint32_t out_seg_sz;

	uint64_t tsc_start, tsc_end, tsc_duration;

	if (test_data == NULL || !test_data->burst_sz) {
		RTE_LOG(ERR, USER1, "Unknown burst size\n");
		return -1;
	}
	ctx->duration_enq = 0;
	ctx->duration_deq = 0;
	ctx->ops_enq_retries = 0;
	ctx->ops_deq_retries = 0;

	/* one array for both enqueue and dequeue */
	ops = rte_zmalloc_socket(NULL,
		(test_data->burst_sz + mem->total_bufs) *
		sizeof(struct rte_comp_op *),
		0, rte_socket_id());

	if (ops == NULL) {
		RTE_LOG(ERR, USER1,
			"Can't allocate memory for ops structures\n");
		return -1;
	}

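	/*
	 * The first burst_sz entries of the array are used for enqueue, the
	 * remaining total_bufs entries (deq_ops) for dequeue, matching the
	 * single allocation above.
	 */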
	deq_ops = &ops[test_data->burst_sz];

	if (type == RTE_COMP_COMPRESS) {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_COMPRESS,
			.compress = {
				.algo = test_data->test_algo,
				.level = test_data->level,
				.window_size = test_data->window_sz,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		if (test_data->test_algo == RTE_COMP_ALGO_DEFLATE)
			xform.compress.deflate.huffman = test_data->huffman_enc;
		else if (test_data->test_algo == RTE_COMP_ALGO_LZ4)
			xform.compress.lz4.flags = test_data->lz4_flags;
		input_bufs = mem->decomp_bufs;
		output_bufs = mem->comp_bufs;
		out_seg_sz = test_data->out_seg_sz;
	} else {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_DECOMPRESS,
			.decompress = {
				.algo = test_data->test_algo,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.window_size = test_data->window_sz,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		if (test_data->test_algo == RTE_COMP_ALGO_LZ4)
			xform.decompress.lz4.flags = test_data->lz4_flags;
		input_bufs = mem->comp_bufs;
		output_bufs = mem->decomp_bufs;
		out_seg_sz = test_data->seg_sz;
	}

	/* Create private xform */
	if (rte_compressdev_private_xform_create(dev_id, &xform,
			&priv_xform) < 0) {
		RTE_LOG(ERR, USER1, "Private xform could not be created\n");
		res = -1;
		goto end;
	}

	tsc_start = rte_rdtsc_precise();
	ret = cperf_cyclecount_op_setup(ops,
			ctx,
			input_bufs,
			output_bufs,
			priv_xform,
			out_seg_sz);

	tsc_end = rte_rdtsc_precise();

	/* ret value check postponed a bit to cancel extra 'if' bias */
	if (ret < 0) {
		RTE_LOG(ERR, USER1, "Setup function failed\n");
		res = -1;
		goto end;
	}

	tsc_duration = tsc_end - tsc_start;
	ctx->duration_op = tsc_duration;

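	/*
	 * Measured loop: for every iteration enqueue/dequeue all buffers in
	 * bursts, timing only the rte_compressdev_enqueue_burst() and
	 * rte_compressdev_dequeue_burst() calls.
	 */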
	num_iter = test_data->num_iter;
	for (iter = 0; iter < num_iter; iter++) {
		uint32_t total_ops = mem->total_bufs;
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_deq_ops = 0;
		uint32_t total_enq_ops = 0;
		uint16_t ops_unused = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
					test_data->burst_sz);
			uint16_t ops_needed = num_ops - ops_unused;

			/*
			 * Move the unused operations from the previous
			 * enqueue_burst call to the front, to maintain order
			 */
			if ((ops_unused > 0) && (num_enq > 0)) {
				size_t nb_b_to_mov =
					ops_unused * sizeof(struct rte_comp_op *);

				memmove(ops, &ops[num_enq], nb_b_to_mov);
			}

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)&ops[ops_unused],
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
					"Could not allocate enough operations\n");
				res = -1;
				goto end;
			}
			allocated += ops_needed;

			for (i = 0; i < ops_needed; i++) {
				/*
				 * Calculate next buffer to attach to operation
				 */
				uint32_t buf_id = total_enq_ops + i +
						ops_unused;
				uint16_t op_id = ops_unused + i;
				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_enq = rte_compressdev_enqueue_burst(dev_id,
					mem->qp_id, ops,
					num_ops);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_enq += tsc_duration;

			if (num_enq < num_ops)
				ctx->ops_enq_retries++;

			if (test_data->cyclecount_delay)
				rte_delay_us_block(test_data->cyclecount_delay);

			if (num_enq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.enqueue_err_count) {
					res = -1;
					goto end;
				}
			}

			ops_unused = num_ops - num_enq;
			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
					mem->qp_id,
					deq_ops,
					allocated);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;

			if (num_deq < allocated)
				ctx->ops_deq_retries++;

			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
							RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1,
							"Some operations were not successful\n");
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
							RTE_MIN(remaining_data,
								out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					(void **)deq_ops, num_deq);
			allocated -= num_deq;
		}

		/* Dequeue the last operations */
		while (total_deq_ops < total_ops) {
			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
					mem->qp_id,
					deq_ops,
					test_data->burst_sz);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;
			ctx->ops_deq_retries++;

			if (num_deq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.dequeue_err_count) {
					res = -1;
					goto end;
				}
			}
			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
							RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1,
							"Some operations were not successful\n");
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
							RTE_MIN(remaining_data,
								out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					(void **)deq_ops, num_deq);
			allocated -= num_deq;
		}
	}
	allocated = 0;

end:
	if (allocated)
		rte_mempool_put_bulk(mem->op_pool, (void **)ops, allocated);
	rte_compressdev_private_xform_free(dev_id, priv_xform);
	rte_free(ops);

	if (test_data->perf_comp_force_stop) {
		RTE_LOG(ERR, USER1,
			"lcore: %d Perf. test has been aborted by user\n",
			mem->lcore_id);
		res = -1;
	}
	return res;
}

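/*
 * Per-lcore entry point: run the verify test first to validate the data path,
 * then run main_loop() twice per direction (the first pass only warms up the
 * caches and its results are discarded) and print per-op retry and cycle
 * counts under a spinlock so that reports from different lcores do not
 * interleave.
 */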
int
cperf_cyclecount_test_runner(void *test_ctx)
{
	struct cperf_cyclecount_ctx *ctx = test_ctx;
	struct comp_test_data *test_data = ctx->ver.options;
	uint32_t lcore = rte_lcore_id();
	static uint16_t display_once;
	static rte_spinlock_t print_spinlock;
	int i;

	uint32_t ops_enq_retries_comp;
	uint32_t ops_deq_retries_comp;

	uint32_t ops_enq_retries_decomp;
	uint32_t ops_deq_retries_decomp;

	uint32_t duration_setup_per_op;

	uint32_t duration_enq_per_op_comp;
	uint32_t duration_deq_per_op_comp;

	uint32_t duration_enq_per_op_decomp;
	uint32_t duration_deq_per_op_decomp;

	ctx->ver.mem.lcore_id = lcore;

	uint16_t exp = 0;
	/*
	 * printing information about current compression thread
	 */
	if (rte_atomic_compare_exchange_strong_explicit(&ctx->ver.mem.print_info_once, &exp,
				1, rte_memory_order_relaxed, rte_memory_order_relaxed))
		printf(" lcore: %u,"
				" driver name: %s,"
				" device name: %s,"
				" device id: %u,"
				" socket id: %u,"
				" queue pair id: %u\n",
			lcore,
			ctx->ver.options->driver_name,
			rte_compressdev_name_get(ctx->ver.mem.dev_id),
			ctx->ver.mem.dev_id,
			rte_compressdev_socket_id(ctx->ver.mem.dev_id),
			ctx->ver.mem.qp_id);

	/*
	 * First the verification part is needed
	 */
	if (cperf_verify_test_runner(&ctx->ver))
		return EXIT_FAILURE;

	if (test_data->test_op & COMPRESS) {
		/*
		 * Run the test twice, discarding the first performance
		 * results, before the cache is warmed up
		 */
		for (i = 0; i < 2; i++) {
			if (main_loop(ctx, RTE_COMP_COMPRESS) < 0)
				return EXIT_FAILURE;
		}

		ops_enq_retries_comp = ctx->ops_enq_retries;
		ops_deq_retries_comp = ctx->ops_deq_retries;

		duration_enq_per_op_comp = ctx->duration_enq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
		duration_deq_per_op_comp = ctx->duration_deq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
	} else {
		ops_enq_retries_comp = 0;
		ops_deq_retries_comp = 0;

		duration_enq_per_op_comp = 0;
		duration_deq_per_op_comp = 0;
	}

	if (test_data->test_op & DECOMPRESS) {
		/*
		 * Run the test twice, discarding the first performance
		 * results, before the cache is warmed up
		 */
		for (i = 0; i < 2; i++) {
			if (main_loop(ctx, RTE_COMP_DECOMPRESS) < 0)
				return EXIT_FAILURE;
		}

		ops_enq_retries_decomp = ctx->ops_enq_retries;
		ops_deq_retries_decomp = ctx->ops_deq_retries;

		duration_enq_per_op_decomp = ctx->duration_enq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
		duration_deq_per_op_decomp = ctx->duration_deq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
	} else {
		ops_enq_retries_decomp = 0;
		ops_deq_retries_decomp = 0;

		duration_enq_per_op_decomp = 0;
		duration_deq_per_op_decomp = 0;
	}

	duration_setup_per_op = ctx->duration_op /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

	/* R E P O R T processing */
	rte_spinlock_lock(&print_spinlock);

	if (display_once == 0) {
		display_once = 1;

		printf("\nLegend for the table\n"
			" - Retries section: number of retries for the following operations:\n"
			"   [C-e] - compression enqueue\n"
			"   [C-d] - compression dequeue\n"
			"   [D-e] - decompression enqueue\n"
			"   [D-d] - decompression dequeue\n"
			" - Cycles section: number of cycles per 'op' for the following operations:\n"
			"   setup/op - memory allocation, op configuration and memory deallocation\n"
			"   [C-e] - compression enqueue\n"
			"   [C-d] - compression dequeue\n"
			"   [D-e] - decompression enqueue\n"
			"   [D-d] - decompression dequeue\n\n");

		printf("\n%12s%6s%12s%17s",
			"lcore id", "Level", "Comp size", "Comp ratio [%]");

		printf(" |%10s %6s %8s %6s %8s",
			" Retries:",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");

		printf(" |%9s %9s %9s %9s %9s %9s\n",
			" Cycles:",
			"setup/op",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");
	}

	printf("%12u"
		"%6u"
		"%12zu"
		"%17.2f",
		ctx->ver.mem.lcore_id,
		test_data->level,
		ctx->ver.comp_data_sz,
		ctx->ver.ratio);

	printf(" |%10s %6u %8u %6u %8u",
		" ",
		ops_enq_retries_comp,
		ops_deq_retries_comp,
		ops_enq_retries_decomp,
		ops_deq_retries_decomp);

	printf(" |%9s %9u %9u %9u %9u %9u\n",
		" ",
		duration_setup_per_op,
		duration_enq_per_op_comp,
		duration_deq_per_op_comp,
		duration_enq_per_op_decomp,
		duration_deq_per_op_decomp);

	rte_spinlock_unlock(&print_spinlock);

	return EXIT_SUCCESS;
}