/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdlib.h>

#include <rte_malloc.h>
#include <rte_eal.h>
#include <rte_log.h>
#include <rte_cycles.h>
#include <rte_spinlock.h>
#include <rte_compressdev.h>

#include "comp_perf_test_cyclecount.h"

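/*
 * Per-queue-pair test context: embeds the verify-test context (which is run
 * first, silently) and accumulates the enqueue/dequeue retry counters and
 * cycle counts reported by the cyclecount test.
 */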
struct cperf_cyclecount_ctx {
	struct cperf_verify_ctx ver;

	uint32_t ops_enq_retries;
	uint32_t ops_deq_retries;

	uint64_t duration_op;
	uint64_t duration_enq;
	uint64_t duration_deq;
};

void
cperf_cyclecount_test_destructor(void *arg)
{
	struct cperf_cyclecount_ctx *ctx = arg;

	if (arg) {
		comp_perf_free_memory(ctx->ver.options, &ctx->ver.mem);
		rte_free(arg);
	}
}

void *
cperf_cyclecount_test_constructor(uint8_t dev_id, uint16_t qp_id,
		struct comp_test_data *options)
{
	struct cperf_cyclecount_ctx *ctx = NULL;

	ctx = rte_malloc(NULL, sizeof(struct cperf_cyclecount_ctx), 0);

	if (ctx == NULL)
		return NULL;

	ctx->ver.mem.dev_id = dev_id;
	ctx->ver.mem.qp_id = qp_id;
	ctx->ver.options = options;
	ctx->ver.silent = 1; /* ver. part will be silent */

	if (!comp_perf_allocate_memory(ctx->ver.options, &ctx->ver.mem)
			&& !prepare_bufs(ctx->ver.options, &ctx->ver.mem))
		return ctx;

	cperf_cyclecount_test_destructor(ctx);
	return NULL;
}

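/*
 * Mirror the per-burst op allocation and configuration done in main_loop(),
 * but skip the real enqueue/dequeue calls. The caller times this function
 * with rte_rdtsc_precise(), so only the op setup overhead is measured; it is
 * reported as the "setup/op" column.
 */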
static int
cperf_cyclecount_op_setup(struct rte_comp_op **ops,
				 struct cperf_cyclecount_ctx *ctx,
				 struct rte_mbuf **input_bufs,
				 struct rte_mbuf **output_bufs,
				 void *priv_xform,
				 uint32_t out_seg_sz)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;

	uint32_t i, iter, num_iter;
	int res = 0;
	uint16_t ops_needed;

	num_iter = test_data->num_iter;

	for (iter = 0; iter < num_iter; iter++) {
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_enq_ops = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
						   test_data->burst_sz);
			ops_needed = num_ops;

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)ops,
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
				      "Cyclecount: could not allocate enough operations\n");
				/*
				 * The failed get_bulk allocated nothing and
				 * every previous burst has already returned
				 * its ops; the ops array itself is owned and
				 * freed by the caller, so no cleanup is
				 * needed here.
				 */
				return -1;
			}

			for (i = 0; i < ops_needed; i++) {

				/* Calculate next buffer to attach to operation */
				uint32_t buf_id = total_enq_ops + i;
				uint16_t op_id = i;

				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			/*
			 * E N Q U E U I N G
			 * Assume that all ops were enqueued, instead of doing
			 * the real enqueue operation.
			 */
			num_enq = num_ops;

			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			/*
			 * D E Q U E U I N G
			 * Assume that all ops were dequeued, instead of doing
			 * the real dequeue operation.
			 */
			num_deq = num_ops;

			rte_mempool_put_bulk(mem->op_pool,
					     (void **)ops, num_deq);
		}
	}
	return res;
}

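/*
 * Run one full cyclecount pass for the given xform type: build the
 * (de)compression xform, create the private xform, time the setup-only pass
 * (cperf_cyclecount_op_setup), and then run the measured enqueue/dequeue
 * loop, accumulating cycle counts and retry counters in the context.
 */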
static int
main_loop(struct cperf_cyclecount_ctx *ctx, enum rte_comp_xform_type type)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;
	uint8_t dev_id = mem->dev_id;
	uint32_t i, iter, num_iter;
	struct rte_comp_op **ops, **deq_ops;
	void *priv_xform = NULL;
	struct rte_comp_xform xform;
	struct rte_mbuf **input_bufs, **output_bufs;
	int ret, res = 0;
	int allocated = 0;
	uint32_t out_seg_sz;

	uint64_t tsc_start, tsc_end, tsc_duration;

	if (test_data == NULL || !test_data->burst_sz) {
		RTE_LOG(ERR, USER1, "Unknown burst size\n");
		return -1;
	}
	ctx->duration_enq = 0;
	ctx->duration_deq = 0;
	ctx->ops_enq_retries = 0;
	ctx->ops_deq_retries = 0;

	/* one array for both enqueue and dequeue */
	ops = rte_zmalloc_socket(NULL,
		(test_data->burst_sz + mem->total_bufs) *
		sizeof(struct rte_comp_op *),
		0, rte_socket_id());

	if (ops == NULL) {
		RTE_LOG(ERR, USER1,
			"Can't allocate memory for ops structures\n");
		return -1;
	}

	deq_ops = &ops[test_data->burst_sz];

	if (type == RTE_COMP_COMPRESS) {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_COMPRESS,
			.compress = {
				.algo = RTE_COMP_ALGO_DEFLATE,
				.deflate.huffman = test_data->huffman_enc,
				.level = test_data->level,
				.window_size = test_data->window_sz,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		input_bufs = mem->decomp_bufs;
		output_bufs = mem->comp_bufs;
		out_seg_sz = test_data->out_seg_sz;
	} else {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_DECOMPRESS,
			.decompress = {
				.algo = RTE_COMP_ALGO_DEFLATE,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.window_size = test_data->window_sz,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		input_bufs = mem->comp_bufs;
		output_bufs = mem->decomp_bufs;
		out_seg_sz = test_data->seg_sz;
	}

	/* Create private xform */
	if (rte_compressdev_private_xform_create(dev_id, &xform,
						&priv_xform) < 0) {
		RTE_LOG(ERR, USER1, "Private xform could not be created\n");
		res = -1;
		goto end;
	}

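	/*
	 * Time the setup-only pass: everything the measured loop below does
	 * except the real enqueue/dequeue calls. The result ends up in
	 * duration_op and is reported per op as "setup/op".
	 */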
	tsc_start = rte_rdtsc_precise();
	ret = cperf_cyclecount_op_setup(ops,
				ctx,
				input_bufs,
				output_bufs,
				priv_xform,
				out_seg_sz);

	tsc_end = rte_rdtsc_precise();

	/* ret value check postponed a bit to cancel extra 'if' bias */
	if (ret < 0) {
		RTE_LOG(ERR, USER1, "Setup function failed\n");
		res = -1;
		goto end;
	}

	tsc_duration = tsc_end - tsc_start;
	ctx->duration_op = tsc_duration;

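	/*
	 * Measured loop: same burst pattern as cperf_cyclecount_op_setup(),
	 * but with the real enqueue/dequeue calls, each timed separately with
	 * rte_rdtsc_precise() and accumulated in duration_enq/duration_deq.
	 */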
	num_iter = test_data->num_iter;
	for (iter = 0; iter < num_iter; iter++) {
		uint32_t total_ops = mem->total_bufs;
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_deq_ops = 0;
		uint32_t total_enq_ops = 0;
		uint16_t ops_unused = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
						   test_data->burst_sz);
			uint16_t ops_needed = num_ops - ops_unused;

			/*
			 * Move the unused operations from the previous
			 * enqueue_burst call to the front, to maintain order
			 */
			if ((ops_unused > 0) && (num_enq > 0)) {
				size_t nb_b_to_mov =
				      ops_unused * sizeof(struct rte_comp_op *);

				memmove(ops, &ops[num_enq], nb_b_to_mov);
			}

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)&ops[ops_unused],
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
				      "Could not allocate enough operations\n");
				res = -1;
				goto end;
			}
			allocated += ops_needed;

			for (i = 0; i < ops_needed; i++) {
				/*
				 * Calculate next buffer to attach to operation
				 */
				uint32_t buf_id = total_enq_ops + i +
						ops_unused;
				uint16_t op_id = ops_unused + i;
				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_enq = rte_compressdev_enqueue_burst(dev_id,
								mem->qp_id, ops,
								num_ops);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_enq += tsc_duration;

			if (num_enq < num_ops)
				ctx->ops_enq_retries++;

			if (test_data->cyclecount_delay)
				rte_delay_us_block(test_data->cyclecount_delay);

			if (num_enq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.enqueue_err_count) {
					res = -1;
					goto end;
				}
			}

			ops_unused = num_ops - num_enq;
			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
							   mem->qp_id,
							   deq_ops,
							   allocated);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;

			if (num_deq < allocated)
				ctx->ops_deq_retries++;

			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
						RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1, "Some operations were not successful\n");
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
							RTE_MIN(remaining_data,
							     out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					     (void **)deq_ops, num_deq);
			allocated -= num_deq;
		}

		/* Dequeue the last operations */
		while (total_deq_ops < total_ops) {
			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
						mem->qp_id,
						deq_ops,
						test_data->burst_sz);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;
			ctx->ops_deq_retries++;

			if (num_deq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.dequeue_err_count) {
					res = -1;
					goto end;
				}
			}
			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
						RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1, "Some operations were not successful\n");
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
						RTE_MIN(remaining_data,
							out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					     (void **)deq_ops, num_deq);
			allocated -= num_deq;
		}
	}
	allocated = 0;

end:
	if (allocated)
		rte_mempool_put_bulk(mem->op_pool, (void **)ops, allocated);
	rte_compressdev_private_xform_free(dev_id, priv_xform);
	rte_free(ops);

	if (test_data->perf_comp_force_stop) {
		RTE_LOG(ERR, USER1,
		      "lcore: %d Perf. test has been aborted by user\n",
			mem->lcore_id);
		res = -1;
	}
	return res;
}

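/*
 * Test runner, executed once per worker lcore/queue pair. It first runs the
 * verify test silently (to produce the compressed data size and ratio), then
 * runs the compress and decompress cycle-count loops twice each, keeping only
 * the second, cache-warm run, and finally prints one row of the report under
 * a spinlock.
 */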
int
cperf_cyclecount_test_runner(void *test_ctx)
{
	struct cperf_cyclecount_ctx *ctx = test_ctx;
	struct comp_test_data *test_data = ctx->ver.options;
	uint32_t lcore = rte_lcore_id();
	static uint16_t display_once;
	static rte_spinlock_t print_spinlock;
	int i;

	uint32_t ops_enq_retries_comp;
	uint32_t ops_deq_retries_comp;

	uint32_t ops_enq_retries_decomp;
	uint32_t ops_deq_retries_decomp;

	uint32_t duration_setup_per_op;

	uint32_t duration_enq_per_op_comp;
	uint32_t duration_deq_per_op_comp;

	uint32_t duration_enq_per_op_decomp;
	uint32_t duration_deq_per_op_decomp;

	ctx->ver.mem.lcore_id = lcore;

	uint16_t exp = 0;
	/*
	 * Print information about the current compression thread
	 */
	if (__atomic_compare_exchange_n(&ctx->ver.mem.print_info_once, &exp,
				1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		printf("    lcore: %u,"
				" driver name: %s,"
				" device name: %s,"
				" device id: %u,"
				" socket id: %u,"
				" queue pair id: %u\n",
			lcore,
			ctx->ver.options->driver_name,
			rte_compressdev_name_get(ctx->ver.mem.dev_id),
			ctx->ver.mem.dev_id,
			rte_compressdev_socket_id(ctx->ver.mem.dev_id),
			ctx->ver.mem.qp_id);

	/*
	 * Run the verification part first
	 */
	if (cperf_verify_test_runner(&ctx->ver))
		return EXIT_FAILURE;

	/*
	 * Run each test twice, discarding the first run's results,
	 * which are taken before the cache is warmed up
	 */

	/* C O M P R E S S */
	for (i = 0; i < 2; i++) {
		if (main_loop(ctx, RTE_COMP_COMPRESS) < 0)
			return EXIT_FAILURE;
	}

	ops_enq_retries_comp = ctx->ops_enq_retries;
	ops_deq_retries_comp = ctx->ops_deq_retries;

	duration_enq_per_op_comp = ctx->duration_enq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);
	duration_deq_per_op_comp = ctx->duration_deq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

	/* D E C O M P R E S S */
	for (i = 0; i < 2; i++) {
		if (main_loop(ctx, RTE_COMP_DECOMPRESS) < 0)
			return EXIT_FAILURE;
	}

	ops_enq_retries_decomp = ctx->ops_enq_retries;
	ops_deq_retries_decomp = ctx->ops_deq_retries;

	duration_enq_per_op_decomp = ctx->duration_enq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);
	duration_deq_per_op_decomp = ctx->duration_deq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

	duration_setup_per_op = ctx->duration_op /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

	/* R E P O R T processing */
	rte_spinlock_lock(&print_spinlock);

	if (display_once == 0) {
		display_once = 1;

		printf("\nLegend for the table\n"
		"  - Retries section: number of retries for the following operations:\n"
		"    [C-e] - compression enqueue\n"
		"    [C-d] - compression dequeue\n"
		"    [D-e] - decompression enqueue\n"
		"    [D-d] - decompression dequeue\n"
		"  - Cycles section: number of cycles per 'op' for the following operations:\n"
		"    setup/op - memory allocation, op configuration and memory deallocation\n"
		"    [C-e] - compression enqueue\n"
		"    [C-d] - compression dequeue\n"
		"    [D-e] - decompression enqueue\n"
		"    [D-d] - decompression dequeue\n\n");

		printf("\n%12s%6s%12s%17s",
			"lcore id", "Level", "Comp size", "Comp ratio [%]");

		printf("  |%10s %6s %8s %6s %8s",
			" Retries:",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");

		printf("  |%9s %9s %9s %9s %9s %9s\n",
			" Cycles:",
			"setup/op",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");
	}

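	/*
	 * Print this lcore's row: compressed size and ratio, then the retry
	 * counts and per-op cycle counts described in the legend above.
	 */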
	printf("%12u"
	       "%6u"
	       "%12zu"
	       "%17.2f",
		ctx->ver.mem.lcore_id,
		test_data->level,
		ctx->ver.comp_data_sz,
		ctx->ver.ratio);

	printf("  |%10s %6u %8u %6u %8u",
	       " ",
		ops_enq_retries_comp,
		ops_deq_retries_comp,
		ops_enq_retries_decomp,
		ops_deq_retries_decomp);

	printf("  |%9s %9u %9u %9u %9u %9u\n",
	       " ",
		duration_setup_per_op,
		duration_enq_per_op_comp,
		duration_deq_per_op_comp,
		duration_enq_per_op_decomp,
		duration_deq_per_op_decomp);

	rte_spinlock_unlock(&print_spinlock);

	return EXIT_SUCCESS;
}