/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdlib.h>

#include <rte_malloc.h>
#include <rte_eal.h>
#include <rte_log.h>
#include <rte_cycles.h>
#include <rte_spinlock.h>
#include <rte_compressdev.h>

#include "comp_perf_test_cyclecount.h"

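/*
 * Per queue-pair test context: embeds the verify-test context used for the
 * preliminary verification pass, plus counters for enqueue/dequeue retries
 * and the TSC cycles accumulated during op setup, enqueue and dequeue.
 */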
struct cperf_cyclecount_ctx {
	struct cperf_verify_ctx ver;

	uint32_t ops_enq_retries;
	uint32_t ops_deq_retries;

	uint64_t duration_op;
	uint64_t duration_enq;
	uint64_t duration_deq;
};

void
cperf_cyclecount_test_destructor(void *arg)
{
	struct cperf_cyclecount_ctx *ctx = arg;

	if (arg) {
		comp_perf_free_memory(ctx->ver.options, &ctx->ver.mem);
		rte_free(arg);
	}
}

void *
cperf_cyclecount_test_constructor(uint8_t dev_id, uint16_t qp_id,
		struct comp_test_data *options)
{
	struct cperf_cyclecount_ctx *ctx = NULL;

	ctx = rte_malloc(NULL, sizeof(struct cperf_cyclecount_ctx), 0);

	if (ctx == NULL)
		return NULL;

	ctx->ver.mem.dev_id = dev_id;
	ctx->ver.mem.qp_id = qp_id;
	ctx->ver.options = options;
	ctx->ver.silent = 1; /* ver. part will be silent */

	if (!comp_perf_allocate_memory(ctx->ver.options, &ctx->ver.mem)
			&& !prepare_bufs(ctx->ver.options, &ctx->ver.mem))
		return ctx;

	cperf_cyclecount_test_destructor(ctx);
	return NULL;
}

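/*
 * Mirror of the main loop that only allocates and configures the operations:
 * the enqueue and dequeue steps are assumed to succeed instantly, so the
 * cycles measured around this function cover op setup only (mempool get/put
 * and op configuration). They are reported in the "setup/op" column.
 */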
static int
cperf_cyclecount_op_setup(struct rte_comp_op **ops,
				 struct cperf_cyclecount_ctx *ctx,
				 struct rte_mbuf **input_bufs,
				 struct rte_mbuf **output_bufs,
				 void *priv_xform,
				 uint32_t out_seg_sz)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;

	uint32_t i, iter, num_iter;
	int res = 0;
	uint16_t ops_needed;

	num_iter = test_data->num_iter;

	for (iter = 0; iter < num_iter; iter++) {
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_enq_ops = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
						   test_data->burst_sz);
			ops_needed = num_ops;

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)ops,
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
				      "Cyclecount: could not allocate enough operations\n");
				res = -1;
				goto end;
			}

			for (i = 0; i < ops_needed; i++) {

				/* Calculate next buffer to attach to operation */
				uint32_t buf_id = total_enq_ops + i;
				uint16_t op_id = i;

				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			/* E N Q U E U I N G */
			/* assume that all ops are enqueued */
			/* instead of doing the real enqueue operation */
			num_enq = num_ops;

			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			/* D E Q U E U I N G */
			/* assume that all ops are dequeued */
			/* instead of doing the real dequeue operation */
			num_deq = num_ops;

			rte_mempool_put_bulk(mem->op_pool,
					     (void **)ops, num_deq);
		}
	}
	return res;
end:
	/*
	 * The only way here is a failed rte_mempool_get_bulk(), so no ops
	 * from the current batch are held and the ops array itself is owned
	 * (and freed) by the caller; just report the error.
	 */
	return res;
}

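/*
 * Timed loop: run num_iter iterations over all buffers, measuring enqueue and
 * dequeue cycles separately with rte_rdtsc_precise() and counting bursts that
 * could not be fully enqueued or dequeued as retries.
 */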
static int
main_loop(struct cperf_cyclecount_ctx *ctx, enum rte_comp_xform_type type)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;
	uint8_t dev_id = mem->dev_id;
	uint32_t i, iter, num_iter;
	struct rte_comp_op **ops, **deq_ops;
	void *priv_xform = NULL;
	struct rte_comp_xform xform;
	struct rte_mbuf **input_bufs, **output_bufs;
	int ret, res = 0;
	int allocated = 0;
	uint32_t out_seg_sz;

	uint64_t tsc_start, tsc_end, tsc_duration;

	if (test_data == NULL || !test_data->burst_sz) {
		RTE_LOG(ERR, USER1, "Unknown burst size\n");
		return -1;
	}
	ctx->duration_enq = 0;
	ctx->duration_deq = 0;
	ctx->ops_enq_retries = 0;
	ctx->ops_deq_retries = 0;

	/* one array for both enqueue and dequeue */
	ops = rte_zmalloc_socket(NULL,
		(test_data->burst_sz + mem->total_bufs) *
		sizeof(struct rte_comp_op *),
		0, rte_socket_id());

	if (ops == NULL) {
		RTE_LOG(ERR, USER1,
			"Can't allocate memory for ops structures\n");
		return -1;
	}

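	/* dequeued ops are stored past the first burst_sz entries of ops[] */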
	deq_ops = &ops[test_data->burst_sz];

	if (type == RTE_COMP_COMPRESS) {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_COMPRESS,
			.compress = {
				.algo = test_data->test_algo,
				.level = test_data->level,
				.window_size = test_data->window_sz,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		if (test_data->test_algo == RTE_COMP_ALGO_DEFLATE)
			xform.compress.deflate.huffman = test_data->huffman_enc;
		else if (test_data->test_algo == RTE_COMP_ALGO_LZ4)
			xform.compress.lz4.flags = test_data->lz4_flags;
		input_bufs = mem->decomp_bufs;
		output_bufs = mem->comp_bufs;
		out_seg_sz = test_data->out_seg_sz;
	} else {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_DECOMPRESS,
			.decompress = {
				.algo = test_data->test_algo,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.window_size = test_data->window_sz,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		if (test_data->test_algo == RTE_COMP_ALGO_LZ4)
			xform.decompress.lz4.flags = test_data->lz4_flags;
		input_bufs = mem->comp_bufs;
		output_bufs = mem->decomp_bufs;
		out_seg_sz = test_data->seg_sz;
	}

	/* Create private xform */
	if (rte_compressdev_private_xform_create(dev_id, &xform,
						&priv_xform) < 0) {
		RTE_LOG(ERR, USER1, "Private xform could not be created\n");
		res = -1;
		goto end;
	}

	tsc_start = rte_rdtsc_precise();
	ret = cperf_cyclecount_op_setup(ops,
				ctx,
				input_bufs,
				output_bufs,
				priv_xform,
				out_seg_sz);

	tsc_end = rte_rdtsc_precise();

	/* ret value check postponed a bit to cancel extra 'if' bias */
	if (ret < 0) {
		RTE_LOG(ERR, USER1, "Setup function failed\n");
		res = -1;
		goto end;
	}

	tsc_duration = tsc_end - tsc_start;
	ctx->duration_op = tsc_duration;

	num_iter = test_data->num_iter;
	for (iter = 0; iter < num_iter; iter++) {
		uint32_t total_ops = mem->total_bufs;
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_deq_ops = 0;
		uint32_t total_enq_ops = 0;
		uint16_t ops_unused = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
						   test_data->burst_sz);
			uint16_t ops_needed = num_ops - ops_unused;

			/*
			 * Move the unused operations from the previous
			 * enqueue_burst call to the front, to maintain order
			 */
			if ((ops_unused > 0) && (num_enq > 0)) {
				size_t nb_b_to_mov =
				      ops_unused * sizeof(struct rte_comp_op *);

				memmove(ops, &ops[num_enq], nb_b_to_mov);
			}

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)&ops[ops_unused],
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
				      "Could not allocate enough operations\n");
				res = -1;
				goto end;
			}
			allocated += ops_needed;

			for (i = 0; i < ops_needed; i++) {
				/*
				 * Calculate next buffer to attach to operation
				 */
				uint32_t buf_id = total_enq_ops + i +
						ops_unused;
				uint16_t op_id = ops_unused + i;
				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_enq = rte_compressdev_enqueue_burst(dev_id,
								mem->qp_id, ops,
								num_ops);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_enq += tsc_duration;

			if (num_enq < num_ops)
				ctx->ops_enq_retries++;

			if (test_data->cyclecount_delay)
				rte_delay_us_block(test_data->cyclecount_delay);

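			/* nothing enqueued: abort if the PMD reports enqueue errors */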
			if (num_enq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.enqueue_err_count) {
					res = -1;
					goto end;
				}
			}

			ops_unused = num_ops - num_enq;
			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
							   mem->qp_id,
							   deq_ops,
							   allocated);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;

			if (num_deq < allocated)
				ctx->ops_deq_retries++;

			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
						RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1, "Some operations were not successful\n");
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
							RTE_MIN(remaining_data,
							     out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					     (void **)deq_ops, num_deq);
			allocated -= num_deq;
		}

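		/*
		 * Everything has been submitted for this iteration; each
		 * additional poll below is counted as a dequeue retry.
		 */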
		/* Dequeue the last operations */
		while (total_deq_ops < total_ops) {
			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
						mem->qp_id,
						deq_ops,
						test_data->burst_sz);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;
			ctx->ops_deq_retries++;

			if (num_deq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.dequeue_err_count) {
					res = -1;
					goto end;
				}
			}
			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
						RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1, "Some operations were not successful\n");
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
						RTE_MIN(remaining_data,
							out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					     (void **)deq_ops, num_deq);
			allocated -= num_deq;
		}
	}
	allocated = 0;

end:
	if (allocated)
		rte_mempool_put_bulk(mem->op_pool, (void **)ops, allocated);
	rte_compressdev_private_xform_free(dev_id, priv_xform);
	rte_free(ops);

	if (test_data->perf_comp_force_stop) {
		RTE_LOG(ERR, USER1,
		      "lcore: %d Perf. test has been aborted by user\n",
			mem->lcore_id);
		res = -1;
	}
	return res;
}

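/*
 * Per-lcore entry point: runs the verification pass first, then the timed
 * compression and/or decompression loops (twice each, discarding the first
 * run as warm-up) and prints per-op cycle counts and retry statistics.
 */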
int
cperf_cyclecount_test_runner(void *test_ctx)
{
	struct cperf_cyclecount_ctx *ctx = test_ctx;
	struct comp_test_data *test_data = ctx->ver.options;
	uint32_t lcore = rte_lcore_id();
	static uint16_t display_once;
	static rte_spinlock_t print_spinlock;
	int i;

	uint32_t ops_enq_retries_comp;
	uint32_t ops_deq_retries_comp;

	uint32_t ops_enq_retries_decomp;
	uint32_t ops_deq_retries_decomp;

	uint32_t duration_setup_per_op;

	uint32_t duration_enq_per_op_comp;
	uint32_t duration_deq_per_op_comp;

	uint32_t duration_enq_per_op_decomp;
	uint32_t duration_deq_per_op_decomp;

	ctx->ver.mem.lcore_id = lcore;

	uint16_t exp = 0;
	/*
	 * Print information about the current compression thread (only once)
	 */
	if (rte_atomic_compare_exchange_strong_explicit(&ctx->ver.mem.print_info_once, &exp,
				1, rte_memory_order_relaxed, rte_memory_order_relaxed))
		printf("    lcore: %u,"
				" driver name: %s,"
				" device name: %s,"
				" device id: %u,"
				" socket id: %u,"
				" queue pair id: %u\n",
			lcore,
			ctx->ver.options->driver_name,
			rte_compressdev_name_get(ctx->ver.mem.dev_id),
			ctx->ver.mem.dev_id,
			rte_compressdev_socket_id(ctx->ver.mem.dev_id),
			ctx->ver.mem.qp_id);

	/*
	 * Run the verification part first
	 */
	if (cperf_verify_test_runner(&ctx->ver))
		return EXIT_FAILURE;

	if (test_data->test_op & COMPRESS) {
		/*
		 * Run the test twice, discarding the first performance
		 * results obtained before the cache is warmed up
		 */
		for (i = 0; i < 2; i++) {
			if (main_loop(ctx, RTE_COMP_COMPRESS) < 0)
				return EXIT_FAILURE;
		}

		ops_enq_retries_comp = ctx->ops_enq_retries;
		ops_deq_retries_comp = ctx->ops_deq_retries;

		duration_enq_per_op_comp = ctx->duration_enq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
		duration_deq_per_op_comp = ctx->duration_deq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
	} else {
		ops_enq_retries_comp = 0;
		ops_deq_retries_comp = 0;

		duration_enq_per_op_comp = 0;
		duration_deq_per_op_comp = 0;
	}

	if (test_data->test_op & DECOMPRESS) {
		/*
		 * Run the test twice, discarding the first performance
		 * results obtained before the cache is warmed up
		 */
		for (i = 0; i < 2; i++) {
			if (main_loop(ctx, RTE_COMP_DECOMPRESS) < 0)
				return EXIT_FAILURE;
		}

		ops_enq_retries_decomp = ctx->ops_enq_retries;
		ops_deq_retries_decomp = ctx->ops_deq_retries;

		duration_enq_per_op_decomp = ctx->duration_enq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
		duration_deq_per_op_decomp = ctx->duration_deq /
				(ctx->ver.mem.total_bufs * test_data->num_iter);
	} else {
		ops_enq_retries_decomp = 0;
		ops_deq_retries_decomp = 0;

		duration_enq_per_op_decomp = 0;
		duration_deq_per_op_decomp = 0;
	}

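	/* setup cycles averaged over every buffer across all iterations */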
	duration_setup_per_op = ctx->duration_op /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

	/* R E P O R T processing */
	rte_spinlock_lock(&print_spinlock);

	if (display_once == 0) {
		display_once = 1;

		printf("\nLegend for the table\n"
		"  - Retries section: number of retries for the following operations:\n"
		"    [C-e] - compression enqueue\n"
		"    [C-d] - compression dequeue\n"
		"    [D-e] - decompression enqueue\n"
		"    [D-d] - decompression dequeue\n"
		"  - Cycles section: number of cycles per 'op' for the following operations:\n"
		"    setup/op - memory allocation, op configuration and memory deallocation\n"
		"    [C-e] - compression enqueue\n"
		"    [C-d] - compression dequeue\n"
		"    [D-e] - decompression enqueue\n"
		"    [D-d] - decompression dequeue\n\n");

		printf("\n%12s%6s%12s%17s",
			"lcore id", "Level", "Comp size", "Comp ratio [%]");

		printf("  |%10s %6s %8s %6s %8s",
			" Retries:",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");

		printf("  |%9s %9s %9s %9s %9s %9s\n",
			" Cycles:",
			"setup/op",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");
	}

	printf("%12u"
	       "%6u"
	       "%12zu"
	       "%17.2f",
		ctx->ver.mem.lcore_id,
		test_data->level,
		ctx->ver.comp_data_sz,
		ctx->ver.ratio);

	printf("  |%10s %6u %8u %6u %8u",
	       " ",
		ops_enq_retries_comp,
		ops_deq_retries_comp,
		ops_enq_retries_decomp,
		ops_deq_retries_decomp);

	printf("  |%9s %9u %9u %9u %9u %9u\n",
	       " ",
		duration_setup_per_op,
		duration_enq_per_op_comp,
		duration_deq_per_op_comp,
		duration_enq_per_op_decomp,
		duration_deq_per_op_decomp);

	rte_spinlock_unlock(&print_spinlock);

	return EXIT_SUCCESS;
}
637