/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */
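
/*
 * Cycle-count variant of the compress-perf test.  It reports, per operation,
 * the CPU cycles spent preparing ops and the cycles spent inside the enqueue
 * and dequeue burst calls, together with the number of burst retries.  It is
 * typically selected from the dpdk-test-compress-perf command line (e.g.
 * --ptest pmd-cyclecount, optionally with a cyclecount delay); exact option
 * names may differ between DPDK releases.
 */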

#include <rte_malloc.h>
#include <rte_eal.h>
#include <rte_log.h>
#include <rte_cycles.h>
#include <rte_spinlock.h>
#include <rte_compressdev.h>

#include "comp_perf_test_cyclecount.h"

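/*
 * Per-worker context for the cycle-count test.  The embedded verify context
 * carries the options, memory resources and buffers shared with the verify
 * stage; the counters below accumulate, for one main_loop() run, the number
 * of not-fully-served enqueue/dequeue bursts and the TSC cycles spent in op
 * setup, enqueue and dequeue respectively.
 */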
struct cperf_cyclecount_ctx {
	struct cperf_verify_ctx ver;

	uint32_t ops_enq_retries;
	uint32_t ops_deq_retries;

	uint64_t duration_op;
	uint64_t duration_enq;
	uint64_t duration_deq;
};

void
cperf_cyclecount_test_destructor(void *arg)
{
	struct cperf_cyclecount_ctx *ctx = arg;

	if (arg) {
		comp_perf_free_memory(ctx->ver.options, &ctx->ver.mem);
		rte_free(arg);
	}
}

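/*
 * Constructor used by the test harness for each worker: it allocates the
 * context, records the device and queue pair to use, keeps the embedded
 * verify stage silent and allocates and prepares the test buffers.  On any
 * failure the destructor above releases whatever was set up and NULL is
 * returned.
 */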
void *
cperf_cyclecount_test_constructor(uint8_t dev_id, uint16_t qp_id,
		struct comp_test_data *options)
{
	struct cperf_cyclecount_ctx *ctx = NULL;

	ctx = rte_malloc(NULL, sizeof(struct cperf_cyclecount_ctx), 0);

	if (ctx == NULL)
		return NULL;

	ctx->ver.mem.dev_id = dev_id;
	ctx->ver.mem.qp_id = qp_id;
	ctx->ver.options = options;
	ctx->ver.silent = 1; /* ver. part will be silent */

	if (!comp_perf_allocate_memory(ctx->ver.options, &ctx->ver.mem)
			&& !prepare_bufs(ctx->ver.options, &ctx->ver.mem))
		return ctx;

	cperf_cyclecount_test_destructor(ctx);
	return NULL;
}

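/*
 * Dry run of the enqueue/dequeue loop: ops are allocated from the mempool,
 * attached to their mbufs and fully configured, but the enqueue and dequeue
 * burst calls are skipped.  main_loop() times this pass with the TSC so that
 * the pure op-preparation cost can be reported as the "setup/op" figure.
 */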
static int
cperf_cyclecount_op_setup(struct rte_comp_op **ops,
				 struct cperf_cyclecount_ctx *ctx,
				 struct rte_mbuf **input_bufs,
				 struct rte_mbuf **output_bufs,
				 void *priv_xform,
				 uint32_t out_seg_sz)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;

	uint32_t i, iter, num_iter;
	int res = 0;
	uint16_t ops_needed;

	num_iter = test_data->num_iter;

	for (iter = 0; iter < num_iter; iter++) {
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_deq_ops = 0;
		uint32_t total_enq_ops = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
						   test_data->burst_sz);
			ops_needed = num_ops;

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)ops,
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
				      "Cyclecount: could not allocate enough operations\n");
				res = -1;
				goto end;
			}

			for (i = 0; i < ops_needed; i++) {

				/* Calculate next buffer to attach to operation */
				uint32_t buf_id = total_enq_ops + i;
				uint16_t op_id = i;

				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			/*
			 * E N Q U E U I N G
			 * assuming that all ops are enqueued,
			 * instead of the real enqueue operation
			 */
			num_enq = num_ops;

			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			/*
			 * D E Q U E U I N G
			 * assuming that all ops are dequeued,
			 * instead of the real dequeue operation
			 */
			num_deq = num_ops;

			total_deq_ops += num_deq;
			rte_mempool_put_bulk(mem->op_pool,
					     (void **)ops, num_deq);
		}
	}
	return res;
end:
	/*
	 * The failed mempool get above did not hand out any ops, and the
	 * ops[] array belongs to the caller (main_loop() frees it), so there
	 * is nothing to release on this path.
	 */
	return res;
}

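/*
 * Measured loop: builds the (de)compression xform, runs the timed op-setup
 * pass above, then repeatedly enqueues and dequeues bursts of ops, adding the
 * TSC cycles spent inside rte_compressdev_enqueue_burst() and
 * rte_compressdev_dequeue_burst() to the context counters and counting a
 * retry whenever a burst is not fully accepted or returned.
 */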
static int
main_loop(struct cperf_cyclecount_ctx *ctx, enum rte_comp_xform_type type)
{
	struct comp_test_data *test_data = ctx->ver.options;
	struct cperf_mem_resources *mem = &ctx->ver.mem;
	uint8_t dev_id = mem->dev_id;
	uint32_t i, iter, num_iter;
	struct rte_comp_op **ops, **deq_ops;
	void *priv_xform = NULL;
	struct rte_comp_xform xform;
	struct rte_mbuf **input_bufs, **output_bufs;
	int ret, res = 0;
	int allocated = 0;
	uint32_t out_seg_sz;

	uint64_t tsc_start, tsc_end, tsc_duration;

	if (test_data == NULL || !test_data->burst_sz) {
		RTE_LOG(ERR, USER1, "Unknown burst size\n");
		return -1;
	}
	ctx->duration_enq = 0;
	ctx->duration_deq = 0;
	ctx->ops_enq_retries = 0;
	ctx->ops_deq_retries = 0;

	/* one array for both enqueue and dequeue */
	ops = rte_zmalloc_socket(NULL,
		2 * mem->total_bufs * sizeof(struct rte_comp_op *),
		0, rte_socket_id());

	if (ops == NULL) {
		RTE_LOG(ERR, USER1,
			"Can't allocate memory for ops structures\n");
		return -1;
	}

	deq_ops = &ops[mem->total_bufs];

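	/*
	 * Pick the xform and the buffer direction: compression reads from the
	 * uncompressed buffers and writes into the compressed ones,
	 * decompression goes the other way; out_seg_sz tracks the output side.
	 */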
	if (type == RTE_COMP_COMPRESS) {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_COMPRESS,
			.compress = {
				.algo = RTE_COMP_ALGO_DEFLATE,
				.deflate.huffman = test_data->huffman_enc,
				.level = test_data->level,
				.window_size = test_data->window_sz,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		input_bufs = mem->decomp_bufs;
		output_bufs = mem->comp_bufs;
		out_seg_sz = test_data->out_seg_sz;
	} else {
		xform = (struct rte_comp_xform) {
			.type = RTE_COMP_DECOMPRESS,
			.decompress = {
				.algo = RTE_COMP_ALGO_DEFLATE,
				.chksum = RTE_COMP_CHECKSUM_NONE,
				.window_size = test_data->window_sz,
				.hash_algo = RTE_COMP_HASH_ALGO_NONE
			}
		};
		input_bufs = mem->comp_bufs;
		output_bufs = mem->decomp_bufs;
		out_seg_sz = test_data->seg_sz;
	}

	/* Create private xform */
	if (rte_compressdev_private_xform_create(dev_id, &xform,
						&priv_xform) < 0) {
		RTE_LOG(ERR, USER1, "Private xform could not be created\n");
		res = -1;
		goto end;
	}

	tsc_start = rte_rdtsc_precise();
	ret = cperf_cyclecount_op_setup(ops,
				ctx,
				input_bufs,
				output_bufs,
				priv_xform,
				out_seg_sz);

	tsc_end = rte_rdtsc_precise();

	/* ret value check postponed a bit to cancel extra 'if' bias */
	if (ret < 0) {
		RTE_LOG(ERR, USER1, "Setup function failed\n");
		res = -1;
		goto end;
	}

	tsc_duration = tsc_end - tsc_start;
	ctx->duration_op = tsc_duration;

	num_iter = test_data->num_iter;
	for (iter = 0; iter < num_iter; iter++) {
		uint32_t total_ops = mem->total_bufs;
		uint32_t remaining_ops = mem->total_bufs;
		uint32_t total_deq_ops = 0;
		uint32_t total_enq_ops = 0;
		uint16_t ops_unused = 0;
		uint16_t num_enq = 0;
		uint16_t num_deq = 0;

		while (remaining_ops > 0) {
			uint16_t num_ops = RTE_MIN(remaining_ops,
						   test_data->burst_sz);
			uint16_t ops_needed = num_ops - ops_unused;

			/*
			 * Move the unused operations from the previous
			 * enqueue_burst call to the front, to maintain order
			 */
			if ((ops_unused > 0) && (num_enq > 0)) {
				size_t nb_b_to_mov =
				      ops_unused * sizeof(struct rte_comp_op *);

				memmove(ops, &ops[num_enq], nb_b_to_mov);
			}

			/* Allocate compression operations */
			if (ops_needed && rte_mempool_get_bulk(
						mem->op_pool,
						(void **)ops,
						ops_needed) != 0) {
				RTE_LOG(ERR, USER1,
				      "Could not allocate enough operations\n");
				res = -1;
				goto end;
			}
			allocated += ops_needed;

			for (i = 0; i < ops_needed; i++) {
				/*
				 * Calculate next buffer to attach to operation
				 */
				uint32_t buf_id = total_enq_ops + i +
						ops_unused;
				uint16_t op_id = ops_unused + i;
				/* Reset all data in output buffers */
				struct rte_mbuf *m = output_bufs[buf_id];

				m->pkt_len = out_seg_sz * m->nb_segs;
				while (m) {
					m->data_len = m->buf_len - m->data_off;
					m = m->next;
				}
				ops[op_id]->m_src = input_bufs[buf_id];
				ops[op_id]->m_dst = output_bufs[buf_id];
				ops[op_id]->src.offset = 0;
				ops[op_id]->src.length =
					rte_pktmbuf_pkt_len(input_bufs[buf_id]);
				ops[op_id]->dst.offset = 0;
				ops[op_id]->flush_flag = RTE_COMP_FLUSH_FINAL;
				ops[op_id]->input_chksum = buf_id;
				ops[op_id]->private_xform = priv_xform;
			}

			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_enq = rte_compressdev_enqueue_burst(dev_id,
								mem->qp_id, ops,
								num_ops);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_enq += tsc_duration;

			if (num_enq < num_ops)
				ctx->ops_enq_retries++;

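			/*
			 * Optional busy-wait between the enqueue and the
			 * dequeue burst (driven by the cyclecount_delay
			 * option); it sits outside the rdtsc windows, so it
			 * does not inflate the reported cycle counts.
			 */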
			if (test_data->cyclecount_delay)
				rte_delay_us_block(test_data->cyclecount_delay);

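			/*
			 * Nothing was enqueued: check the device stats to tell
			 * a transient full queue (just retry) from a real
			 * enqueue error, which aborts the test.
			 */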
			if (num_enq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.enqueue_err_count) {
					res = -1;
					goto end;
				}
			}

			ops_unused = num_ops - num_enq;
			remaining_ops -= num_enq;
			total_enq_ops += num_enq;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
							   mem->qp_id,
							   deq_ops,
							   allocated);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;

			if (num_deq < allocated)
				ctx->ops_deq_retries++;

			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
						RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1, "Some operations were not successful\n");
						res = -1;
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
							RTE_MIN(remaining_data,
							     out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					     (void **)deq_ops, num_deq);
			allocated -= num_deq;
		}

		/* Dequeue the last operations */
		while (total_deq_ops < total_ops) {
			if (unlikely(test_data->perf_comp_force_stop))
				goto end;

			tsc_start = rte_rdtsc_precise();
			num_deq = rte_compressdev_dequeue_burst(dev_id,
						mem->qp_id,
						deq_ops,
						test_data->burst_sz);
			tsc_end = rte_rdtsc_precise();
			tsc_duration = tsc_end - tsc_start;
			ctx->duration_deq += tsc_duration;
			ctx->ops_deq_retries++;

			if (num_deq == 0) {
				struct rte_compressdev_stats stats;

				rte_compressdev_stats_get(dev_id, &stats);
				if (stats.dequeue_err_count) {
					res = -1;
					goto end;
				}
			}
			total_deq_ops += num_deq;

			if (iter == num_iter - 1) {
				for (i = 0; i < num_deq; i++) {
					struct rte_comp_op *op = deq_ops[i];

					if (op->status !=
						RTE_COMP_OP_STATUS_SUCCESS) {
						RTE_LOG(ERR, USER1, "Some operations were not successful\n");
						res = -1;
						goto end;
					}

					struct rte_mbuf *m = op->m_dst;

					m->pkt_len = op->produced;
					uint32_t remaining_data = op->produced;
					uint16_t data_to_append;

					while (remaining_data > 0) {
						data_to_append =
						RTE_MIN(remaining_data,
							out_seg_sz);
						m->data_len = data_to_append;
						remaining_data -=
								data_to_append;
						m = m->next;
					}
				}
			}
			rte_mempool_put_bulk(mem->op_pool,
					     (void **)deq_ops, num_deq);
			allocated -= num_deq;
		}
	}
	allocated = 0;

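	/*
	 * Error/abort path: return any ops still held to the mempool, then
	 * free the xform and the ops array (on success 'allocated' has been
	 * cleared just above, so only the teardown runs).
	 */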
end:
	if (allocated)
		rte_mempool_put_bulk(mem->op_pool, (void **)ops, allocated);
	rte_compressdev_private_xform_free(dev_id, priv_xform);
	rte_free(ops);

	if (test_data->perf_comp_force_stop) {
		RTE_LOG(ERR, USER1,
		      "lcore: %d Perf. test has been aborted by user\n",
			mem->lcore_id);
		res = -1;
	}
	return res;
}

int
cperf_cyclecount_test_runner(void *test_ctx)
{
	struct cperf_cyclecount_ctx *ctx = test_ctx;
	struct comp_test_data *test_data = ctx->ver.options;
	uint32_t lcore = rte_lcore_id();
	static rte_atomic16_t display_once = RTE_ATOMIC16_INIT(0);
	static rte_spinlock_t print_spinlock;
	int i;

	uint32_t ops_enq_retries_comp;
	uint32_t ops_deq_retries_comp;

	uint32_t ops_enq_retries_decomp;
	uint32_t ops_deq_retries_decomp;

	uint32_t duration_setup_per_op;

	uint32_t duration_enq_per_op_comp;
	uint32_t duration_deq_per_op_comp;

	uint32_t duration_enq_per_op_decomp;
	uint32_t duration_deq_per_op_decomp;

	ctx->ver.mem.lcore_id = lcore;

	/*
	 * printing information about current compression thread
	 */
	if (rte_atomic16_test_and_set(&ctx->ver.mem.print_info_once))
		printf("    lcore: %u,"
				" driver name: %s,"
				" device name: %s,"
				" device id: %u,"
				" socket id: %u,"
				" queue pair id: %u\n",
			lcore,
			ctx->ver.options->driver_name,
			rte_compressdev_name_get(ctx->ver.mem.dev_id),
			ctx->ver.mem.dev_id,
			rte_compressdev_socket_id(ctx->ver.mem.dev_id),
			ctx->ver.mem.qp_id);

	/*
	 * First the verification part is needed
	 */
	if (cperf_verify_test_runner(&ctx->ver))
		return EXIT_FAILURE;


	/*
	 * Run each test twice, discarding the results of the first pass,
	 * which runs before the cache is warmed up
	 */

	/* C O M P R E S S */
	for (i = 0; i < 2; i++) {
		if (main_loop(ctx, RTE_COMP_COMPRESS) < 0)
			return EXIT_FAILURE;
	}

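	/*
	 * main_loop() zeroes the retry and duration counters on entry, so the
	 * figures kept here come from the second, cache-warm pass only.
	 */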
	ops_enq_retries_comp = ctx->ops_enq_retries;
	ops_deq_retries_comp = ctx->ops_deq_retries;

	duration_enq_per_op_comp = ctx->duration_enq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);
	duration_deq_per_op_comp = ctx->duration_deq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

	/* D E C O M P R E S S */
	for (i = 0; i < 2; i++) {
		if (main_loop(ctx, RTE_COMP_DECOMPRESS) < 0)
			return EXIT_FAILURE;
	}

	ops_enq_retries_decomp = ctx->ops_enq_retries;
	ops_deq_retries_decomp = ctx->ops_deq_retries;

	duration_enq_per_op_decomp = ctx->duration_enq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);
	duration_deq_per_op_decomp = ctx->duration_deq /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

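	/*
	 * duration_op is recorded anew by every main_loop() call, so setup/op
	 * reflects the op-preparation time of the last (decompression) run.
	 */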
	duration_setup_per_op = ctx->duration_op /
			(ctx->ver.mem.total_bufs * test_data->num_iter);

	/* R E P O R T processing */
	if (rte_atomic16_test_and_set(&display_once)) {

		rte_spinlock_lock(&print_spinlock);

		printf("\nLegend for the table\n"
		"  - Retries section: number of retries for the following operations:\n"
		"    [C-e] - compression enqueue\n"
		"    [C-d] - compression dequeue\n"
		"    [D-e] - decompression enqueue\n"
		"    [D-d] - decompression dequeue\n"
		"  - Cycles section: number of cycles per 'op' for the following operations:\n"
		"    setup/op - memory allocation, op configuration and memory deallocation\n"
		"    [C-e] - compression enqueue\n"
		"    [C-d] - compression dequeue\n"
		"    [D-e] - decompression enqueue\n"
		"    [D-d] - decompression dequeue\n\n");

		printf("\n%12s%6s%12s%17s",
			"lcore id", "Level", "Comp size", "Comp ratio [%]");

		printf("  |%10s %6s %8s %6s %8s",
			" Retries:",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");

		printf("  |%9s %9s %9s %9s %9s %9s\n",
			" Cycles:",
			"setup/op",
			"[C-e]", "[C-d]",
			"[D-e]", "[D-d]");

		rte_spinlock_unlock(&print_spinlock);
	}

	rte_spinlock_lock(&print_spinlock);

	printf("%12u"
	       "%6u"
	       "%12zu"
	       "%17.2f",
		ctx->ver.mem.lcore_id,
		test_data->level,
		ctx->ver.comp_data_sz,
		ctx->ver.ratio);

	printf("  |%10s %6u %8u %6u %8u",
	       " ",
		ops_enq_retries_comp,
		ops_deq_retries_comp,
		ops_enq_retries_decomp,
		ops_deq_retries_decomp);

	printf("  |%9s %9u %9u %9u %9u %9u\n",
	       " ",
		duration_setup_per_op,
		duration_enq_per_op_comp,
		duration_deq_per_op_comp,
		duration_enq_per_op_decomp,
		duration_deq_per_op_decomp);

	rte_spinlock_unlock(&print_spinlock);

	return EXIT_SUCCESS;
}