xref: /spdk/test/dma/test_dma/test_dma.c (revision b37db06935181fd0e8f5592a96d860040abaa201)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) 2021, 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  */
4 
5 #include "spdk/stdinc.h"
6 
7 #include "spdk/dma.h"
8 #include "spdk/bdev.h"
9 #include "spdk/env.h"
10 #include "spdk/event.h"
11 #include "spdk/likely.h"
12 #include "spdk/string.h"
13 #include "spdk/util.h"
14 #include "spdk/md5.h"
15 
16 #include <infiniband/verbs.h>
17 
struct dma_test_task;

/* A single outstanding IO request. Each request owns its data buffer and,
 * once translated for RDMA, a lazily registered memory region covering it. */
struct dma_test_req {
	struct iovec *iovs;			/* g_iovcnt entries covering `buffer` */
	struct spdk_bdev_ext_io_opts io_opts;	/* ext IO options carrying the memory domain */
	uint64_t io_offset;			/* block offset of the IO currently in flight */
	uint64_t submit_tsc;			/* submission timestamp for latency accounting */
	struct ibv_mr *mr;			/* registered on first translation, see dma_test_translate_memory_cb() */
	struct dma_test_task *task;		/* owning per-core task */
	void *buffer;				/* g_io_size bytes of payload */
	uint32_t idx;				/* index of this request within task->reqs */
	uint8_t md5_orig[SPDK_MD5DIGEST_LEN];	/* digest of the initial pattern, used in verify mode */
};

/* Latency/throughput counters; all tick values are in TSC units. */
struct dma_test_task_stats {
	uint64_t io_completed;
	uint64_t total_tsc;
	uint64_t min_tsc;
	uint64_t max_tsc;
};

/* Per-core test context, driven by a dedicated SPDK thread pinned to `lcore`. */
struct dma_test_task {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *channel;
	uint64_t cur_io_offset;		/* next offset (in IO units) for sequential mode */
	uint64_t max_offset_in_ios;	/* size of the addressable range, in IO units */
	uint64_t num_blocks_per_io;
	uint64_t num_blocks_per_core;	/* verify mode: this core's disjoint slice, in IO units */
	int rw_percentage;		/* share of reads, 0..100 */
	uint32_t seed;			/* rand_r() state for random offsets and rw mix */
	uint32_t io_inflight;
	struct dma_test_task_stats stats;
	struct dma_test_task_stats last_stats;	/* snapshot used for periodic stat deltas */
	bool is_draining;		/* stop submitting; finish in-flight IO only */
	struct dma_test_req *reqs;	/* g_queue_depth requests */
	struct spdk_thread *thread;
	const char *bdev_name;
	uint64_t num_translations;	/* memory domain op counters, checked by verify_tasks() */
	uint64_t num_pull_push;
	uint64_t num_mem_zero;
	uint32_t lcore;
	uint32_t idx; /* sequential number of this task */

	TAILQ_ENTRY(dma_test_task) link;
};

/* Wrapper to deliver a memory domain data completion on the task's thread. */
struct dma_test_data_cpl_ctx {
	spdk_memory_domain_data_cpl_cb data_cpl;
	void *data_cpl_arg;
};

/* Bitmask of memory domain operations the user expects to be exercised (-x). */
enum dma_test_domain_ops {
	DMA_TEST_DOMAIN_OP_TRANSLATE = 1u << 0,
	DMA_TEST_DOMAIN_OP_PULL_PUSH = 1u << 1,
	DMA_TEST_DOMAIN_OP_MEMZERO = 1u << 2,
};
74 
TAILQ_HEAD(, dma_test_task) g_tasks = TAILQ_HEAD_INITIALIZER(g_tasks);

/* User's input */
static char *g_bdev_name;
static const char *g_rw_mode_str;
static int g_rw_percentage = -1;
static uint32_t g_queue_depth;
static uint32_t g_io_size;
static uint32_t g_run_time_sec;
static uint32_t g_run_count;		/* seconds elapsed, advanced by the runtime poller */
static uint32_t g_test_ops;		/* bitmask of enum dma_test_domain_ops (-x) */
static uint32_t g_corrupt_mkey_counter;	/* corrupt every Nth translation (-Y); 0 - disabled */
static uint32_t g_iovcnt = 1;
static bool g_is_random;
static bool g_verify;
static bool g_force_memory_domains_support;

static struct spdk_thread *g_main_thread;
static struct spdk_poller *g_runtime_poller;
static struct spdk_memory_domain *g_domain;	/* emulated "remote" source memory domain */
static uint64_t g_num_blocks_per_io;
static uint32_t g_num_construct_tasks;		/* tasks still being constructed */
static uint32_t g_num_complete_tasks;		/* tasks that have not finished yet */
static uint64_t g_start_tsc;
static int g_run_rc;				/* first error observed; app exit code */

static void destroy_tasks(void);
static int dma_test_submit_io(struct dma_test_req *req);
103 
/* Print per-core and aggregated IOPS, bandwidth and latency for the whole run.
 * Latencies are converted from TSC ticks to microseconds using the tick rate. */
static void
print_total_stats(void)
{
	struct dma_test_task *task;
	uint64_t tsc_rate = spdk_get_ticks_hz();
	uint64_t test_time_usec = (spdk_get_ticks() - g_start_tsc) * SPDK_SEC_TO_USEC / tsc_rate;
	uint64_t total_tsc = 0, total_io_completed = 0;
	double task_iops, task_bw, task_min_lat, task_avg_lat, task_max_lat;
	double total_iops = 0, total_bw = 0, total_min_lat = (double)UINT64_MAX, total_max_lat = 0,
	       total_avg_lat;

	printf("==========================================================================\n");
	printf("%*s\n", 55, "Latency [us]");
	printf("%*s %10s %10s %10s %10s\n", 19, "IOPS", "MiB/s", "Average", "min", "max");

	TAILQ_FOREACH(task, &g_tasks, link) {
		/* Skip cores that never completed an IO to avoid division by zero. */
		if (!task->stats.io_completed) {
			continue;
		}
		task_iops = (double)task->stats.io_completed * SPDK_SEC_TO_USEC / test_time_usec;
		task_bw = task_iops * g_io_size / (1024 * 1024);
		task_avg_lat = (double)task->stats.total_tsc / task->stats.io_completed * SPDK_SEC_TO_USEC /
			       tsc_rate;
		task_min_lat = (double)task->stats.min_tsc * SPDK_SEC_TO_USEC / tsc_rate;
		task_max_lat = (double)task->stats.max_tsc * SPDK_SEC_TO_USEC / tsc_rate;

		total_iops += task_iops;
		total_bw += task_bw;
		total_io_completed += task->stats.io_completed;
		total_tsc += task->stats.total_tsc;
		if (task_min_lat < total_min_lat) {
			total_min_lat = task_min_lat;
		}
		if (task_max_lat > total_max_lat) {
			total_max_lat = task_max_lat;
		}
		printf("Core %2u: %10.2f %10.2f %10.2f %10.2f %10.2f\n",
		       task->lcore, task_iops, task_bw, task_avg_lat, task_min_lat, task_max_lat);
	}

	if (total_io_completed) {
		total_avg_lat = (double)total_tsc / total_io_completed  * SPDK_SEC_TO_USEC / tsc_rate;
		printf("==========================================================================\n");
		printf("%-*s %10.2f %10.2f %10.2f %10.2f %10.2f\n",
		       8, "Total  :", total_iops, total_bw, total_avg_lat, total_min_lat, total_max_lat);
		printf("\n");
	}
}
152 
/* Print a one-line progress update with IOPS/BW/latency over the last interval.
 * Snapshots each task's counters into last_stats so the next call reports
 * only the delta. Output is overwritten in place with "\r". */
static void
print_periodic_stats(void)
{
	struct dma_test_task *task;
	uint64_t io_last_sec = 0, tsc_last_sec = 0;
	double lat_last_sec, bw_last_sec;

	TAILQ_FOREACH(task, &g_tasks, link) {
		io_last_sec += task->stats.io_completed - task->last_stats.io_completed;
		tsc_last_sec += task->stats.total_tsc - task->last_stats.total_tsc;
		memcpy(&task->last_stats, &task->stats, sizeof(task->stats));
	}

	printf("Running %3u/%-3u sec", g_run_count, g_run_time_sec);
	if (io_last_sec) {
		lat_last_sec =	(double)tsc_last_sec / io_last_sec * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
		bw_last_sec = (double)io_last_sec * g_io_size / (1024 * 1024);
		printf(" IOPS: %-8"PRIu64" BW: %-6.2f [MiB/s] avg.lat %-5.2f [us]",
		       io_last_sec, bw_last_sec, lat_last_sec);
	}

	printf("\r");
	fflush(stdout);
}
177 
178 static void
179 dma_test_task_complete(void *ctx)
180 {
181 	assert(g_num_complete_tasks > 0);
182 
183 	if (--g_num_complete_tasks == 0) {
184 		spdk_poller_unregister(&g_runtime_poller);
185 		print_total_stats();
186 		spdk_app_stop(g_run_rc);
187 	}
188 }
189 
190 static inline void
191 dma_test_check_and_signal_task_done(struct dma_test_task *task)
192 {
193 	if (task->io_inflight == 0) {
194 		spdk_put_io_channel(task->channel);
195 		spdk_bdev_close(task->desc);
196 		spdk_thread_send_msg(g_main_thread, dma_test_task_complete, task);
197 		spdk_thread_exit(spdk_get_thread());
198 	}
199 }
200 
201 static inline void
202 dma_test_task_update_stats(struct dma_test_task *task, uint64_t submit_tsc)
203 {
204 	uint64_t tsc_diff = spdk_get_ticks() - submit_tsc;
205 
206 	task->stats.io_completed++;
207 	task->stats.total_tsc += tsc_diff;
208 	if (spdk_unlikely(tsc_diff < task->stats.min_tsc)) {
209 		task->stats.min_tsc = tsc_diff;
210 	}
211 	if (spdk_unlikely(tsc_diff > task->stats.max_tsc)) {
212 		task->stats.max_tsc = tsc_diff;
213 	}
214 }
215 
/* Completion callback for the plain (non-verify) IO path: update stats and
 * immediately resubmit the same request unless the task is draining. */
static void
dma_test_bdev_io_completion_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct dma_test_req *req = cb_arg;
	struct dma_test_task *task = req->task;

	assert(task->io_inflight > 0);
	--task->io_inflight;
	dma_test_task_update_stats(task, req->submit_tsc);

	/* IO errors are expected when mkey corruption is enabled (-Y), so they
	 * are only fatal when the corruption counter is not set. */
	if (!success && !g_corrupt_mkey_counter) {
		if (!g_run_rc) {
			fprintf(stderr, "IO completed with error\n");
			g_run_rc = -1;
		}
		task->is_draining = true;
	}

	spdk_bdev_free_io(bdev_io);

	if (spdk_unlikely(task->is_draining)) {
		dma_test_check_and_signal_task_done(task);
		return;
	}

	dma_test_submit_io(req);
}
243 
/* Verify-mode read-back completion: recompute the MD5 of the data just read
 * and compare it with the digest taken when the buffer was initialized.
 * On match, resubmit the request (next write); on mismatch, fail the run. */
static void
dma_test_bdev_io_completion_verify_read_done(struct spdk_bdev_io *bdev_io, bool success,
		void *cb_arg)
{
	uint8_t md5_new[SPDK_MD5DIGEST_LEN];
	struct dma_test_req *req = cb_arg;
	struct dma_test_task *task = req->task;
	struct spdk_md5ctx md5ctx;

	assert(task->io_inflight > 0);
	--task->io_inflight;
	dma_test_task_update_stats(task, req->submit_tsc);

	/* IO errors are expected when mkey corruption is enabled (-Y). */
	if (!success && !g_corrupt_mkey_counter) {
		if (!g_run_rc) {
			fprintf(stderr, "IO completed with error\n");
			g_run_rc = -1;
		}
		task->is_draining = true;
	}

	spdk_bdev_free_io(bdev_io);

	if (spdk_unlikely(task->is_draining)) {
		dma_test_check_and_signal_task_done(task);
		return;
	}

	spdk_md5init(&md5ctx);
	spdk_md5update(&md5ctx, req->buffer, g_io_size);
	spdk_md5final(md5_new, &md5ctx);

	if (memcmp(req->md5_orig, md5_new, SPDK_MD5DIGEST_LEN) != 0) {
		fprintf(stderr, "lcore %u, offset %"PRIu64" md5 mismatch\n", task->lcore, req->io_offset);
		if (!g_run_rc) {
			g_run_rc = -1;
		}
		task->is_draining = true;
		dma_test_check_and_signal_task_done(task);
		return;
	}

	dma_test_submit_io(req);
}
288 
/* Verify-mode write completion: immediately read back the same blocks so
 * dma_test_bdev_io_completion_verify_read_done() can check the data. */
static void
dma_test_bdev_io_completion_verify_write_done(struct spdk_bdev_io *bdev_io, bool success,
		void *cb_arg)
{
	struct dma_test_req *req = cb_arg;
	struct dma_test_task *task = req->task;
	int rc;

	assert(task->io_inflight > 0);
	--task->io_inflight;
	dma_test_task_update_stats(task, req->submit_tsc);

	/* IO errors are expected when mkey corruption is enabled (-Y). */
	if (!success && !g_corrupt_mkey_counter) {
		if (!g_run_rc) {
			fprintf(stderr, "IO completed with error\n");
			g_run_rc = -1;
		}
		task->is_draining = true;
	}

	spdk_bdev_free_io(bdev_io);

	if (spdk_unlikely(task->is_draining)) {
		dma_test_check_and_signal_task_done(task);
		return;
	}

	/* Read back from the same offset the write just landed on. */
	req->submit_tsc = spdk_get_ticks();
	rc = spdk_bdev_readv_blocks_ext(task->desc, task->channel, req->iovs, g_iovcnt,
					req->io_offset, task->num_blocks_per_io,
					dma_test_bdev_io_completion_verify_read_done, req, &req->io_opts);
	if (spdk_unlikely(rc)) {
		if (!g_run_rc) {
			/* log an error only once */
			fprintf(stderr, "Failed to submit read IO, rc %d, stop sending IO\n", rc);
			g_run_rc = rc;
		}
		task->is_draining = true;
		dma_test_check_and_signal_task_done(task);
		return;
	}

	task->io_inflight++;
}
333 
/* Pick the offset (in IO-sized units) for the next IO of a given request.
 * In random verify mode, each (core, request) pair is shifted into its own
 * disjoint sub-range so a read-back can never race with another request's
 * write to the same LBA. */
static inline uint64_t
dma_test_get_offset_in_ios(struct dma_test_task *task, uint32_t req_offset)
{
	uint64_t offset;

	if (g_is_random) {
		offset = rand_r(&task->seed) % task->max_offset_in_ios;
		if (g_verify) {
			/* shift into this core's range, then into this request's slice */
			offset += task->num_blocks_per_core * task->idx;
			offset += task->max_offset_in_ios * req_offset;
		}
	} else {
		/* sequential mode: wrap around at the end of the range */
		offset = task->cur_io_offset++;
		if (spdk_unlikely(task->cur_io_offset == task->max_offset_in_ios)) {
			task->cur_io_offset = 0;
		}
	}

	return offset;
}
354 
355 static inline bool
356 dma_test_task_is_read(struct dma_test_task *task)
357 {
358 	if (g_verify) {
359 		return false;
360 	}
361 	if (task->rw_percentage == 100) {
362 		return true;
363 	}
364 	if (task->rw_percentage != 0 && (rand_r(&task->seed) % 100) <  task->rw_percentage) {
365 		return true;
366 	}
367 	return false;
368 }
369 
370 static void
371 dma_test_data_cpl(void *ctx)
372 {
373 	struct dma_test_data_cpl_ctx *cpl_ctx = ctx;
374 
375 	cpl_ctx->data_cpl(cpl_ctx->data_cpl_arg, 0);
376 	free(cpl_ctx);
377 }
378 
379 static int
380 dma_test_copy_memory(struct dma_test_req *req, struct iovec *dst_iov, uint32_t dst_iovcnt,
381 		     struct iovec *src_iov, uint32_t src_iovcnt, spdk_memory_domain_data_cpl_cb cpl_cb, void *cpl_cb_arg)
382 {
383 	struct dma_test_data_cpl_ctx *cpl_ctx;
384 
385 	cpl_ctx = calloc(1, sizeof(*cpl_ctx));
386 	if (!cpl_ctx) {
387 		return -ENOMEM;
388 	}
389 
390 	cpl_ctx->data_cpl = cpl_cb;
391 	cpl_ctx->data_cpl_arg = cpl_cb_arg;
392 
393 	spdk_iovcpy(src_iov, src_iovcnt, dst_iov, dst_iovcnt);
394 	req->task->num_pull_push++;
395 	spdk_thread_send_msg(req->task->thread, dma_test_data_cpl, cpl_ctx);
396 
397 	return 0;
398 }
399 
/* Memory domain "push" callback (device writes into domain memory):
 * delegate to the common copy helper. dst_domain_ctx is the request set in
 * io_opts.memory_domain_ctx. */
static int
dma_test_push_memory_cb(struct spdk_memory_domain *dst_domain,
			void *dst_domain_ctx,
			struct iovec *dst_iov, uint32_t dst_iovcnt, struct iovec *src_iov, uint32_t src_iovcnt,
			spdk_memory_domain_data_cpl_cb cpl_cb, void *cpl_cb_arg)
{
	struct dma_test_req *req = dst_domain_ctx;

	return dma_test_copy_memory(req, dst_iov, dst_iovcnt, src_iov, src_iovcnt, cpl_cb, cpl_cb_arg);
}
410 
/* Memory domain "pull" callback (device reads from domain memory):
 * delegate to the common copy helper with src/dst in copy order. */
static int
dma_test_pull_memory_cb(struct spdk_memory_domain *src_domain,
			void *src_domain_ctx,
			struct iovec *src_iov, uint32_t src_iovcnt, struct iovec *dst_iov, uint32_t dst_iovcnt,
			spdk_memory_domain_data_cpl_cb cpl_cb, void *cpl_cb_arg)
{
	struct dma_test_req *req = src_domain_ctx;

	return dma_test_copy_memory(req, dst_iov, dst_iovcnt, src_iov, src_iovcnt, cpl_cb, cpl_cb_arg);
}
421 
422 static int
423 dma_test_memzero_cb(struct spdk_memory_domain *src_domain, void *src_domain_ctx,
424 		    struct iovec *iov, uint32_t iovcnt,
425 		    spdk_memory_domain_data_cpl_cb cpl_cb, void *cpl_cb_arg)
426 {
427 	struct dma_test_req *req = src_domain_ctx;
428 	struct dma_test_data_cpl_ctx *cpl_ctx;
429 	uint32_t i;
430 
431 	cpl_ctx = calloc(1, sizeof(*cpl_ctx));
432 	if (!cpl_ctx) {
433 		return -ENOMEM;
434 	}
435 
436 	cpl_ctx->data_cpl = cpl_cb;
437 	cpl_ctx->data_cpl_arg = cpl_cb_arg;
438 
439 	for (i = 0; i < iovcnt; i++) {
440 		memset(iov[i].iov_base, 0, iov[i].iov_len);
441 	}
442 	req->task->num_mem_zero++;
443 
444 	spdk_thread_send_msg(req->task->thread, dma_test_data_cpl, cpl_ctx);
445 
446 	return 0;
447 }
448 
449 
/* Translation callback for the emulated source domain: validate that the
 * address lies within the request's buffer, lazily register an MR on the
 * destination QP's protection domain and return its lkey/rkey. When -Y is
 * set, every Nth translation returns deliberately invalid keys to exercise
 * error paths. Returns 0 on success, -1 on failure. */
static int
dma_test_translate_memory_cb(struct spdk_memory_domain *src_domain, void *src_domain_ctx,
			     struct spdk_memory_domain *dst_domain, struct spdk_memory_domain_translation_ctx *dst_domain_ctx,
			     void *addr, size_t len, struct spdk_memory_domain_translation_result *result)
{
	struct dma_test_req *req = src_domain_ctx;
	struct dma_test_task *task = req->task;
	struct ibv_qp *dst_domain_qp = (struct ibv_qp *)dst_domain_ctx->rdma.ibv_qp;

	/* The translated region must be fully inside this request's buffer. */
	if (spdk_unlikely(addr < req->buffer ||
			  (uint8_t *)addr + len > (uint8_t *)req->buffer + g_io_size)) {
		fprintf(stderr, "incorrect data %p, len %zu\n", addr, len);
		return -1;
	}

	/* Register the buffer on first use; the MR is reused afterwards and
	 * released in destroy_task(). */
	if (spdk_unlikely(!req->mr)) {
		req->mr = ibv_reg_mr(dst_domain_qp->pd, req->buffer, g_io_size,
				     IBV_ACCESS_LOCAL_WRITE |
				     IBV_ACCESS_REMOTE_READ |
				     IBV_ACCESS_REMOTE_WRITE);
		if (!req->mr) {
			fprintf(stderr, "Failed to register memory region, errno %d\n", errno);
			return -1;
		}
	}

	result->iov.iov_base = addr;
	result->iov.iov_len = len;
	result->iov_count = 1;
	result->rdma.lkey = req->mr->lkey;
	result->rdma.rkey = req->mr->rkey;
	result->dst_domain = dst_domain;

	task->num_translations++;

	/* Inject invalid keys every g_corrupt_mkey_counter translations (-Y). */
	if (g_corrupt_mkey_counter && task->num_translations >= g_corrupt_mkey_counter &&
	    task->num_translations % g_corrupt_mkey_counter == 0) {
		SPDK_NOTICELOG("Corrupt mkey on core %u\n", task->lcore);
		result->rdma.lkey = 0xffffffff;
		result->rdma.rkey = 0xffffffff;
	}

	return 0;
}
494 
/* Submit the next IO for a request: pick an offset, choose read/write per
 * the configured mix (verify mode always writes first) and issue it through
 * the bdev ext API so the memory domain callbacks are exercised.
 * On submission failure the task starts draining. Returns 0 or the
 * submission error code. */
static int
dma_test_submit_io(struct dma_test_req *req)
{
	struct dma_test_task *task = req->task;
	int rc;
	bool is_read;

	req->io_offset = dma_test_get_offset_in_ios(task, req->idx) * task->num_blocks_per_io;
	req->submit_tsc = spdk_get_ticks();
	is_read = dma_test_task_is_read(task);
	if (is_read) {
		rc = spdk_bdev_readv_blocks_ext(task->desc, task->channel, req->iovs, g_iovcnt,
						req->io_offset, task->num_blocks_per_io,
						dma_test_bdev_io_completion_cb, req, &req->io_opts);
	} else {
		/* in verify mode the write completion triggers a read-back */
		rc = spdk_bdev_writev_blocks_ext(task->desc, task->channel, req->iovs, g_iovcnt,
						 req->io_offset, task->num_blocks_per_io,
						 g_verify ? dma_test_bdev_io_completion_verify_write_done
						 : dma_test_bdev_io_completion_cb,
						 req, &req->io_opts);
	}

	if (spdk_unlikely(rc)) {
		if (!g_run_rc) {
			/* log an error only once */
			fprintf(stderr, "Failed to submit %s IO, rc %d, stop sending IO\n", is_read ? "read" : "write", rc);
			g_run_rc = rc;
		}
		task->is_draining = true;
		dma_test_check_and_signal_task_done(task);
		return rc;
	}

	task->io_inflight++;

	return 0;
}
532 
533 static void
534 dma_test_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
535 {
536 	struct dma_test_task *task = event_ctx;
537 
538 	if (type == SPDK_BDEV_EVENT_REMOVE) {
539 		task->is_draining = true;
540 	}
541 }
542 
/* No-op event callback for the short-lived descriptor opened in
 * dma_test_start() just to query bdev properties. */
static void
dma_test_bdev_dummy_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
			     void *event_ctx)
{
}
548 
549 static void
550 dma_test_task_run(void *ctx)
551 {
552 	struct dma_test_task *task = ctx;
553 	uint32_t i;
554 	int rc = 0;
555 
556 	for (i = 0; i < g_queue_depth && rc == 0; i++) {
557 		rc = dma_test_submit_io(&task->reqs[i]);
558 	}
559 }
560 
561 static void
562 dma_test_drain_task(void *ctx)
563 {
564 	struct dma_test_task *task = ctx;
565 
566 	task->is_draining = true;
567 }
568 
569 static void
570 dma_test_shutdown_cb(void)
571 {
572 	struct dma_test_task *task;
573 
574 	spdk_poller_unregister(&g_runtime_poller);
575 
576 	TAILQ_FOREACH(task, &g_tasks, link) {
577 		spdk_thread_send_msg(task->thread, dma_test_drain_task, task);
578 	}
579 }
580 
581 static int
582 dma_test_run_time_poller(void *ctx)
583 {
584 	g_run_count++;
585 
586 	if (g_run_count < g_run_time_sec) {
587 		if (isatty(STDOUT_FILENO)) {
588 			print_periodic_stats();
589 		}
590 	} else {
591 		dma_test_shutdown_cb();
592 	}
593 
594 	return SPDK_POLLER_BUSY;
595 }
596 
/* Runs on the main thread once per task that finished construction (with or
 * without error). When the last task reports in: abort on any init error,
 * otherwise start the 1-second runtime poller and kick off IO on every task. */
static void
dma_test_construct_task_done(void *ctx)
{
	struct dma_test_task *task;

	assert(g_num_construct_tasks > 0);
	--g_num_construct_tasks;

	if (g_num_construct_tasks != 0) {
		return;
	}

	if (g_run_rc) {
		fprintf(stderr, "Initialization failed with error %d\n", g_run_rc);
		spdk_app_stop(g_run_rc);
		return;
	}

	/* poller period is in microseconds: fire once per second */
	g_runtime_poller = spdk_poller_register_named(dma_test_run_time_poller, NULL, 1 * 1000 * 1000,
			   "dma_test_run_time_poller");
	if (!g_runtime_poller) {
		fprintf(stderr, "Failed to run timer\n");
		spdk_app_stop(-1);
		return;
	}

	printf("Initialization complete, running %s IO for %u sec on %u cores\n", g_rw_mode_str,
	       g_run_time_sec, spdk_env_get_core_count());
	g_start_tsc = spdk_get_ticks();
	TAILQ_FOREACH(task, &g_tasks, link) {
		spdk_thread_send_msg(task->thread, dma_test_task_run, task);
	}
}
630 
631 static void
632 dma_test_construct_task_on_thread(void *ctx)
633 {
634 	struct dma_test_task *task = ctx;
635 	int rc;
636 
637 	rc = spdk_bdev_open_ext(task->bdev_name, true, dma_test_bdev_event_cb, task, &task->desc);
638 	if (rc) {
639 		fprintf(stderr, "Failed to open bdev %s, rc %d\n", task->bdev_name, rc);
640 		g_run_rc = rc;
641 		spdk_thread_send_msg(g_main_thread, dma_test_construct_task_done, NULL);
642 		return;
643 	}
644 
645 	task->channel = spdk_bdev_get_io_channel(task->desc);
646 	if (!task->channel) {
647 		spdk_bdev_close(task->desc);
648 		task->desc = NULL;
649 		fprintf(stderr, "Failed to open bdev %s, rc %d\n", task->bdev_name, rc);
650 		g_run_rc = rc;
651 		spdk_thread_send_msg(g_main_thread, dma_test_construct_task_done, NULL);
652 		return;
653 	}
654 
655 	task->max_offset_in_ios = spdk_bdev_get_num_blocks(spdk_bdev_desc_get_bdev(
656 					  task->desc)) / task->num_blocks_per_io;
657 	if (g_verify) {
658 		/* In verify mode each req writes a buffer and then reads its context again. It is possible that
659 		 * while some req is reading a buffer, another req from another thread writes a new data to
660 		 * the same lba. To prevent it, split lba range among threads and then split a smaller range
661 		 * among requests */
662 		task->num_blocks_per_core = task->max_offset_in_ios / spdk_env_get_core_count();
663 		task->max_offset_in_ios = task->num_blocks_per_core;
664 		if (!task->max_offset_in_ios) {
665 			fprintf(stderr, "Disk is too small to run on %u cores\n", spdk_env_get_core_count());
666 			g_run_rc = -EINVAL;
667 			spdk_thread_send_msg(g_main_thread, dma_test_construct_task_done, NULL);
668 		}
669 		task->max_offset_in_ios /= g_queue_depth;
670 		if (!task->max_offset_in_ios) {
671 			fprintf(stderr, "Disk is too small to run on %u cores with qdepth %u\n", spdk_env_get_core_count(),
672 				g_queue_depth);
673 			g_run_rc = -EINVAL;
674 			spdk_thread_send_msg(g_main_thread, dma_test_construct_task_done, NULL);
675 		}
676 	}
677 
678 	spdk_thread_send_msg(g_main_thread, dma_test_construct_task_done, task);
679 }
680 
681 static bool
682 dma_test_check_bdev_supports_rdma_memory_domain(struct spdk_bdev *bdev)
683 {
684 	struct spdk_memory_domain **bdev_domains;
685 	int bdev_domains_count, bdev_domains_count_tmp, i;
686 	bool rdma_domain_supported = false;
687 
688 	bdev_domains_count = spdk_bdev_get_memory_domains(bdev, NULL, 0);
689 
690 	if (bdev_domains_count < 0) {
691 		fprintf(stderr, "Failed to get bdev memory domains count, rc %d\n", bdev_domains_count);
692 		return false;
693 	} else if (bdev_domains_count == 0) {
694 		fprintf(stderr, "bdev %s doesn't support any memory domains\n", spdk_bdev_get_name(bdev));
695 		return false;
696 	}
697 
698 	fprintf(stdout, "bdev %s reports %d memory domains\n", spdk_bdev_get_name(bdev),
699 		bdev_domains_count);
700 
701 	bdev_domains = calloc((size_t)bdev_domains_count, sizeof(*bdev_domains));
702 	if (!bdev_domains) {
703 		fprintf(stderr, "Failed to allocate memory domains\n");
704 		return false;
705 	}
706 
707 	bdev_domains_count_tmp = spdk_bdev_get_memory_domains(bdev, bdev_domains, bdev_domains_count);
708 	if (bdev_domains_count_tmp != bdev_domains_count) {
709 		fprintf(stderr, "Unexpected bdev domains return value %d\n", bdev_domains_count_tmp);
710 		return false;
711 	}
712 
713 	for (i = 0; i < bdev_domains_count; i++) {
714 		if (spdk_memory_domain_get_dma_device_type(bdev_domains[i]) == SPDK_DMA_DEVICE_TYPE_RDMA) {
715 			/* Bdev supports memory domain of RDMA type, we can try to submit IO request to it using
716 			 * bdev ext API */
717 			rdma_domain_supported = true;
718 			break;
719 		}
720 	}
721 
722 	fprintf(stdout, "bdev %s %s RDMA memory domain\n", spdk_bdev_get_name(bdev),
723 		rdma_domain_supported ? "supports" : "doesn't support");
724 	free(bdev_domains);
725 
726 	return rdma_domain_supported;
727 }
728 
729 static int
730 req_alloc_buffers(struct dma_test_req *req)
731 {
732 	struct spdk_md5ctx md5ctx;
733 	size_t iov_len, remainder;
734 	uint32_t i;
735 
736 	iov_len = g_io_size / g_iovcnt;
737 	remainder = g_io_size - iov_len * g_iovcnt;
738 
739 	req->buffer = malloc(g_io_size);
740 	if (!req->buffer) {
741 		return -ENOMEM;
742 	}
743 	memset(req->buffer, (int)req->idx + 1, g_io_size);
744 	req->iovs = calloc(g_iovcnt, sizeof(struct iovec));
745 	if (!req->iovs) {
746 		return -ENOMEM;
747 	}
748 	for (i = 0; i < g_iovcnt; i++) {
749 		req->iovs[i].iov_len = iov_len;
750 		req->iovs[i].iov_base = (uint8_t *)req->buffer + iov_len * i;
751 	}
752 	req->iovs[g_iovcnt - 1].iov_len += remainder;
753 	if (g_verify) {
754 		spdk_md5init(&md5ctx);
755 		spdk_md5update(&md5ctx, req->buffer, g_io_size);
756 		spdk_md5final(req->md5_orig, &md5ctx);
757 	}
758 
759 	return 0;
760 }
761 
/* Allocate and initialize one per-core task with its request pool and a
 * dedicated SPDK thread pinned to `core`. The task is linked into g_tasks
 * immediately, so on failure the caller's destroy_tasks() reclaims any
 * partial allocations. Returns 0 or a negative errno. */
static int
allocate_task(uint32_t core, const char *bdev_name)
{
	char thread_name[32];
	struct spdk_cpuset cpu_set;
	uint32_t i;
	struct dma_test_task *task;
	struct dma_test_req *req;
	int rc;

	task = calloc(1, sizeof(*task));
	if (!task) {
		fprintf(stderr, "Failed to allocate per thread task\n");
		return -ENOMEM;
	}

	/* link before filling in, so cleanup on error paths can find the task */
	TAILQ_INSERT_TAIL(&g_tasks, task, link);

	task->reqs = calloc(g_queue_depth, sizeof(*task->reqs));
	if (!task->reqs) {
		fprintf(stderr, "Failed to allocate requests\n");
		return -ENOMEM;
	}

	task->lcore = core;
	task->seed = core;
	for (i = 0; i < g_queue_depth; i++) {
		req = &task->reqs[i];
		req->task = task;
		req->idx = i;
		rc = req_alloc_buffers(req);
		if (rc) {
			fprintf(stderr, "Failed to allocate request data buffer\n");
			return rc;
		}

		/* route this request's IO through our emulated memory domain */
		req->io_opts.size = sizeof(req->io_opts);
		req->io_opts.memory_domain = g_domain;
		req->io_opts.memory_domain_ctx = req;
	}

	snprintf(thread_name, 32, "task_%u", core);
	spdk_cpuset_zero(&cpu_set);
	spdk_cpuset_set_cpu(&cpu_set, core, true);
	task->thread = spdk_thread_create(thread_name, &cpu_set);
	if (!task->thread) {
		fprintf(stderr, "Failed to create SPDK thread, core %u, cpu_mask %s\n", core,
			spdk_cpuset_fmt(&cpu_set));
		return -ENOMEM;
	}
	task->idx = g_num_construct_tasks++;
	task->bdev_name = bdev_name;
	task->rw_percentage = g_rw_percentage;
	task->num_blocks_per_io = g_num_blocks_per_io;
	/* start min at max so the first completion always updates it */
	task->stats.min_tsc = UINT64_MAX;

	return 0;
}
820 
821 static void
822 destroy_task(struct dma_test_task *task)
823 {
824 	struct dma_test_req *req;
825 	uint32_t i;
826 
827 	for (i = 0; i < g_queue_depth; i++) {
828 		req = &task->reqs[i];
829 		if (req->mr) {
830 			ibv_dereg_mr(req->mr);
831 		}
832 		free(req->buffer);
833 		free(req->iovs);
834 	}
835 	free(task->reqs);
836 	TAILQ_REMOVE(&g_tasks, task, link);
837 	free(task);
838 }
839 
840 static void
841 destroy_tasks(void)
842 {
843 	struct dma_test_task *task, *tmp_task;
844 
845 	TAILQ_FOREACH_SAFE(task, &g_tasks, link, tmp_task) {
846 		destroy_task(task);
847 	}
848 }
849 
/* After the run, check that every memory domain operation requested with -x
 * was actually exercised and that the operation totals are plausible.
 * Returns 0 on success, -EINVAL on any mismatch. */
static int
verify_tasks(void)
{
	struct dma_test_task *task;
	uint64_t total_requests = 0;
	uint64_t num_translations = 0;
	uint64_t num_pull_push = 0;
	uint64_t num_memzero = 0;
	int rc = 0;

	if (!g_test_ops) {
		/* No specific ops were requested, nothing to check */
		return rc;
	}

	/* aggregate per-core counters */
	TAILQ_FOREACH(task, &g_tasks, link) {
		total_requests += task->stats.io_completed;
		num_translations += task->num_translations;
		num_pull_push += task->num_pull_push;
		num_memzero += task->num_mem_zero;
	}

	if (g_test_ops & DMA_TEST_DOMAIN_OP_TRANSLATE) {
		if (num_translations == 0) {
			fprintf(stderr, "Requested \"translate\" operation, but it was not executed\n");
			rc = -EINVAL;
		}
	}
	if (g_test_ops & DMA_TEST_DOMAIN_OP_PULL_PUSH) {
		if (num_pull_push == 0) {
			fprintf(stderr, "Requested \"pull_push\" operation, but it was not executed\n");
			rc = -EINVAL;
		}
	}
	if (g_test_ops & DMA_TEST_DOMAIN_OP_MEMZERO) {
		if (num_memzero == 0) {
			fprintf(stderr, "Requested \"memzero\" operation, but it was not executed\n");
			rc = -EINVAL;
		}
	}

	/* A bdev request can be split, so the total number of translate, pull_push
	 * and memzero operations can exceed the total number of requests - but it
	 * must never be smaller. */
	if (num_translations + num_pull_push + num_memzero < total_requests) {
		fprintf(stderr,
			"Operations number mismatch: translate %"PRIu64", pull_push %"PRIu64", mem_zero %"PRIu64" expected total %"PRIu64"\n",
			num_translations, num_pull_push, num_memzero, total_requests);
		rc = -EINVAL;
	} else {
		fprintf(stdout,
			"Total operations: %"PRIu64", translate %"PRIu64" pull_push %"PRIu64" memzero %"PRIu64"\n",
			total_requests, num_translations, num_pull_push, num_memzero);
	}

	return rc;
}
906 
/* SPDK app entry point: validate the target bdev, create the emulated source
 * memory domain with translate/pull/push/memzero callbacks, allocate one task
 * per core and dispatch their construction to per-core threads. */
static void
dma_test_start(void *arg)
{
	struct spdk_bdev_desc *desc;
	struct spdk_bdev *bdev;
	struct dma_test_task *task;
	uint32_t block_size, i;
	int rc;

	/* temporary descriptor, only used to query bdev properties below */
	rc = spdk_bdev_open_ext(g_bdev_name, true, dma_test_bdev_dummy_event_cb, NULL, &desc);
	if (rc) {
		fprintf(stderr, "Can't find bdev %s\n", g_bdev_name);
		spdk_app_stop(-ENODEV);
		return;
	}
	bdev = spdk_bdev_desc_get_bdev(desc);
	/* This function checks if bdev supports memory domains. Test is not failed if there are
	 * no memory domains since bdev layer can pull/push data */
	if (!dma_test_check_bdev_supports_rdma_memory_domain(bdev) && g_force_memory_domains_support) {
		fprintf(stderr, "Test aborted due to \"-f\" (force memory domains support) option\n");
		spdk_bdev_close(desc);
		spdk_app_stop(-ENODEV);
		return;
	}

	g_main_thread = spdk_get_thread();

	/* io size must be a whole number of blocks */
	block_size = spdk_bdev_get_block_size(bdev);
	if (g_io_size < block_size || g_io_size % block_size != 0) {
		fprintf(stderr, "Invalid io_size %u requested, bdev block size %u\n", g_io_size, block_size);
		spdk_bdev_close(desc);
		spdk_app_stop(-EINVAL);
		return;
	}
	g_num_blocks_per_io = g_io_size / block_size;

	/* Create a memory domain to represent the source memory domain.
	 * Since we don't actually have a remote memory domain in this test, this will describe memory
	 * on the local system and the translation to the destination memory domain will be trivial.
	 * But this at least allows us to demonstrate the flow and test the functionality. */
	rc = spdk_memory_domain_create(&g_domain, SPDK_DMA_DEVICE_TYPE_RDMA, NULL, "test_dma");
	if (rc != 0) {
		spdk_bdev_close(desc);
		spdk_app_stop(rc);
		return;
	}
	spdk_memory_domain_set_translation(g_domain, dma_test_translate_memory_cb);
	spdk_memory_domain_set_pull(g_domain, dma_test_pull_memory_cb);
	spdk_memory_domain_set_push(g_domain, dma_test_push_memory_cb);
	spdk_memory_domain_set_memzero(g_domain, dma_test_memzero_cb);

	SPDK_ENV_FOREACH_CORE(i) {
		rc = allocate_task(i, g_bdev_name);
		if (rc) {
			destroy_tasks();
			spdk_bdev_close(desc);
			spdk_app_stop(rc);
			return;
		}
		g_num_complete_tasks++;
	}

	/* each task opens its own descriptor on its own thread */
	TAILQ_FOREACH(task, &g_tasks, link) {
		spdk_thread_send_msg(task->thread, dma_test_construct_task_on_thread, task);
	}

	spdk_bdev_close(desc);
}
975 
/* Print the option summary for this test app.
 * Fix vs. the previous version: the -x help omitted "memzero", which
 * parse_expected_ops() accepts. */
static void
print_usage(void)
{
	printf(" -b <bdev>         bdev name for test\n");
	printf(" -f                force memory domains support - abort test if bdev doesn't report memory domains\n");
	printf(" -q <val>          io depth\n");
	printf(" -o <val>          io size in bytes\n");
	printf(" -t <val>          run time in seconds\n");
	printf(" -x <op,op>        Comma separated memory domain operations expected in the test. Values are \"translate\", \"pull_push\" and \"memzero\"\n");
	printf(" -w <str>          io pattern (read, write, randread, randwrite, randrw)\n");
	printf(" -M <0-100>        rw percentage (100 for reads, 0 for writes)\n");
	printf(" -O <val>          iovs count to be used in IO, default 1\n");
	printf(" -Y <val>          Return invalid mkey each <val>th translation\n");
}
990 
991 static int
992 parse_expected_ops(const char *_str)
993 {
994 	char *str = strdup(_str);
995 	char *tok;
996 	char *sp = NULL;
997 	int rc = 0;
998 
999 	if (!str) {
1000 		fprintf(stderr, "Failed to dup args\n");
1001 		return -ENOMEM;
1002 	}
1003 
1004 	tok = strtok_r(str, ",", &sp);
1005 	while (tok) {
1006 		if (strcmp(tok, "translate") == 0) {
1007 			g_test_ops |= DMA_TEST_DOMAIN_OP_TRANSLATE;
1008 		} else if (strcmp(tok, "pull_push") == 0) {
1009 			g_test_ops |= DMA_TEST_DOMAIN_OP_PULL_PUSH;
1010 		} else if (strcmp(tok, "memzero") == 0) {
1011 			g_test_ops |= DMA_TEST_DOMAIN_OP_MEMZERO;
1012 		} else {
1013 			fprintf(stderr, "Unknown value %s\n", tok);
1014 			rc = -EINVAL;
1015 			break;
1016 		}
1017 		tok = strtok_r(NULL, ",", &sp);
1018 	}
1019 
1020 	free(str);
1021 
1022 	if (g_test_ops == 0 || rc) {
1023 		fprintf(stderr, "-e \"%s\" specified but nothing was parsed\n", _str);
1024 		return -EINVAL;
1025 	}
1026 
1027 	return rc;
1028 }
1029 
1030 static int
1031 parse_arg(int ch, char *arg)
1032 {
1033 	long tmp;
1034 
1035 	switch (ch) {
1036 	case 'q':
1037 	case 'o':
1038 	case 't':
1039 	case 'M':
1040 	case 'O':
1041 	case 'Y':
1042 		tmp = spdk_strtol(arg, 10);
1043 		if (tmp < 0) {
1044 			fprintf(stderr, "Invalid option %c value %s\n", ch, arg);
1045 			return 1;
1046 		}
1047 
1048 		switch (ch) {
1049 		case 'q':
1050 			g_queue_depth = (uint32_t) tmp;
1051 			break;
1052 		case 'o':
1053 			g_io_size = (uint32_t) tmp;
1054 			break;
1055 		case 't':
1056 			g_run_time_sec = (uint32_t) tmp;
1057 			break;
1058 		case 'M':
1059 			g_rw_percentage = (uint32_t) tmp;
1060 			break;
1061 		case 'O':
1062 			g_iovcnt = (uint32_t) tmp;
1063 			break;
1064 		case 'Y':
1065 			g_corrupt_mkey_counter = (uint32_t) tmp;
1066 			break;
1067 		}
1068 		break;
1069 	case 'w':
1070 		g_rw_mode_str = arg;
1071 		break;
1072 	case 'b':
1073 		g_bdev_name = arg;
1074 		break;
1075 	case 'f':
1076 		g_force_memory_domains_support = true;
1077 		break;
1078 	case 'x':
1079 		if (parse_expected_ops(arg)) {
1080 			return 1;
1081 		}
1082 		break;
1083 	default:
1084 		fprintf(stderr, "Unknown option %c\n", ch);
1085 		return 1;
1086 	}
1087 
1088 	return 0;
1089 }
1090 
1091 static int
1092 verify_args(void)
1093 {
1094 	const char *rw_mode = g_rw_mode_str;
1095 
1096 	if (g_queue_depth == 0) {
1097 		fprintf(stderr, "queue depth (-q) is not set\n");
1098 		return 1;
1099 	}
1100 	if (g_io_size == 0) {
1101 		fprintf(stderr, "io size (-o) is not set\n");
1102 		return 1;
1103 	}
1104 	if (g_iovcnt == 0) {
1105 		fprintf(stderr, "iov count (-O) is invalid\n");
1106 		return 1;
1107 	}
1108 	if (g_run_time_sec == 0) {
1109 		fprintf(stderr, "test run time (-t) is not set\n");
1110 		return 1;
1111 	}
1112 	if (!rw_mode) {
1113 		fprintf(stderr, "io pattern (-w) is not set\n");
1114 		return 1;
1115 	}
1116 	if (strncmp(rw_mode, "rand", 4) == 0) {
1117 		g_is_random = true;
1118 		rw_mode = &rw_mode[4];
1119 	}
1120 	if (strcmp(rw_mode, "read") == 0 || strcmp(rw_mode, "write") == 0) {
1121 		if (g_rw_percentage > 0) {
1122 			fprintf(stderr, "Ignoring -M option\n");
1123 		}
1124 		g_rw_percentage = strcmp(rw_mode, "read") == 0 ? 100 : 0;
1125 	} else if (strcmp(rw_mode, "rw") == 0) {
1126 		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
1127 			fprintf(stderr, "Invalid -M value (%d) must be 0..100\n", g_rw_percentage);
1128 			return 1;
1129 		}
1130 	} else if (strcmp(rw_mode, "verify") == 0) {
1131 		g_is_random = true;
1132 		g_verify = true;
1133 		if (g_rw_percentage > 0) {
1134 			fprintf(stderr, "Ignoring -M option\n");
1135 		}
1136 	} else {
1137 		fprintf(stderr, "io pattern (-w) one of [read, write, randread, randwrite, rw, randrw, verify]\n");
1138 		return 1;
1139 	}
1140 	if (!g_bdev_name) {
1141 		fprintf(stderr, "bdev name (-b) is not set\n");
1142 		return 1;
1143 	}
1144 
1145 	return 0;
1146 }
1147 
1148 int
1149 main(int argc, char **argv)
1150 {
1151 	struct spdk_app_opts opts = {};
1152 	int rc;
1153 
1154 	spdk_app_opts_init(&opts, sizeof(opts));
1155 	opts.name = "test_dma";
1156 	opts.shutdown_cb = dma_test_shutdown_cb;
1157 	opts.rpc_addr = NULL;
1158 
1159 	rc = spdk_app_parse_args(argc, argv, &opts, "b:fq:o:t:x:w:M:O:Y:", NULL, parse_arg, print_usage);
1160 	if (rc != SPDK_APP_PARSE_ARGS_SUCCESS) {
1161 		exit(rc);
1162 	}
1163 
1164 	rc = verify_args();
1165 	if (rc) {
1166 		exit(rc);
1167 	}
1168 
1169 	rc = spdk_app_start(&opts, dma_test_start, NULL);
1170 	if (rc == 0) {
1171 		rc = verify_tasks();
1172 	}
1173 	destroy_tasks();
1174 	spdk_app_fini();
1175 
1176 	return rc;
1177 }
1178