xref: /spdk/examples/bdev/bdevperf/bdevperf.c (revision 7aa2cc29c0b532d2ec949c8d0c6084df9a3d6cab)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation.
3  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
4  *   All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 #include "spdk/accel.h"
11 #include "spdk/endian.h"
12 #include "spdk/env.h"
13 #include "spdk/event.h"
14 #include "spdk/log.h"
15 #include "spdk/util.h"
16 #include "spdk/thread.h"
17 #include "spdk/string.h"
18 #include "spdk/rpc.h"
19 #include "spdk/bit_array.h"
20 #include "spdk/conf.h"
21 #include "spdk/zipf.h"
22 
23 #define BDEVPERF_CONFIG_MAX_FILENAME 1024
24 #define BDEVPERF_CONFIG_UNDEFINED -1
25 #define BDEVPERF_CONFIG_ERROR -2
26 
27 struct bdevperf_task {
28 	struct iovec			iov;
29 	struct bdevperf_job		*job;
30 	struct spdk_bdev_io		*bdev_io;
31 	void				*buf;
32 	void				*md_buf;
33 	uint64_t			offset_blocks;
34 	struct bdevperf_task		*task_to_abort;
35 	enum spdk_bdev_io_type		io_type;
36 	TAILQ_ENTRY(bdevperf_task)	link;
37 	struct spdk_bdev_io_wait_entry	bdev_io_wait;
38 };
39 
40 static const char *g_workload_type = NULL;
41 static int g_io_size = 0;
42 /* initialize to invalid value so we can detect if user overrides it. */
43 static int g_rw_percentage = -1;
44 static bool g_verify = false;
45 static bool g_reset = false;
46 static bool g_continue_on_failure = false;
47 static bool g_abort = false;
48 static bool g_error_to_exit = false;
49 static int g_queue_depth = 0;
50 static uint64_t g_time_in_usec;
51 static int g_show_performance_real_time = 0;
52 static uint64_t g_show_performance_period_in_usec = 1000000;
53 static uint64_t g_show_performance_period_num = 0;
54 static uint64_t g_show_performance_ema_period = 0;
55 static int g_run_rc = 0;
56 static bool g_shutdown = false;
57 static uint64_t g_start_tsc;
58 static uint64_t g_shutdown_tsc;
59 static bool g_zcopy = false;
60 static struct spdk_thread *g_main_thread;
61 static int g_time_in_sec = 0;
62 static bool g_mix_specified = false;
63 static const char *g_job_bdev_name;
64 static bool g_wait_for_tests = false;
65 static struct spdk_jsonrpc_request *g_request = NULL;
66 static bool g_multithread_mode = false;
67 static int g_timeout_in_sec;
68 static struct spdk_conf *g_bdevperf_conf = NULL;
69 static const char *g_bdevperf_conf_file = NULL;
70 static double g_zipf_theta;
71 
72 static struct spdk_cpuset g_all_cpuset;
73 static struct spdk_poller *g_perf_timer = NULL;
74 
75 static void bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task);
76 static void rpc_perform_tests_cb(void);
77 
/* One workload instance: a bdev, the thread it runs on, its settings and counters. */
struct bdevperf_job {
	/* Label printed in statistics and error messages; freed at teardown. */
	char *name;
	struct spdk_bdev *bdev;
	/* Open descriptor; closed once the job has fully drained. */
	struct spdk_bdev_desc *bdev_desc;
	/* I/O channel; released once the job has fully drained. */
	struct spdk_io_channel *ch;
	/* Linkage on g_bdevperf.jobs. */
	TAILQ_ENTRY(bdevperf_job) link;
	/* Thread the job runs on; per-job statistics messages are sent to it. */
	struct spdk_thread *thread;

	/* Workload settings. */
	const char *workload_type;
	int io_size;
	/* Share of reads, in percent, for mixed workloads. */
	int rw_percentage;
	bool is_random;
	bool verify;
	bool reset;
	bool continue_on_failure;
	bool unmap;
	bool write_zeroes;
	bool flush;
	bool abort;
	int queue_depth;
	/* rand_r() state for offset and read/write selection. */
	unsigned int seed;

	/* Counters. */
	uint64_t io_completed;
	uint64_t io_failed;
	uint64_t io_timeout;
	/* State carried between exponential-moving-average samples. */
	uint64_t prev_io_completed;
	double ema_io_per_second;
	/* Number of I/Os currently in flight. */
	int current_queue_depth;
	/* Size of the job's range, start of the range, and sequential cursor, all in units of I/Os. */
	uint64_t size_in_ios;
	uint64_t ios_base;
	uint64_t offset_in_ios;
	uint64_t io_size_blocks;
	uint64_t buf_size;
	uint32_t dif_check_flags;
	/* Set once no replacement I/O should be submitted any more. */
	bool is_draining;
	struct spdk_poller *run_timer;
	struct spdk_poller *reset_timer;
	/* In verify mode, one bit per I/O-sized slot that currently has an I/O in flight. */
	struct spdk_bit_array *outstanding;
	struct spdk_zipf *zipf;
	/* Idle tasks. */
	TAILQ_HEAD(, bdevperf_task) task_list;
	/* Elapsed time recorded when the last in-flight I/O of a draining job finished. */
	uint64_t run_time_in_usec;
};
120 
/* Top-level state: every job, plus a count of jobs that have not finished yet. */
struct spdk_bdevperf {
	TAILQ_HEAD(, bdevperf_job) jobs;
	/* Decremented on the main thread as each job ends; zero means the test is done. */
	uint32_t running_jobs;
};

/* The single application-wide instance, starting with an empty job list. */
static struct spdk_bdevperf g_bdevperf = {
	.running_jobs = 0,
	.jobs = TAILQ_HEAD_INITIALIZER(g_bdevperf.jobs),
};
130 
/* Workload kinds a job config section can select through its rw field. */
enum job_config_rw {
	JOB_CONFIG_RW_READ		= 0,
	JOB_CONFIG_RW_WRITE		= 1,
	JOB_CONFIG_RW_RANDREAD		= 2,
	JOB_CONFIG_RW_RANDWRITE		= 3,
	JOB_CONFIG_RW_RW		= 4,
	JOB_CONFIG_RW_RANDRW		= 5,
	JOB_CONFIG_RW_VERIFY		= 6,
	JOB_CONFIG_RW_RESET		= 7,
	JOB_CONFIG_RW_UNMAP		= 8,
	JOB_CONFIG_RW_FLUSH		= 9,
	JOB_CONFIG_RW_WRITE_ZEROES	= 10,
};
144 
145 /* Storing values from a section of job config file */
146 struct job_config {
147 	const char			*name;
148 	const char			*filename;
149 	struct spdk_cpuset		cpumask;
150 	int				bs;
151 	int				iodepth;
152 	int				rwmixread;
153 	int64_t				offset;
154 	uint64_t			length;
155 	enum job_config_rw		rw;
156 	TAILQ_ENTRY(job_config)	link;
157 };
158 
159 TAILQ_HEAD(, job_config) job_config_list
160 	= TAILQ_HEAD_INITIALIZER(job_config_list);
161 
/* True while a periodic statistics dump is walking the job list. */
static bool g_performance_dump_active = false;

/* Accumulator handed from job to job while a statistics table is printed. */
struct bdevperf_aggregate_stats {
	/* Job whose line is printed next during a periodic dump. */
	struct bdevperf_job *current_job;
	/* Elapsed time used as the rate denominator. */
	uint64_t io_time_in_usec;
	/* Zero selects the cumulative average; non-zero selects the exponential one. */
	uint64_t ema_period;
	/* Running sums for the "Total" line. */
	double total_io_per_second;
	double total_mb_per_second;
	double total_failed_per_second;
	double total_timeout_per_second;
};

/* Accumulator used for the final report printed by bdevperf_test_done(). */
static struct bdevperf_aggregate_stats g_stats = {};
175 
176 /*
177  * Cumulative Moving Average (CMA): average of all data up to current
178  * Exponential Moving Average (EMA): weighted mean of the previous n data and more weight is given to recent
179  * Simple Moving Average (SMA): unweighted mean of the previous n data
180  *
181  * Bdevperf supports CMA and EMA.
182  */
183 static double
184 get_cma_io_per_second(struct bdevperf_job *job, uint64_t io_time_in_usec)
185 {
186 	return (double)job->io_completed * 1000000 / io_time_in_usec;
187 }
188 
189 static double
190 get_ema_io_per_second(struct bdevperf_job *job, uint64_t ema_period)
191 {
192 	double io_completed, io_per_second;
193 
194 	io_completed = job->io_completed;
195 	io_per_second = (double)(io_completed - job->prev_io_completed) * 1000000
196 			/ g_show_performance_period_in_usec;
197 	job->prev_io_completed = io_completed;
198 
199 	job->ema_io_per_second += (io_per_second - job->ema_io_per_second) * 2
200 				  / (ema_period + 1);
201 	return job->ema_io_per_second;
202 }
203 
204 static void
205 performance_dump_job(struct bdevperf_aggregate_stats *stats, struct bdevperf_job *job)
206 {
207 	double io_per_second, mb_per_second, failed_per_second, timeout_per_second;
208 	uint64_t time_in_usec;
209 
210 	printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread),
211 	       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));
212 
213 	if (job->io_failed > 0 && !job->reset && !job->continue_on_failure) {
214 		printf("\r Job: %s ended in about %.2f seconds with error\n",
215 		       spdk_thread_get_name(job->thread), (double)job->run_time_in_usec / 1000000);
216 	}
217 	if (job->verify) {
218 		printf("\t Verification LBA range: start 0x%" PRIx64 " length 0x%" PRIx64 "\n",
219 		       job->ios_base, job->size_in_ios);
220 	}
221 
222 	if (g_performance_dump_active == true) {
223 		/* Use job's actual run time as Job has ended */
224 		if (job->io_failed > 0 && !job->continue_on_failure) {
225 			time_in_usec = job->run_time_in_usec;
226 		} else {
227 			time_in_usec = stats->io_time_in_usec;
228 		}
229 	} else {
230 		time_in_usec = job->run_time_in_usec;
231 	}
232 
233 	if (stats->ema_period == 0) {
234 		io_per_second = get_cma_io_per_second(job, time_in_usec);
235 	} else {
236 		io_per_second = get_ema_io_per_second(job, stats->ema_period);
237 	}
238 	mb_per_second = io_per_second * job->io_size / (1024 * 1024);
239 
240 	failed_per_second = (double)job->io_failed * 1000000 / time_in_usec;
241 	timeout_per_second = (double)job->io_timeout * 1000000 / time_in_usec;
242 
243 	printf("\t %-20s: %10.2f %10.2f %10.2f",
244 	       job->name, (float)time_in_usec / 1000000, io_per_second, mb_per_second);
245 	printf(" %10.2f %8.2f\n",
246 	       failed_per_second, timeout_per_second);
247 
248 	stats->total_io_per_second += io_per_second;
249 	stats->total_mb_per_second += mb_per_second;
250 	stats->total_failed_per_second += failed_per_second;
251 	stats->total_timeout_per_second += timeout_per_second;
252 }
253 
/*
 * Fill an I/O buffer with a deterministic pattern.
 *
 * Each block's data area is filled with 32-bit words whose value is
 * (block index + byte offset within the data area); each block's metadata
 * is filled with the low byte of the block index.
 *
 * buf         data buffer, num_blocks blocks of block_size bytes each
 * buf_len     size of buf in bytes; nothing is written if it is too small
 * block_size  bytes per block (includes metadata when md_buf is NULL)
 * md_buf      separate metadata buffer, or NULL when metadata is interleaved
 *             at the end of every block in buf
 * md_size     metadata bytes per block
 * num_blocks  number of blocks to fill
 */
static void
generate_data(void *buf, int buf_len, int block_size, void *md_buf, int md_size,
	      int num_blocks)
{
	const int word_size = (int)sizeof(uint32_t);
	uint8_t *data = buf;
	uint8_t *md;
	int data_block_size, md_stride;
	int blk, inner_offset;

	if (buf_len < num_blocks * block_size) {
		return;
	}

	if (md_buf == NULL) {
		/* Interleaved layout: metadata occupies the tail of every block. */
		data_block_size = block_size - md_size;
		md = data + data_block_size;
		md_stride = block_size;
	} else {
		data_block_size = block_size;
		md = md_buf;
		md_stride = md_size;
	}

	for (blk = 0; blk < num_blocks; blk++) {
		/* Every block starts on a block_size boundary, so with interleaved
		 * metadata the data of block N never lands on the metadata of
		 * block N-1.
		 */
		uint8_t *block = data + (size_t)blk * block_size;

		for (inner_offset = 0; inner_offset < data_block_size; inner_offset += word_size) {
			uint32_t pattern = blk + inner_offset;
			int remaining = data_block_size - inner_offset;

			/* memcpy avoids unaligned stores and clips a partial last word. */
			memcpy(block + inner_offset, &pattern,
			       remaining < word_size ? remaining : word_size);
		}

		memset(md, blk, md_size);
		md += md_stride;
	}
}
285 
/*
 * Copy num_blocks blocks of data, and metadata when separate metadata buffers
 * are supplied, from the rd_* buffers into the wr_* buffers.
 *
 * Returns false without copying if either data buffer is shorter than
 * num_blocks * block_size bytes, true otherwise. The two metadata pointers
 * must be both NULL or both non-NULL.
 */
static bool
copy_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	  void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks)
{
	const int data_len = num_blocks * block_size;

	if (!(wr_buf_len >= data_len && rd_buf_len >= data_len)) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	memcpy(wr_buf, rd_buf, data_len);

	if (wr_md_buf == NULL) {
		return true;
	}

	memcpy(wr_md_buf, rd_md_buf, md_size * num_blocks);
	return true;
}
304 
/*
 * Compare a written buffer against a read-back buffer, block by block.
 *
 * Only the data portion of each block is always compared; metadata is
 * compared only when md_check is true. When the metadata pointers are NULL
 * the metadata is taken to be interleaved at the end of every block.
 *
 * Returns false if either buffer is shorter than num_blocks * block_size
 * bytes or on the first mismatch, true otherwise.
 */
static bool
verify_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	    void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks, bool md_check)
{
	const unsigned char *wr = wr_buf;
	const unsigned char *rd = rd_buf;
	const unsigned char *wr_md;
	const unsigned char *rd_md;
	int data_block_size, md_stride, blk;

	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	if (wr_md_buf != NULL) {
		data_block_size = block_size;
		md_stride = md_size;
		wr_md = wr_md_buf;
		rd_md = rd_md_buf;
	} else {
		data_block_size = block_size - md_size;
		md_stride = block_size;
		wr_md = wr + data_block_size;
		rd_md = rd + data_block_size;
	}

	for (blk = 0; blk < num_blocks; blk++) {
		if (memcmp(wr, rd, data_block_size) != 0) {
			return false;
		}

		wr += block_size;
		rd += block_size;

		if (!md_check) {
			continue;
		}

		if (memcmp(wr_md, rd_md, md_size) != 0) {
			return false;
		}

		wr_md += md_stride;
		rd_md += md_stride;
	}

	return true;
}
349 
350 static void
351 free_job_config(void)
352 {
353 	struct job_config *config, *tmp;
354 
355 	spdk_conf_free(g_bdevperf_conf);
356 	g_bdevperf_conf = NULL;
357 
358 	TAILQ_FOREACH_SAFE(config, &job_config_list, link, tmp) {
359 		TAILQ_REMOVE(&job_config_list, config, link);
360 		free(config);
361 	}
362 }
363 
364 static void
365 bdevperf_test_done(void *ctx)
366 {
367 	struct bdevperf_job *job, *jtmp;
368 	struct bdevperf_task *task, *ttmp;
369 	int rc;
370 	uint64_t time_in_usec;
371 
372 	if (g_time_in_usec) {
373 		g_stats.io_time_in_usec = g_time_in_usec;
374 
375 		if (!g_run_rc && g_performance_dump_active) {
376 			spdk_thread_send_msg(spdk_get_thread(), bdevperf_test_done, NULL);
377 			return;
378 		}
379 	}
380 
381 	if (g_show_performance_real_time) {
382 		spdk_poller_unregister(&g_perf_timer);
383 	}
384 
385 	if (g_shutdown) {
386 		g_shutdown_tsc = spdk_get_ticks() - g_start_tsc;
387 		time_in_usec = g_shutdown_tsc * 1000000 / spdk_get_ticks_hz();
388 		g_time_in_usec = (g_time_in_usec > time_in_usec) ? time_in_usec : g_time_in_usec;
389 		printf("Received shutdown signal, test time was about %.6f seconds\n",
390 		       (double)g_time_in_usec / 1000000);
391 	}
392 
393 	printf("\n\r %-*s: %10s %10s %10s %10s %8s\n",
394 	       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s");
395 
396 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
397 		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
398 
399 		performance_dump_job(&g_stats, job);
400 
401 		TAILQ_FOREACH_SAFE(task, &job->task_list, link, ttmp) {
402 			TAILQ_REMOVE(&job->task_list, task, link);
403 			spdk_free(task->buf);
404 			spdk_free(task->md_buf);
405 			free(task);
406 		}
407 
408 		spdk_bit_array_free(&job->outstanding);
409 		spdk_zipf_free(&job->zipf);
410 		free(job->name);
411 		free(job);
412 	}
413 
414 	printf("\r ==================================================================================\n");
415 	printf("\r %-28s: %10s %10.2f %10.2f",
416 	       "Total", "", g_stats.total_io_per_second, g_stats.total_mb_per_second);
417 	printf(" %10.2f %8.2f\n",
418 	       g_stats.total_failed_per_second, g_stats.total_timeout_per_second);
419 	fflush(stdout);
420 
421 	rc = g_run_rc;
422 	if (g_request && !g_shutdown) {
423 		rpc_perform_tests_cb();
424 		if (rc != 0) {
425 			spdk_app_stop(rc);
426 		}
427 	} else {
428 		spdk_app_stop(rc);
429 	}
430 }
431 
432 static void
433 bdevperf_job_end(void *ctx)
434 {
435 	assert(g_main_thread == spdk_get_thread());
436 
437 	if (--g_bdevperf.running_jobs == 0) {
438 		bdevperf_test_done(NULL);
439 	}
440 }
441 
442 static void
443 bdevperf_end_task(struct bdevperf_task *task)
444 {
445 	struct bdevperf_job     *job = task->job;
446 	uint64_t		end_tsc = 0;
447 
448 	TAILQ_INSERT_TAIL(&job->task_list, task, link);
449 	if (job->is_draining) {
450 		if (job->current_queue_depth == 0) {
451 			end_tsc = spdk_get_ticks() - g_start_tsc;
452 			job->run_time_in_usec = end_tsc * 1000000 / spdk_get_ticks_hz();
453 			spdk_put_io_channel(job->ch);
454 			spdk_bdev_close(job->bdev_desc);
455 			spdk_thread_send_msg(g_main_thread, bdevperf_job_end, NULL);
456 		}
457 	}
458 }
459 
460 static void
461 bdevperf_queue_io_wait_with_cb(struct bdevperf_task *task, spdk_bdev_io_wait_cb cb_fn)
462 {
463 	struct bdevperf_job	*job = task->job;
464 
465 	task->bdev_io_wait.bdev = job->bdev;
466 	task->bdev_io_wait.cb_fn = cb_fn;
467 	task->bdev_io_wait.cb_arg = task;
468 	spdk_bdev_queue_io_wait(job->bdev, job->ch, &task->bdev_io_wait);
469 }
470 
471 static int
472 bdevperf_job_drain(void *ctx)
473 {
474 	struct bdevperf_job *job = ctx;
475 
476 	spdk_poller_unregister(&job->run_timer);
477 	if (job->reset) {
478 		spdk_poller_unregister(&job->reset_timer);
479 	}
480 
481 	job->is_draining = true;
482 
483 	return -1;
484 }
485 
486 static void
487 bdevperf_abort_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
488 {
489 	struct bdevperf_task	*task = cb_arg;
490 	struct bdevperf_job	*job = task->job;
491 
492 	job->current_queue_depth--;
493 
494 	if (success) {
495 		job->io_completed++;
496 	} else {
497 		job->io_failed++;
498 		if (!job->continue_on_failure) {
499 			bdevperf_job_drain(job);
500 			g_run_rc = -1;
501 		}
502 	}
503 
504 	spdk_bdev_free_io(bdev_io);
505 	bdevperf_end_task(task);
506 }
507 
508 static int
509 bdevperf_verify_dif(struct bdevperf_task *task, struct iovec *iovs, int iovcnt)
510 {
511 	struct bdevperf_job	*job = task->job;
512 	struct spdk_bdev	*bdev = job->bdev;
513 	struct spdk_dif_ctx	dif_ctx;
514 	struct spdk_dif_error	err_blk = {};
515 	int			rc;
516 
517 	rc = spdk_dif_ctx_init(&dif_ctx,
518 			       spdk_bdev_get_block_size(bdev),
519 			       spdk_bdev_get_md_size(bdev),
520 			       spdk_bdev_is_md_interleaved(bdev),
521 			       spdk_bdev_is_dif_head_of_md(bdev),
522 			       spdk_bdev_get_dif_type(bdev),
523 			       job->dif_check_flags,
524 			       task->offset_blocks, 0, 0, 0, 0);
525 	if (rc != 0) {
526 		fprintf(stderr, "Initialization of DIF context failed\n");
527 		return rc;
528 	}
529 
530 	if (spdk_bdev_is_md_interleaved(bdev)) {
531 		rc = spdk_dif_verify(iovs, iovcnt, job->io_size_blocks, &dif_ctx, &err_blk);
532 	} else {
533 		struct iovec md_iov = {
534 			.iov_base	= task->md_buf,
535 			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
536 		};
537 
538 		rc = spdk_dix_verify(iovs, iovcnt, &md_iov, job->io_size_blocks, &dif_ctx, &err_blk);
539 	}
540 
541 	if (rc != 0) {
542 		fprintf(stderr, "DIF/DIX error detected. type=%d, offset=%" PRIu32 "\n",
543 			err_blk.err_type, err_blk.err_offset);
544 	}
545 
546 	return rc;
547 }
548 
549 static void
550 bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
551 {
552 	struct bdevperf_job	*job;
553 	struct bdevperf_task	*task = cb_arg;
554 	struct iovec		*iovs;
555 	int			iovcnt;
556 	bool			md_check;
557 	uint64_t		offset_in_ios;
558 	int			rc;
559 
560 	job = task->job;
561 	md_check = spdk_bdev_get_dif_type(job->bdev) == SPDK_DIF_DISABLE;
562 
563 	if (g_error_to_exit == true) {
564 		bdevperf_job_drain(job);
565 	} else if (!success) {
566 		if (!job->reset && !job->continue_on_failure) {
567 			bdevperf_job_drain(job);
568 			g_run_rc = -1;
569 			g_error_to_exit = true;
570 			printf("task offset: %" PRIu64 " on job bdev=%s fails\n",
571 			       task->offset_blocks, job->name);
572 		}
573 	} else if (job->verify || job->reset) {
574 		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
575 		assert(iovcnt == 1);
576 		assert(iovs != NULL);
577 		if (!verify_data(task->buf, job->buf_size, iovs[0].iov_base, iovs[0].iov_len,
578 				 spdk_bdev_get_block_size(job->bdev),
579 				 task->md_buf, spdk_bdev_io_get_md_buf(bdev_io),
580 				 spdk_bdev_get_md_size(job->bdev),
581 				 job->io_size_blocks, md_check)) {
582 			printf("Buffer mismatch! Target: %s Disk Offset: %" PRIu64 "\n", job->name, task->offset_blocks);
583 			printf("   First dword expected 0x%x got 0x%x\n", *(int *)task->buf, *(int *)iovs[0].iov_base);
584 			bdevperf_job_drain(job);
585 			g_run_rc = -1;
586 		}
587 	} else if (job->dif_check_flags != 0) {
588 		if (task->io_type == SPDK_BDEV_IO_TYPE_READ && spdk_bdev_get_md_size(job->bdev) != 0) {
589 			spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
590 			assert(iovcnt == 1);
591 			assert(iovs != NULL);
592 			rc = bdevperf_verify_dif(task, iovs, iovcnt);
593 			if (rc != 0) {
594 				printf("DIF error detected. task offset: %" PRIu64 " on job bdev=%s\n",
595 				       task->offset_blocks, job->name);
596 
597 				success = false;
598 				if (!job->reset && !job->continue_on_failure) {
599 					bdevperf_job_drain(job);
600 					g_run_rc = -1;
601 					g_error_to_exit = true;
602 				}
603 			}
604 		}
605 	}
606 
607 	job->current_queue_depth--;
608 
609 	if (success) {
610 		job->io_completed++;
611 	} else {
612 		job->io_failed++;
613 	}
614 
615 	if (job->verify) {
616 		assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
617 		offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;
618 
619 		assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
620 		spdk_bit_array_clear(job->outstanding, offset_in_ios);
621 	}
622 
623 	spdk_bdev_free_io(bdev_io);
624 
625 	/*
626 	 * is_draining indicates when time has expired for the test run
627 	 * and we are just waiting for the previously submitted I/O
628 	 * to complete.  In this case, do not submit a new I/O to replace
629 	 * the one just completed.
630 	 */
631 	if (!job->is_draining) {
632 		bdevperf_submit_single(job, task);
633 	} else {
634 		bdevperf_end_task(task);
635 	}
636 }
637 
638 static void
639 bdevperf_verify_submit_read(void *cb_arg)
640 {
641 	struct bdevperf_job	*job;
642 	struct bdevperf_task	*task = cb_arg;
643 	int			rc;
644 
645 	job = task->job;
646 
647 	/* Read the data back in */
648 	rc = spdk_bdev_read_blocks_with_md(job->bdev_desc, job->ch, NULL, NULL,
649 					   task->offset_blocks, job->io_size_blocks,
650 					   bdevperf_complete, task);
651 
652 	if (rc == -ENOMEM) {
653 		bdevperf_queue_io_wait_with_cb(task, bdevperf_verify_submit_read);
654 	} else if (rc != 0) {
655 		printf("Failed to submit read: %d\n", rc);
656 		bdevperf_job_drain(job);
657 		g_run_rc = rc;
658 	}
659 }
660 
/*
 * Completion callback for the write half of a verify cycle. A failed write is
 * handed to the common completion path; a successful one releases its bdev_io
 * and starts the read-back.
 */
static void
bdevperf_verify_write_complete(struct spdk_bdev_io *bdev_io, bool success,
			       void *cb_arg)
{
	if (!success) {
		bdevperf_complete(bdev_io, success, cb_arg);
		return;
	}

	spdk_bdev_free_io(bdev_io);
	bdevperf_verify_submit_read(cb_arg);
}
672 
673 static void
674 bdevperf_zcopy_populate_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
675 {
676 	if (!success) {
677 		bdevperf_complete(bdev_io, success, cb_arg);
678 		return;
679 	}
680 
681 	spdk_bdev_zcopy_end(bdev_io, false, bdevperf_complete, cb_arg);
682 }
683 
684 static int
685 bdevperf_generate_dif(struct bdevperf_task *task)
686 {
687 	struct bdevperf_job	*job = task->job;
688 	struct spdk_bdev	*bdev = job->bdev;
689 	struct spdk_dif_ctx	dif_ctx;
690 	int			rc;
691 
692 	rc = spdk_dif_ctx_init(&dif_ctx,
693 			       spdk_bdev_get_block_size(bdev),
694 			       spdk_bdev_get_md_size(bdev),
695 			       spdk_bdev_is_md_interleaved(bdev),
696 			       spdk_bdev_is_dif_head_of_md(bdev),
697 			       spdk_bdev_get_dif_type(bdev),
698 			       job->dif_check_flags,
699 			       task->offset_blocks, 0, 0, 0, 0);
700 	if (rc != 0) {
701 		fprintf(stderr, "Initialization of DIF context failed\n");
702 		return rc;
703 	}
704 
705 	if (spdk_bdev_is_md_interleaved(bdev)) {
706 		rc = spdk_dif_generate(&task->iov, 1, job->io_size_blocks, &dif_ctx);
707 	} else {
708 		struct iovec md_iov = {
709 			.iov_base	= task->md_buf,
710 			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
711 		};
712 
713 		rc = spdk_dix_generate(&task->iov, 1, &md_iov, job->io_size_blocks, &dif_ctx);
714 	}
715 
716 	if (rc != 0) {
717 		fprintf(stderr, "Generation of DIF/DIX failed\n");
718 	}
719 
720 	return rc;
721 }
722 
723 static void
724 bdevperf_submit_task(void *arg)
725 {
726 	struct bdevperf_task	*task = arg;
727 	struct bdevperf_job	*job = task->job;
728 	struct spdk_bdev_desc	*desc;
729 	struct spdk_io_channel	*ch;
730 	spdk_bdev_io_completion_cb cb_fn;
731 	uint64_t		offset_in_ios;
732 	int			rc = 0;
733 
734 	desc = job->bdev_desc;
735 	ch = job->ch;
736 
737 	switch (task->io_type) {
738 	case SPDK_BDEV_IO_TYPE_WRITE:
739 		if (spdk_bdev_get_md_size(job->bdev) != 0 && job->dif_check_flags != 0) {
740 			rc = bdevperf_generate_dif(task);
741 		}
742 		if (rc == 0) {
743 			cb_fn = (job->verify || job->reset) ? bdevperf_verify_write_complete : bdevperf_complete;
744 
745 			if (g_zcopy) {
746 				spdk_bdev_zcopy_end(task->bdev_io, true, cb_fn, task);
747 				return;
748 			} else {
749 				rc = spdk_bdev_writev_blocks_with_md(desc, ch, &task->iov, 1,
750 								     task->md_buf,
751 								     task->offset_blocks,
752 								     job->io_size_blocks,
753 								     cb_fn, task);
754 			}
755 		}
756 		break;
757 	case SPDK_BDEV_IO_TYPE_FLUSH:
758 		rc = spdk_bdev_flush_blocks(desc, ch, task->offset_blocks,
759 					    job->io_size_blocks, bdevperf_complete, task);
760 		break;
761 	case SPDK_BDEV_IO_TYPE_UNMAP:
762 		rc = spdk_bdev_unmap_blocks(desc, ch, task->offset_blocks,
763 					    job->io_size_blocks, bdevperf_complete, task);
764 		break;
765 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
766 		rc = spdk_bdev_write_zeroes_blocks(desc, ch, task->offset_blocks,
767 						   job->io_size_blocks, bdevperf_complete, task);
768 		break;
769 	case SPDK_BDEV_IO_TYPE_READ:
770 		if (g_zcopy) {
771 			rc = spdk_bdev_zcopy_start(desc, ch, NULL, 0, task->offset_blocks, job->io_size_blocks,
772 						   true, bdevperf_zcopy_populate_complete, task);
773 		} else {
774 			rc = spdk_bdev_read_blocks_with_md(desc, ch, task->buf, task->md_buf,
775 							   task->offset_blocks,
776 							   job->io_size_blocks,
777 							   bdevperf_complete, task);
778 		}
779 		break;
780 	case SPDK_BDEV_IO_TYPE_ABORT:
781 		rc = spdk_bdev_abort(desc, ch, task->task_to_abort, bdevperf_abort_complete, task);
782 		break;
783 	default:
784 		assert(false);
785 		rc = -EINVAL;
786 		break;
787 	}
788 
789 	if (rc == -ENOMEM) {
790 		bdevperf_queue_io_wait_with_cb(task, bdevperf_submit_task);
791 		return;
792 	} else if (rc != 0) {
793 		printf("Failed to submit bdev_io: %d\n", rc);
794 		if (job->verify) {
795 			assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
796 			offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;
797 
798 			assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
799 			spdk_bit_array_clear(job->outstanding, offset_in_ios);
800 		}
801 		bdevperf_job_drain(job);
802 		g_run_rc = rc;
803 		return;
804 	}
805 
806 	job->current_queue_depth++;
807 }
808 
809 static void
810 bdevperf_zcopy_get_buf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
811 {
812 	struct bdevperf_task	*task = cb_arg;
813 	struct bdevperf_job	*job = task->job;
814 	struct iovec		*iovs;
815 	int			iovcnt;
816 
817 	if (!success) {
818 		bdevperf_job_drain(job);
819 		g_run_rc = -1;
820 		return;
821 	}
822 
823 	task->bdev_io = bdev_io;
824 	task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
825 
826 	if (job->verify || job->reset) {
827 		/* When job->verify or job->reset is enabled, task->buf is used for
828 		 *  verification of read after write.  For write I/O, when zcopy APIs
829 		 *  are used, task->buf cannot be used, and data must be written to
830 		 *  the data buffer allocated underneath bdev layer instead.
831 		 *  Hence we copy task->buf to the allocated data buffer here.
832 		 */
833 		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
834 		assert(iovcnt == 1);
835 		assert(iovs != NULL);
836 
837 		copy_data(iovs[0].iov_base, iovs[0].iov_len, task->buf, job->buf_size,
838 			  spdk_bdev_get_block_size(job->bdev),
839 			  spdk_bdev_io_get_md_buf(bdev_io), task->md_buf,
840 			  spdk_bdev_get_md_size(job->bdev), job->io_size_blocks);
841 	}
842 
843 	bdevperf_submit_task(task);
844 }
845 
846 static void
847 bdevperf_prep_zcopy_write_task(void *arg)
848 {
849 	struct bdevperf_task	*task = arg;
850 	struct bdevperf_job	*job = task->job;
851 	int			rc;
852 
853 	rc = spdk_bdev_zcopy_start(job->bdev_desc, job->ch, NULL, 0,
854 				   task->offset_blocks, job->io_size_blocks,
855 				   false, bdevperf_zcopy_get_buf_complete, task);
856 	if (rc != 0) {
857 		assert(rc == -ENOMEM);
858 		bdevperf_queue_io_wait_with_cb(task, bdevperf_prep_zcopy_write_task);
859 		return;
860 	}
861 
862 	job->current_queue_depth++;
863 }
864 
865 static struct bdevperf_task *
866 bdevperf_job_get_task(struct bdevperf_job *job)
867 {
868 	struct bdevperf_task *task;
869 
870 	task = TAILQ_FIRST(&job->task_list);
871 	if (!task) {
872 		printf("Task allocation failed\n");
873 		abort();
874 	}
875 
876 	TAILQ_REMOVE(&job->task_list, task, link);
877 	return task;
878 }
879 
880 static void
881 bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task)
882 {
883 	uint64_t offset_in_ios;
884 
885 	if (job->zipf) {
886 		offset_in_ios = spdk_zipf_generate(job->zipf);
887 	} else if (job->is_random) {
888 		offset_in_ios = rand_r(&job->seed) % job->size_in_ios;
889 	} else {
890 		offset_in_ios = job->offset_in_ios++;
891 		if (job->offset_in_ios == job->size_in_ios) {
892 			job->offset_in_ios = 0;
893 		}
894 
895 		/* Increment of offset_in_ios if there's already an outstanding IO
896 		 * to that location. We only need this with job->verify as random
897 		 * offsets are not supported with job->verify at this time.
898 		 */
899 		if (job->verify) {
900 			assert(spdk_bit_array_find_first_clear(job->outstanding, 0) != UINT32_MAX);
901 
902 			while (spdk_bit_array_get(job->outstanding, offset_in_ios)) {
903 				offset_in_ios = job->offset_in_ios++;
904 				if (job->offset_in_ios == job->size_in_ios) {
905 					job->offset_in_ios = 0;
906 				}
907 			}
908 			spdk_bit_array_set(job->outstanding, offset_in_ios);
909 		}
910 	}
911 
912 	/* For multi-thread to same job, offset_in_ios is relative
913 	 * to the LBA range assigned for that job. job->offset_blocks
914 	 * is absolute (entire bdev LBA range).
915 	 */
916 	task->offset_blocks = (offset_in_ios + job->ios_base) * job->io_size_blocks;
917 
918 	if (job->verify || job->reset) {
919 		generate_data(task->buf, job->buf_size,
920 			      spdk_bdev_get_block_size(job->bdev),
921 			      task->md_buf, spdk_bdev_get_md_size(job->bdev),
922 			      job->io_size_blocks);
923 		if (g_zcopy) {
924 			bdevperf_prep_zcopy_write_task(task);
925 			return;
926 		} else {
927 			task->iov.iov_base = task->buf;
928 			task->iov.iov_len = job->buf_size;
929 			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
930 		}
931 	} else if (job->flush) {
932 		task->io_type = SPDK_BDEV_IO_TYPE_FLUSH;
933 	} else if (job->unmap) {
934 		task->io_type = SPDK_BDEV_IO_TYPE_UNMAP;
935 	} else if (job->write_zeroes) {
936 		task->io_type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
937 	} else if ((job->rw_percentage == 100) ||
938 		   (job->rw_percentage != 0 && ((rand_r(&job->seed) % 100) < job->rw_percentage))) {
939 		task->io_type = SPDK_BDEV_IO_TYPE_READ;
940 	} else {
941 		if (g_zcopy) {
942 			bdevperf_prep_zcopy_write_task(task);
943 			return;
944 		} else {
945 			task->iov.iov_base = task->buf;
946 			task->iov.iov_len = job->buf_size;
947 			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
948 		}
949 	}
950 
951 	bdevperf_submit_task(task);
952 }
953 
954 static int reset_job(void *arg);
955 
956 static void
957 reset_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
958 {
959 	struct bdevperf_task	*task = cb_arg;
960 	struct bdevperf_job	*job = task->job;
961 
962 	if (!success) {
963 		printf("Reset blockdev=%s failed\n", spdk_bdev_get_name(job->bdev));
964 		bdevperf_job_drain(job);
965 		g_run_rc = -1;
966 	}
967 
968 	TAILQ_INSERT_TAIL(&job->task_list, task, link);
969 	spdk_bdev_free_io(bdev_io);
970 
971 	job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
972 						10 * 1000000);
973 }
974 
975 static int
976 reset_job(void *arg)
977 {
978 	struct bdevperf_job *job = arg;
979 	struct bdevperf_task *task;
980 	int rc;
981 
982 	spdk_poller_unregister(&job->reset_timer);
983 
984 	/* Do reset. */
985 	task = bdevperf_job_get_task(job);
986 	rc = spdk_bdev_reset(job->bdev_desc, job->ch,
987 			     reset_cb, task);
988 	if (rc) {
989 		printf("Reset failed: %d\n", rc);
990 		bdevperf_job_drain(job);
991 		g_run_rc = -1;
992 	}
993 
994 	return -1;
995 }
996 
997 static void
998 bdevperf_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
999 {
1000 	struct bdevperf_job *job = cb_arg;
1001 	struct bdevperf_task *task;
1002 
1003 	job->io_timeout++;
1004 
1005 	if (job->is_draining || !job->abort ||
1006 	    !spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
1007 		return;
1008 	}
1009 
1010 	task = bdevperf_job_get_task(job);
1011 	if (task == NULL) {
1012 		return;
1013 	}
1014 
1015 	task->task_to_abort = spdk_bdev_io_get_cb_arg(bdev_io);
1016 	task->io_type = SPDK_BDEV_IO_TYPE_ABORT;
1017 
1018 	bdevperf_submit_task(task);
1019 }
1020 
1021 static void
1022 bdevperf_job_run(void *ctx)
1023 {
1024 	struct bdevperf_job *job = ctx;
1025 	struct bdevperf_task *task;
1026 	int i;
1027 
1028 	/* Submit initial I/O for this job. Each time one
1029 	 * completes, another will be submitted. */
1030 
1031 	/* Start a timer to stop this I/O chain when the run is over */
1032 	job->run_timer = SPDK_POLLER_REGISTER(bdevperf_job_drain, job, g_time_in_usec);
1033 	if (job->reset) {
1034 		job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
1035 							10 * 1000000);
1036 	}
1037 
1038 	spdk_bdev_set_timeout(job->bdev_desc, g_timeout_in_sec, bdevperf_timeout_cb, job);
1039 
1040 	for (i = 0; i < job->queue_depth; i++) {
1041 		task = bdevperf_job_get_task(job);
1042 		bdevperf_submit_single(job, task);
1043 	}
1044 }
1045 
1046 static void
1047 _performance_dump_done(void *ctx)
1048 {
1049 	struct bdevperf_aggregate_stats *stats = ctx;
1050 
1051 	printf("\r ==================================================================================\n");
1052 	printf("\r %-28s: %10s %10.2f %10.2f",
1053 	       "Total", "", stats->total_io_per_second, stats->total_mb_per_second);
1054 	printf(" %10.2f %8.2f\n",
1055 	       stats->total_failed_per_second, stats->total_timeout_per_second);
1056 	fflush(stdout);
1057 
1058 	g_performance_dump_active = false;
1059 
1060 	free(stats);
1061 }
1062 
1063 static void
1064 _performance_dump(void *ctx)
1065 {
1066 	struct bdevperf_aggregate_stats *stats = ctx;
1067 
1068 	performance_dump_job(stats, stats->current_job);
1069 
1070 	/* This assumes the jobs list is static after start up time.
1071 	 * That's true right now, but if that ever changed this would need a lock. */
1072 	stats->current_job = TAILQ_NEXT(stats->current_job, link);
1073 	if (stats->current_job == NULL) {
1074 		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
1075 	} else {
1076 		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
1077 	}
1078 }
1079 
1080 static int
1081 performance_statistics_thread(void *arg)
1082 {
1083 	struct bdevperf_aggregate_stats *stats;
1084 
1085 	if (g_performance_dump_active) {
1086 		return -1;
1087 	}
1088 
1089 	g_performance_dump_active = true;
1090 
1091 	stats = calloc(1, sizeof(*stats));
1092 	if (stats == NULL) {
1093 		return -1;
1094 	}
1095 
1096 	g_show_performance_period_num++;
1097 
1098 	stats->io_time_in_usec = g_show_performance_period_num * g_show_performance_period_in_usec;
1099 	stats->ema_period = g_show_performance_ema_period;
1100 
1101 	/* Iterate all of the jobs to gather stats
1102 	 * These jobs will not get removed here until a final performance dump is run,
1103 	 * so this should be safe without locking.
1104 	 */
1105 	stats->current_job = TAILQ_FIRST(&g_bdevperf.jobs);
1106 	if (stats->current_job == NULL) {
1107 		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
1108 	} else {
1109 		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
1110 	}
1111 
1112 	return -1;
1113 }
1114 
1115 static void
1116 bdevperf_test(void)
1117 {
1118 	struct bdevperf_job *job;
1119 
1120 	printf("Running I/O for %" PRIu64 " seconds...\n", g_time_in_usec / 1000000);
1121 	fflush(stdout);
1122 
1123 	/* Start a timer to dump performance numbers */
1124 	g_start_tsc = spdk_get_ticks();
1125 	if (g_show_performance_real_time && !g_perf_timer) {
1126 		printf("\r %-*s: %10s %10s %10s %10s %8s\n",
1127 		       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s");
1128 
1129 		g_perf_timer = SPDK_POLLER_REGISTER(performance_statistics_thread, NULL,
1130 						    g_show_performance_period_in_usec);
1131 	}
1132 
1133 	/* Iterate jobs to start all I/O */
1134 	TAILQ_FOREACH(job, &g_bdevperf.jobs, link) {
1135 		g_bdevperf.running_jobs++;
1136 		spdk_thread_send_msg(job->thread, bdevperf_job_run, job);
1137 	}
1138 }
1139 
1140 static void
1141 bdevperf_bdev_removed(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
1142 {
1143 	struct bdevperf_job *job = event_ctx;
1144 
1145 	if (SPDK_BDEV_EVENT_REMOVE == type) {
1146 		bdevperf_job_drain(job);
1147 	}
1148 }
1149 
1150 static uint32_t g_construct_job_count = 0;
1151 
1152 static void
1153 _bdevperf_construct_job_done(void *ctx)
1154 {
1155 	if (--g_construct_job_count == 0) {
1156 
1157 		if (g_run_rc != 0) {
1158 			/* Something failed. */
1159 			bdevperf_test_done(NULL);
1160 			return;
1161 		}
1162 
1163 		/* Ready to run the test */
1164 		bdevperf_test();
1165 	} else if (g_run_rc != 0) {
1166 		/* Reset error as some jobs constructed right */
1167 		g_run_rc = 0;
1168 		if (g_continue_on_failure == false) {
1169 			g_error_to_exit = true;
1170 		}
1171 	}
1172 }
1173 
1174 /* Checkformat will not allow to use inlined type,
1175    this is a workaround */
1176 typedef struct spdk_thread *spdk_thread_t;
1177 
1178 static spdk_thread_t
1179 construct_job_thread(struct spdk_cpuset *cpumask, const char *tag)
1180 {
1181 	struct spdk_cpuset tmp;
1182 
1183 	/* This function runs on the main thread. */
1184 	assert(g_main_thread == spdk_get_thread());
1185 
1186 	/* Handle default mask */
1187 	if (spdk_cpuset_count(cpumask) == 0) {
1188 		cpumask = &g_all_cpuset;
1189 	}
1190 
1191 	/* Warn user that mask might need to be changed */
1192 	spdk_cpuset_copy(&tmp, cpumask);
1193 	spdk_cpuset_or(&tmp, &g_all_cpuset);
1194 	if (!spdk_cpuset_equal(&tmp, &g_all_cpuset)) {
1195 		fprintf(stderr, "cpumask for '%s' is too big\n", tag);
1196 	}
1197 
1198 	return spdk_thread_create(tag, cpumask);
1199 }
1200 
1201 static uint32_t
1202 _get_next_core(void)
1203 {
1204 	static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY;
1205 
1206 	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
1207 		current_core = spdk_env_get_first_core();
1208 		return current_core;
1209 	}
1210 
1211 	current_core = spdk_env_get_next_core(current_core);
1212 	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
1213 		current_core = spdk_env_get_first_core();
1214 	}
1215 
1216 	return current_core;
1217 }
1218 
1219 static void
1220 _bdevperf_construct_job(void *ctx)
1221 {
1222 	struct bdevperf_job *job = ctx;
1223 	int rc;
1224 
1225 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(job->bdev), true, bdevperf_bdev_removed, job,
1226 				&job->bdev_desc);
1227 	if (rc != 0) {
1228 		SPDK_ERRLOG("Could not open leaf bdev %s, error=%d\n", spdk_bdev_get_name(job->bdev), rc);
1229 		g_run_rc = -EINVAL;
1230 		goto end;
1231 	}
1232 
1233 	if (g_zcopy) {
1234 		if (!spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
1235 			printf("Test requires ZCOPY but bdev module does not support ZCOPY\n");
1236 			g_run_rc = -ENOTSUP;
1237 			goto end;
1238 		}
1239 	}
1240 
1241 	job->ch = spdk_bdev_get_io_channel(job->bdev_desc);
1242 	if (!job->ch) {
1243 		SPDK_ERRLOG("Could not get io_channel for device %s, error=%d\n", spdk_bdev_get_name(job->bdev),
1244 			    rc);
1245 		spdk_bdev_close(job->bdev_desc);
1246 		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
1247 		g_run_rc = -ENOMEM;
1248 		goto end;
1249 	}
1250 
1251 end:
1252 	spdk_thread_send_msg(g_main_thread, _bdevperf_construct_job_done, NULL);
1253 }
1254 
1255 static void
1256 job_init_rw(struct bdevperf_job *job, enum job_config_rw rw)
1257 {
1258 	switch (rw) {
1259 	case JOB_CONFIG_RW_READ:
1260 		job->rw_percentage = 100;
1261 		break;
1262 	case JOB_CONFIG_RW_WRITE:
1263 		job->rw_percentage = 0;
1264 		break;
1265 	case JOB_CONFIG_RW_RANDREAD:
1266 		job->is_random = true;
1267 		job->rw_percentage = 100;
1268 		job->seed = rand();
1269 		break;
1270 	case JOB_CONFIG_RW_RANDWRITE:
1271 		job->is_random = true;
1272 		job->rw_percentage = 0;
1273 		job->seed = rand();
1274 		break;
1275 	case JOB_CONFIG_RW_RW:
1276 		job->is_random = false;
1277 		break;
1278 	case JOB_CONFIG_RW_RANDRW:
1279 		job->is_random = true;
1280 		job->seed = rand();
1281 		break;
1282 	case JOB_CONFIG_RW_VERIFY:
1283 		job->verify = true;
1284 		job->rw_percentage = 50;
1285 		break;
1286 	case JOB_CONFIG_RW_RESET:
1287 		job->reset = true;
1288 		job->verify = true;
1289 		job->rw_percentage = 50;
1290 		break;
1291 	case JOB_CONFIG_RW_UNMAP:
1292 		job->unmap = true;
1293 		break;
1294 	case JOB_CONFIG_RW_FLUSH:
1295 		job->flush = true;
1296 		break;
1297 	case JOB_CONFIG_RW_WRITE_ZEROES:
1298 		job->write_zeroes = true;
1299 		break;
1300 	}
1301 }
1302 
1303 static int
1304 bdevperf_construct_job(struct spdk_bdev *bdev, struct job_config *config,
1305 		       struct spdk_thread *thread)
1306 {
1307 	struct bdevperf_job *job;
1308 	struct bdevperf_task *task;
1309 	int block_size, data_block_size;
1310 	int rc;
1311 	int task_num, n;
1312 
1313 	block_size = spdk_bdev_get_block_size(bdev);
1314 	data_block_size = spdk_bdev_get_data_block_size(bdev);
1315 
1316 	job = calloc(1, sizeof(struct bdevperf_job));
1317 	if (!job) {
1318 		fprintf(stderr, "Unable to allocate memory for new job.\n");
1319 		return -ENOMEM;
1320 	}
1321 
1322 	job->name = strdup(spdk_bdev_get_name(bdev));
1323 	if (!job->name) {
1324 		fprintf(stderr, "Unable to allocate memory for job name.\n");
1325 		free(job);
1326 		return -ENOMEM;
1327 	}
1328 
1329 	job->workload_type = g_workload_type;
1330 	job->io_size = config->bs;
1331 	job->rw_percentage = config->rwmixread;
1332 	job->continue_on_failure = g_continue_on_failure;
1333 	job->queue_depth = config->iodepth;
1334 	job->bdev = bdev;
1335 	job->io_size_blocks = job->io_size / data_block_size;
1336 	job->buf_size = job->io_size_blocks * block_size;
1337 	job->abort = g_abort;
1338 	job_init_rw(job, config->rw);
1339 
1340 	if ((job->io_size % data_block_size) != 0) {
1341 		SPDK_ERRLOG("IO size (%d) is not multiples of data block size of bdev %s (%"PRIu32")\n",
1342 			    job->io_size, spdk_bdev_get_name(bdev), data_block_size);
1343 		free(job->name);
1344 		free(job);
1345 		return -ENOTSUP;
1346 	}
1347 
1348 	if (job->unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1349 		printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev));
1350 		free(job->name);
1351 		free(job);
1352 		return -ENOTSUP;
1353 	}
1354 
1355 	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) {
1356 		job->dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK;
1357 	}
1358 	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
1359 		job->dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK;
1360 	}
1361 
1362 	job->offset_in_ios = 0;
1363 
1364 	if (config->length != 0) {
1365 		/* Use subset of disk */
1366 		job->size_in_ios = config->length / job->io_size_blocks;
1367 		job->ios_base = config->offset / job->io_size_blocks;
1368 	} else {
1369 		/* Use whole disk */
1370 		job->size_in_ios = spdk_bdev_get_num_blocks(bdev) / job->io_size_blocks;
1371 		job->ios_base = 0;
1372 	}
1373 
1374 	if (job->is_random && g_zipf_theta > 0) {
1375 		job->zipf = spdk_zipf_create(job->size_in_ios, g_zipf_theta, 0);
1376 	}
1377 
1378 	if (job->verify) {
1379 		job->outstanding = spdk_bit_array_create(job->size_in_ios);
1380 		if (job->outstanding == NULL) {
1381 			SPDK_ERRLOG("Could not create outstanding array bitmap for bdev %s\n",
1382 				    spdk_bdev_get_name(bdev));
1383 			free(job->name);
1384 			free(job);
1385 			return -ENOMEM;
1386 		}
1387 	}
1388 
1389 	TAILQ_INIT(&job->task_list);
1390 
1391 	task_num = job->queue_depth;
1392 	if (job->reset) {
1393 		task_num += 1;
1394 	}
1395 	if (job->abort) {
1396 		task_num += job->queue_depth;
1397 	}
1398 
1399 	TAILQ_INSERT_TAIL(&g_bdevperf.jobs, job, link);
1400 
1401 	for (n = 0; n < task_num; n++) {
1402 		task = calloc(1, sizeof(struct bdevperf_task));
1403 		if (!task) {
1404 			fprintf(stderr, "Failed to allocate task from memory\n");
1405 			return -ENOMEM;
1406 		}
1407 
1408 		task->buf = spdk_zmalloc(job->buf_size, spdk_bdev_get_buf_align(job->bdev), NULL,
1409 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1410 		if (!task->buf) {
1411 			fprintf(stderr, "Cannot allocate buf for task=%p\n", task);
1412 			free(task);
1413 			return -ENOMEM;
1414 		}
1415 
1416 		if (spdk_bdev_is_md_separate(job->bdev)) {
1417 			task->md_buf = spdk_zmalloc(job->io_size_blocks *
1418 						    spdk_bdev_get_md_size(job->bdev), 0, NULL,
1419 						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1420 			if (!task->md_buf) {
1421 				fprintf(stderr, "Cannot allocate md buf for task=%p\n", task);
1422 				spdk_free(task->buf);
1423 				free(task);
1424 				return -ENOMEM;
1425 			}
1426 		}
1427 
1428 		task->job = job;
1429 		TAILQ_INSERT_TAIL(&job->task_list, task, link);
1430 	}
1431 
1432 	job->thread = thread;
1433 
1434 	g_construct_job_count++;
1435 
1436 	rc = spdk_thread_send_msg(thread, _bdevperf_construct_job, job);
1437 	assert(rc == 0);
1438 
1439 	return rc;
1440 }
1441 
1442 static int
1443 parse_rw(const char *str, enum job_config_rw ret)
1444 {
1445 	if (str == NULL) {
1446 		return ret;
1447 	}
1448 
1449 	if (!strcmp(str, "read")) {
1450 		ret = JOB_CONFIG_RW_READ;
1451 	} else if (!strcmp(str, "randread")) {
1452 		ret = JOB_CONFIG_RW_RANDREAD;
1453 	} else if (!strcmp(str, "write")) {
1454 		ret = JOB_CONFIG_RW_WRITE;
1455 	} else if (!strcmp(str, "randwrite")) {
1456 		ret = JOB_CONFIG_RW_RANDWRITE;
1457 	} else if (!strcmp(str, "verify")) {
1458 		ret = JOB_CONFIG_RW_VERIFY;
1459 	} else if (!strcmp(str, "reset")) {
1460 		ret = JOB_CONFIG_RW_RESET;
1461 	} else if (!strcmp(str, "unmap")) {
1462 		ret = JOB_CONFIG_RW_UNMAP;
1463 	} else if (!strcmp(str, "write_zeroes")) {
1464 		ret = JOB_CONFIG_RW_WRITE_ZEROES;
1465 	} else if (!strcmp(str, "flush")) {
1466 		ret = JOB_CONFIG_RW_FLUSH;
1467 	} else if (!strcmp(str, "rw")) {
1468 		ret = JOB_CONFIG_RW_RW;
1469 	} else if (!strcmp(str, "randrw")) {
1470 		ret = JOB_CONFIG_RW_RANDRW;
1471 	} else {
1472 		fprintf(stderr, "rw must be one of\n"
1473 			"(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush)\n");
1474 		ret = BDEVPERF_CONFIG_ERROR;
1475 	}
1476 
1477 	return ret;
1478 }
1479 
1480 static const char *
1481 config_filename_next(const char *filename, char *out)
1482 {
1483 	int i, k;
1484 
1485 	if (filename == NULL) {
1486 		out[0] = '\0';
1487 		return NULL;
1488 	}
1489 
1490 	if (filename[0] == ':') {
1491 		filename++;
1492 	}
1493 
1494 	for (i = 0, k = 0;
1495 	     filename[i] != '\0' &&
1496 	     filename[i] != ':' &&
1497 	     i < BDEVPERF_CONFIG_MAX_FILENAME;
1498 	     i++) {
1499 		if (filename[i] == ' ' || filename[i] == '\t') {
1500 			continue;
1501 		}
1502 
1503 		out[k++] = filename[i];
1504 	}
1505 	out[k] = 0;
1506 
1507 	return filename + i;
1508 }
1509 
1510 static void
1511 bdevperf_construct_jobs(void)
1512 {
1513 	char filename[BDEVPERF_CONFIG_MAX_FILENAME];
1514 	struct spdk_thread *thread;
1515 	struct job_config *config;
1516 	struct spdk_bdev *bdev;
1517 	const char *filenames;
1518 	int rc;
1519 
1520 	TAILQ_FOREACH(config, &job_config_list, link) {
1521 		filenames = config->filename;
1522 
1523 		thread = construct_job_thread(&config->cpumask, config->name);
1524 		assert(thread);
1525 
1526 		while (filenames) {
1527 			filenames = config_filename_next(filenames, filename);
1528 			if (strlen(filename) == 0) {
1529 				break;
1530 			}
1531 
1532 			bdev = spdk_bdev_get_by_name(filename);
1533 			if (!bdev) {
1534 				fprintf(stderr, "Unable to find bdev '%s'\n", filename);
1535 				g_run_rc = -EINVAL;
1536 				return;
1537 			}
1538 
1539 			rc = bdevperf_construct_job(bdev, config, thread);
1540 			if (rc < 0) {
1541 				g_run_rc = rc;
1542 				return;
1543 			}
1544 		}
1545 	}
1546 }
1547 
1548 static int
1549 make_cli_job_config(const char *filename, int64_t offset, uint64_t range)
1550 {
1551 	struct job_config *config = calloc(1, sizeof(*config));
1552 
1553 	if (config == NULL) {
1554 		fprintf(stderr, "Unable to allocate memory for job config\n");
1555 		return -ENOMEM;
1556 	}
1557 
1558 	config->name = filename;
1559 	config->filename = filename;
1560 	spdk_cpuset_zero(&config->cpumask);
1561 	spdk_cpuset_set_cpu(&config->cpumask, _get_next_core(), true);
1562 	config->bs = g_io_size;
1563 	config->iodepth = g_queue_depth;
1564 	config->rwmixread = g_rw_percentage;
1565 	config->offset = offset;
1566 	config->length = range;
1567 	config->rw = parse_rw(g_workload_type, BDEVPERF_CONFIG_ERROR);
1568 	if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
1569 		return -EINVAL;
1570 	}
1571 
1572 	TAILQ_INSERT_TAIL(&job_config_list, config, link);
1573 	return 0;
1574 }
1575 
1576 static void
1577 bdevperf_construct_multithread_job_configs(void)
1578 {
1579 	struct spdk_bdev *bdev;
1580 	uint32_t i;
1581 	uint32_t num_cores;
1582 	uint64_t blocks_per_job;
1583 	int64_t offset;
1584 
1585 	num_cores = 0;
1586 	SPDK_ENV_FOREACH_CORE(i) {
1587 		num_cores++;
1588 	}
1589 
1590 	if (num_cores == 0) {
1591 		g_run_rc = -EINVAL;
1592 		return;
1593 	}
1594 
1595 	if (g_job_bdev_name != NULL) {
1596 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
1597 		if (!bdev) {
1598 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
1599 			return;
1600 		}
1601 
1602 		blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores;
1603 		offset = 0;
1604 
1605 		SPDK_ENV_FOREACH_CORE(i) {
1606 			g_run_rc = make_cli_job_config(g_job_bdev_name, offset, blocks_per_job);
1607 			if (g_run_rc) {
1608 				return;
1609 			}
1610 
1611 			offset += blocks_per_job;
1612 		}
1613 	} else {
1614 		bdev = spdk_bdev_first_leaf();
1615 		while (bdev != NULL) {
1616 			blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores;
1617 			offset = 0;
1618 
1619 			SPDK_ENV_FOREACH_CORE(i) {
1620 				g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev),
1621 							       offset, blocks_per_job);
1622 				if (g_run_rc) {
1623 					return;
1624 				}
1625 
1626 				offset += blocks_per_job;
1627 			}
1628 
1629 			bdev = spdk_bdev_next_leaf(bdev);
1630 		}
1631 	}
1632 }
1633 
1634 static void
1635 bdevperf_construct_job_configs(void)
1636 {
1637 	struct spdk_bdev *bdev;
1638 
1639 	/* There are three different modes for allocating jobs. Standard mode
1640 	 * (the default) creates one spdk_thread per bdev and runs the I/O job there.
1641 	 *
1642 	 * The -C flag places bdevperf into "multithread" mode, meaning it creates
1643 	 * one spdk_thread per bdev PER CORE, and runs a copy of the job on each.
1644 	 * This runs multiple threads per bdev, effectively.
1645 	 *
1646 	 * The -j flag implies "FIO" mode which tries to mimic semantic of FIO jobs.
1647 	 * In "FIO" mode, threads are spawned per-job instead of per-bdev.
1648 	 * Each FIO job can be individually parameterized by filename, cpu mask, etc,
1649 	 * which is different from other modes in that they only support global options.
1650 	 */
1651 
1652 	if (g_bdevperf_conf) {
1653 		goto end;
1654 	} else if (g_multithread_mode) {
1655 		bdevperf_construct_multithread_job_configs();
1656 		goto end;
1657 	}
1658 
1659 	if (g_job_bdev_name != NULL) {
1660 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
1661 		if (bdev) {
1662 			/* Construct the job */
1663 			g_run_rc = make_cli_job_config(g_job_bdev_name, 0, 0);
1664 		} else {
1665 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
1666 		}
1667 	} else {
1668 		bdev = spdk_bdev_first_leaf();
1669 
1670 		while (bdev != NULL) {
1671 			/* Construct the job */
1672 			g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev), 0, 0);
1673 			if (g_run_rc) {
1674 				break;
1675 			}
1676 
1677 			bdev = spdk_bdev_next_leaf(bdev);
1678 		}
1679 	}
1680 
1681 end:
1682 	/* Increment initial construct_jobs count so that it will never reach 0 in the middle
1683 	 * of iteration.
1684 	 */
1685 	g_construct_job_count = 1;
1686 
1687 	if (g_run_rc == 0) {
1688 		bdevperf_construct_jobs();
1689 	}
1690 
1691 	_bdevperf_construct_job_done(NULL);
1692 }
1693 
1694 static int
1695 parse_uint_option(struct spdk_conf_section *s, const char *name, int def)
1696 {
1697 	const char *job_name;
1698 	int tmp;
1699 
1700 	tmp = spdk_conf_section_get_intval(s, name);
1701 	if (tmp == -1) {
1702 		/* Field was not found. Check default value
1703 		 * In [global] section it is ok to have undefined values
1704 		 * but for other sections it is not ok */
1705 		if (def == BDEVPERF_CONFIG_UNDEFINED) {
1706 			job_name = spdk_conf_section_get_name(s);
1707 			if (strcmp(job_name, "global") == 0) {
1708 				return def;
1709 			}
1710 
1711 			fprintf(stderr,
1712 				"Job '%s' has no '%s' assigned\n",
1713 				job_name, name);
1714 			return BDEVPERF_CONFIG_ERROR;
1715 		}
1716 		return def;
1717 	}
1718 
1719 	/* NOTE: get_intval returns nonnegative on success */
1720 	if (tmp < 0) {
1721 		fprintf(stderr, "Job '%s' has bad '%s' value.\n",
1722 			spdk_conf_section_get_name(s), name);
1723 		return BDEVPERF_CONFIG_ERROR;
1724 	}
1725 
1726 	return tmp;
1727 }
1728 
1729 /* CLI arguments override parameters for global sections */
1730 static void
1731 config_set_cli_args(struct job_config *config)
1732 {
1733 	if (g_job_bdev_name) {
1734 		config->filename = g_job_bdev_name;
1735 	}
1736 	if (g_io_size > 0) {
1737 		config->bs = g_io_size;
1738 	}
1739 	if (g_queue_depth > 0) {
1740 		config->iodepth = g_queue_depth;
1741 	}
1742 	if (g_rw_percentage > 0) {
1743 		config->rwmixread = g_rw_percentage;
1744 	}
1745 	if (g_workload_type) {
1746 		config->rw = parse_rw(g_workload_type, config->rw);
1747 	}
1748 }
1749 
1750 static int
1751 read_job_config(void)
1752 {
1753 	struct job_config global_default_config;
1754 	struct job_config global_config;
1755 	struct spdk_conf_section *s;
1756 	struct job_config *config;
1757 	const char *cpumask;
1758 	const char *rw;
1759 	bool is_global;
1760 	int n = 0;
1761 	int val;
1762 
1763 	if (g_bdevperf_conf_file == NULL) {
1764 		return 0;
1765 	}
1766 
1767 	g_bdevperf_conf = spdk_conf_allocate();
1768 	if (g_bdevperf_conf == NULL) {
1769 		fprintf(stderr, "Could not allocate job config structure\n");
1770 		return 1;
1771 	}
1772 
1773 	spdk_conf_disable_sections_merge(g_bdevperf_conf);
1774 	if (spdk_conf_read(g_bdevperf_conf, g_bdevperf_conf_file)) {
1775 		fprintf(stderr, "Invalid job config");
1776 		return 1;
1777 	}
1778 
1779 	/* Initialize global defaults */
1780 	global_default_config.filename = NULL;
1781 	/* Zero mask is the same as g_all_cpuset
1782 	 * The g_all_cpuset is not initialized yet,
1783 	 * so use zero mask as the default instead */
1784 	spdk_cpuset_zero(&global_default_config.cpumask);
1785 	global_default_config.bs = BDEVPERF_CONFIG_UNDEFINED;
1786 	global_default_config.iodepth = BDEVPERF_CONFIG_UNDEFINED;
1787 	/* bdevperf has no default for -M option but in FIO the default is 50 */
1788 	global_default_config.rwmixread = 50;
1789 	global_default_config.offset = 0;
1790 	/* length 0 means 100% */
1791 	global_default_config.length = 0;
1792 	global_default_config.rw = BDEVPERF_CONFIG_UNDEFINED;
1793 	config_set_cli_args(&global_default_config);
1794 
1795 	if ((int)global_default_config.rw == BDEVPERF_CONFIG_ERROR) {
1796 		return 1;
1797 	}
1798 
1799 	/* There is only a single instance of global job_config
1800 	 * We just reset its value when we encounter new [global] section */
1801 	global_config = global_default_config;
1802 
1803 	for (s = spdk_conf_first_section(g_bdevperf_conf);
1804 	     s != NULL;
1805 	     s = spdk_conf_next_section(s)) {
1806 		config = calloc(1, sizeof(*config));
1807 		if (config == NULL) {
1808 			fprintf(stderr, "Unable to allocate memory for job config\n");
1809 			return 1;
1810 		}
1811 
1812 		config->name = spdk_conf_section_get_name(s);
1813 		is_global = strcmp(config->name, "global") == 0;
1814 
1815 		if (is_global) {
1816 			global_config = global_default_config;
1817 		}
1818 
1819 		config->filename = spdk_conf_section_get_val(s, "filename");
1820 		if (config->filename == NULL) {
1821 			config->filename = global_config.filename;
1822 		}
1823 		if (!is_global) {
1824 			if (config->filename == NULL) {
1825 				fprintf(stderr, "Job '%s' expects 'filename' parameter\n", config->name);
1826 				goto error;
1827 			} else if (strnlen(config->filename, BDEVPERF_CONFIG_MAX_FILENAME)
1828 				   >= BDEVPERF_CONFIG_MAX_FILENAME) {
1829 				fprintf(stderr,
1830 					"filename for '%s' job is too long. Max length is %d\n",
1831 					config->name, BDEVPERF_CONFIG_MAX_FILENAME);
1832 				goto error;
1833 			}
1834 		}
1835 
1836 		cpumask = spdk_conf_section_get_val(s, "cpumask");
1837 		if (cpumask == NULL) {
1838 			config->cpumask = global_config.cpumask;
1839 		} else if (spdk_cpuset_parse(&config->cpumask, cpumask)) {
1840 			fprintf(stderr, "Job '%s' has bad 'cpumask' value\n", config->name);
1841 			goto error;
1842 		}
1843 
1844 		config->bs = parse_uint_option(s, "bs", global_config.bs);
1845 		if (config->bs == BDEVPERF_CONFIG_ERROR) {
1846 			goto error;
1847 		} else if (config->bs == 0) {
1848 			fprintf(stderr, "'bs' of job '%s' must be greater than 0\n", config->name);
1849 			goto error;
1850 		}
1851 
1852 		config->iodepth = parse_uint_option(s, "iodepth", global_config.iodepth);
1853 		if (config->iodepth == BDEVPERF_CONFIG_ERROR) {
1854 			goto error;
1855 		} else if (config->iodepth == 0) {
1856 			fprintf(stderr,
1857 				"'iodepth' of job '%s' must be greater than 0\n",
1858 				config->name);
1859 			goto error;
1860 		}
1861 
1862 		config->rwmixread = parse_uint_option(s, "rwmixread", global_config.rwmixread);
1863 		if (config->rwmixread == BDEVPERF_CONFIG_ERROR) {
1864 			goto error;
1865 		} else if (config->rwmixread > 100) {
1866 			fprintf(stderr,
1867 				"'rwmixread' value of '%s' job is not in 0-100 range\n",
1868 				config->name);
1869 			goto error;
1870 		}
1871 
1872 		config->offset = parse_uint_option(s, "offset", global_config.offset);
1873 		if (config->offset == BDEVPERF_CONFIG_ERROR) {
1874 			goto error;
1875 		}
1876 
1877 		val = parse_uint_option(s, "length", global_config.length);
1878 		if (val == BDEVPERF_CONFIG_ERROR) {
1879 			goto error;
1880 		}
1881 		config->length = val;
1882 
1883 		rw = spdk_conf_section_get_val(s, "rw");
1884 		config->rw = parse_rw(rw, global_config.rw);
1885 		if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
1886 			fprintf(stderr, "Job '%s' has bad 'rw' value\n", config->name);
1887 			goto error;
1888 		} else if (!is_global && (int)config->rw == BDEVPERF_CONFIG_UNDEFINED) {
1889 			fprintf(stderr, "Job '%s' has no 'rw' assigned\n", config->name);
1890 			goto error;
1891 		}
1892 
1893 		if (is_global) {
1894 			config_set_cli_args(config);
1895 			global_config = *config;
1896 			free(config);
1897 		} else {
1898 			TAILQ_INSERT_TAIL(&job_config_list, config, link);
1899 			n++;
1900 		}
1901 	}
1902 
1903 	printf("Using job config with %d jobs\n", n);
1904 	return 0;
1905 error:
1906 	free(config);
1907 	return 1;
1908 }
1909 
1910 static void
1911 bdevperf_run(void *arg1)
1912 {
1913 	uint32_t i;
1914 
1915 	g_main_thread = spdk_get_thread();
1916 
1917 	spdk_cpuset_zero(&g_all_cpuset);
1918 	SPDK_ENV_FOREACH_CORE(i) {
1919 		spdk_cpuset_set_cpu(&g_all_cpuset, i, true);
1920 	}
1921 
1922 	if (g_wait_for_tests) {
1923 		/* Do not perform any tests until RPC is received */
1924 		return;
1925 	}
1926 
1927 	bdevperf_construct_job_configs();
1928 }
1929 
1930 static void
1931 rpc_perform_tests_cb(void)
1932 {
1933 	struct spdk_json_write_ctx *w;
1934 	struct spdk_jsonrpc_request *request = g_request;
1935 
1936 	g_request = NULL;
1937 
1938 	if (g_run_rc == 0) {
1939 		w = spdk_jsonrpc_begin_result(request);
1940 		spdk_json_write_uint32(w, g_run_rc);
1941 		spdk_jsonrpc_end_result(request, w);
1942 	} else {
1943 		spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
1944 						     "bdevperf failed with error %s", spdk_strerror(-g_run_rc));
1945 	}
1946 
1947 	/* Reset g_run_rc to 0 for the next test run. */
1948 	g_run_rc = 0;
1949 
1950 	/* Reset g_stats to 0 for the next test run. */
1951 	memset(&g_stats, 0, sizeof(g_stats));
1952 }
1953 
1954 static void
1955 rpc_perform_tests(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
1956 {
1957 	if (params != NULL) {
1958 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
1959 						 "perform_tests method requires no parameters");
1960 		return;
1961 	}
1962 	if (g_request != NULL) {
1963 		fprintf(stderr, "Another test is already in progress.\n");
1964 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
1965 						 spdk_strerror(-EINPROGRESS));
1966 		return;
1967 	}
1968 	g_request = request;
1969 
1970 	/* Only construct job configs at the first test run.  */
1971 	if (TAILQ_EMPTY(&job_config_list)) {
1972 		bdevperf_construct_job_configs();
1973 	} else {
1974 		bdevperf_construct_jobs();
1975 	}
1976 }
1977 SPDK_RPC_REGISTER("perform_tests", rpc_perform_tests, SPDK_RPC_RUNTIME)
1978 
/*
 * Message-callback adapter: forwards the context to bdevperf_job_drain()
 * and drops whatever it returns, so it can be used with
 * spdk_thread_send_msg().
 *
 * ctx: the job to drain, passed through unchanged.
 */
static void
_bdevperf_job_drain(void *ctx)
{
	(void)bdevperf_job_drain(ctx);
}
1984 
1985 static void
1986 spdk_bdevperf_shutdown_cb(void)
1987 {
1988 	g_shutdown = true;
1989 	struct bdevperf_job *job, *tmp;
1990 
1991 	if (g_bdevperf.running_jobs == 0) {
1992 		bdevperf_test_done(NULL);
1993 		return;
1994 	}
1995 
1996 	/* Iterate jobs to stop all I/O */
1997 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, tmp) {
1998 		spdk_thread_send_msg(job->thread, _bdevperf_job_drain, job);
1999 	}
2000 }
2001 
2002 static int
2003 bdevperf_parse_arg(int ch, char *arg)
2004 {
2005 	long long tmp;
2006 
2007 	if (ch == 'w') {
2008 		g_workload_type = optarg;
2009 	} else if (ch == 'T') {
2010 		g_job_bdev_name = optarg;
2011 	} else if (ch == 'z') {
2012 		g_wait_for_tests = true;
2013 	} else if (ch == 'Z') {
2014 		g_zcopy = true;
2015 	} else if (ch == 'X') {
2016 		g_abort = true;
2017 	} else if (ch == 'C') {
2018 		g_multithread_mode = true;
2019 	} else if (ch == 'f') {
2020 		g_continue_on_failure = true;
2021 	} else if (ch == 'j') {
2022 		g_bdevperf_conf_file = optarg;
2023 	} else if (ch == 'F') {
2024 		char *endptr;
2025 
2026 		errno = 0;
2027 		g_zipf_theta = strtod(optarg, &endptr);
2028 		if (errno || optarg == endptr || g_zipf_theta < 0) {
2029 			fprintf(stderr, "Illegal zipf theta value %s\n", optarg);
2030 			return -EINVAL;
2031 		}
2032 	} else {
2033 		tmp = spdk_strtoll(optarg, 10);
2034 		if (tmp < 0) {
2035 			fprintf(stderr, "Parse failed for the option %c.\n", ch);
2036 			return tmp;
2037 		} else if (tmp >= INT_MAX) {
2038 			fprintf(stderr, "Parsed option was too large %c.\n", ch);
2039 			return -ERANGE;
2040 		}
2041 
2042 		switch (ch) {
2043 		case 'q':
2044 			g_queue_depth = tmp;
2045 			break;
2046 		case 'o':
2047 			g_io_size = tmp;
2048 			break;
2049 		case 't':
2050 			g_time_in_sec = tmp;
2051 			break;
2052 		case 'k':
2053 			g_timeout_in_sec = tmp;
2054 			break;
2055 		case 'M':
2056 			g_rw_percentage = tmp;
2057 			g_mix_specified = true;
2058 			break;
2059 		case 'P':
2060 			g_show_performance_ema_period = tmp;
2061 			break;
2062 		case 'S':
2063 			g_show_performance_real_time = 1;
2064 			g_show_performance_period_in_usec = tmp * 1000000;
2065 			break;
2066 		default:
2067 			return -EINVAL;
2068 		}
2069 	}
2070 	return 0;
2071 }
2072 
/*
 * Print the bdevperf-specific command-line options to stdout.
 *
 * The -w line lists write_zeroes, which the workload parser (parse_rw)
 * accepts but which was missing from the help text.
 */
static void
bdevperf_usage(void)
{
	printf(" -q <depth>                io depth\n");
	printf(" -o <size>                 io size in bytes\n");
	printf(" -w <type>                 io pattern type, must be one of (read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush, write_zeroes)\n");
	printf(" -t <time>                 time in seconds\n");
	printf(" -k <timeout>              timeout in seconds to detect starved I/O (default is 0 and disabled)\n");
	printf(" -M <percent>              rwmixread (100 for reads, 0 for writes)\n");
	printf(" -P <num>                  number of moving average period\n");
	printf("\t\t(If set to n, show weighted mean of the previous n IO/s in real time)\n");
	printf("\t\t(Formula: M = 2 / (n + 1), EMA[i+1] = IO/s * M + (1 - M) * EMA[i])\n");
	printf("\t\t(only valid with -S)\n");
	printf(" -S <period>               show performance result in real time every <period> seconds\n");
	printf(" -T <bdev>                 bdev to run against. Default: all available bdevs.\n");
	printf(" -f                        continue processing I/O even after failures\n");
	printf(" -F <zipf theta>           use zipf distribution for random I/O\n");
	printf(" -Z                        enable using zcopy bdev API for read or write I/O\n");
	printf(" -z                        start bdevperf, but wait for RPC to start tests\n");
	printf(" -X                        abort timed out I/O\n");
	printf(" -C                        enable every core to send I/Os to each bdev\n");
	printf(" -j <filename>             use job config file\n");
}
2096 
2097 static int
2098 verify_test_params(struct spdk_app_opts *opts)
2099 {
2100 	/* When RPC is used for starting tests and
2101 	 * no rpc_addr was configured for the app,
2102 	 * use the default address. */
2103 	if (g_wait_for_tests && opts->rpc_addr == NULL) {
2104 		opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR;
2105 	}
2106 
2107 	if (!g_bdevperf_conf_file && g_queue_depth <= 0) {
2108 		goto out;
2109 	}
2110 	if (!g_bdevperf_conf_file && g_io_size <= 0) {
2111 		goto out;
2112 	}
2113 	if (!g_bdevperf_conf_file && !g_workload_type) {
2114 		goto out;
2115 	}
2116 	if (g_time_in_sec <= 0) {
2117 		goto out;
2118 	}
2119 	g_time_in_usec = g_time_in_sec * 1000000LL;
2120 
2121 	if (g_timeout_in_sec < 0) {
2122 		goto out;
2123 	}
2124 
2125 	if (g_abort && !g_timeout_in_sec) {
2126 		printf("Timeout must be set for abort option, Ignoring g_abort\n");
2127 	}
2128 
2129 	if (g_show_performance_ema_period > 0 &&
2130 	    g_show_performance_real_time == 0) {
2131 		fprintf(stderr, "-P option must be specified with -S option\n");
2132 		return 1;
2133 	}
2134 
2135 	if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
2136 		printf("I/O size of %d is greater than zero copy threshold (%d).\n",
2137 		       g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE);
2138 		printf("Zero copy mechanism will not be used.\n");
2139 		g_zcopy = false;
2140 	}
2141 
2142 	if (g_bdevperf_conf_file) {
2143 		/* workload_type verification happens during config file parsing */
2144 		return 0;
2145 	}
2146 
2147 	if (!strcmp(g_workload_type, "verify") ||
2148 	    !strcmp(g_workload_type, "reset")) {
2149 		g_rw_percentage = 50;
2150 		if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
2151 			fprintf(stderr, "Unable to exceed max I/O size of %d for verify. (%d provided).\n",
2152 				SPDK_BDEV_LARGE_BUF_MAX_SIZE, g_io_size);
2153 			return 1;
2154 		}
2155 		g_verify = true;
2156 		if (!strcmp(g_workload_type, "reset")) {
2157 			g_reset = true;
2158 		}
2159 	}
2160 
2161 	if (!strcmp(g_workload_type, "read") ||
2162 	    !strcmp(g_workload_type, "randread") ||
2163 	    !strcmp(g_workload_type, "write") ||
2164 	    !strcmp(g_workload_type, "randwrite") ||
2165 	    !strcmp(g_workload_type, "verify") ||
2166 	    !strcmp(g_workload_type, "reset") ||
2167 	    !strcmp(g_workload_type, "unmap") ||
2168 	    !strcmp(g_workload_type, "write_zeroes") ||
2169 	    !strcmp(g_workload_type, "flush")) {
2170 		if (g_mix_specified) {
2171 			fprintf(stderr, "Ignoring -M option... Please use -M option"
2172 				" only when using rw or randrw.\n");
2173 		}
2174 	}
2175 
2176 	if (!strcmp(g_workload_type, "rw") ||
2177 	    !strcmp(g_workload_type, "randrw")) {
2178 		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
2179 			fprintf(stderr,
2180 				"-M must be specified to value from 0 to 100 "
2181 				"for rw or randrw.\n");
2182 			return 1;
2183 		}
2184 	}
2185 
2186 	return 0;
2187 out:
2188 	spdk_app_usage();
2189 	bdevperf_usage();
2190 	return 1;
2191 }
2192 
2193 int
2194 main(int argc, char **argv)
2195 {
2196 	struct spdk_app_opts opts = {};
2197 	int rc;
2198 
2199 	/* Use the runtime PID to set the random seed */
2200 	srand(getpid());
2201 
2202 	spdk_app_opts_init(&opts, sizeof(opts));
2203 	opts.name = "bdevperf";
2204 	opts.rpc_addr = NULL;
2205 	opts.shutdown_cb = spdk_bdevperf_shutdown_cb;
2206 
2207 	if ((rc = spdk_app_parse_args(argc, argv, &opts, "Zzfq:o:t:w:k:CF:M:P:S:T:Xj:", NULL,
2208 				      bdevperf_parse_arg, bdevperf_usage)) !=
2209 	    SPDK_APP_PARSE_ARGS_SUCCESS) {
2210 		return rc;
2211 	}
2212 
2213 	if (read_job_config()) {
2214 		free_job_config();
2215 		return 1;
2216 	}
2217 
2218 	if (verify_test_params(&opts) != 0) {
2219 		free_job_config();
2220 		exit(1);
2221 	}
2222 
2223 	rc = spdk_app_start(&opts, bdevperf_run, NULL);
2224 
2225 	spdk_app_fini();
2226 	free_job_config();
2227 	return rc;
2228 }
2229