xref: /spdk/examples/bdev/bdevperf/bdevperf.c (revision 60982c759db49b4f4579f16e3b24df0725ba4b94)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation.
3  *   Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES.
4  *   All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 #include "spdk/accel.h"
11 #include "spdk/endian.h"
12 #include "spdk/env.h"
13 #include "spdk/event.h"
14 #include "spdk/log.h"
15 #include "spdk/util.h"
16 #include "spdk/thread.h"
17 #include "spdk/string.h"
18 #include "spdk/rpc.h"
19 #include "spdk/bit_array.h"
20 #include "spdk/conf.h"
21 #include "spdk/zipf.h"
22 #include "spdk/histogram_data.h"
23 
24 #define BDEVPERF_CONFIG_MAX_FILENAME 1024
25 #define BDEVPERF_CONFIG_UNDEFINED -1
26 #define BDEVPERF_CONFIG_ERROR -2
27 
28 struct bdevperf_task {
29 	struct iovec			iov;
30 	struct bdevperf_job		*job;
31 	struct spdk_bdev_io		*bdev_io;
32 	void				*buf;
33 	void				*md_buf;
34 	uint64_t			offset_blocks;
35 	struct bdevperf_task		*task_to_abort;
36 	enum spdk_bdev_io_type		io_type;
37 	TAILQ_ENTRY(bdevperf_task)	link;
38 	struct spdk_bdev_io_wait_entry	bdev_io_wait;
39 };
40 
41 static const char *g_workload_type = NULL;
42 static int g_io_size = 0;
43 /* Initialize to an invalid value so we can detect whether the user overrides it. */
44 static int g_rw_percentage = -1;
45 static bool g_verify = false;
46 static bool g_reset = false;
47 static bool g_continue_on_failure = false;
48 static bool g_abort = false;
49 static bool g_error_to_exit = false;
50 static int g_queue_depth = 0;
51 static uint64_t g_time_in_usec;
52 static int g_show_performance_real_time = 0;
53 static uint64_t g_show_performance_period_in_usec = SPDK_SEC_TO_USEC;
54 static uint64_t g_show_performance_period_num = 0;
55 static uint64_t g_show_performance_ema_period = 0;
56 static int g_run_rc = 0;
57 static bool g_shutdown = false;
58 static uint64_t g_start_tsc;
59 static uint64_t g_shutdown_tsc;
60 static bool g_zcopy = false;
61 static struct spdk_thread *g_main_thread;
62 static int g_time_in_sec = 0;
63 static bool g_mix_specified = false;
64 static const char *g_job_bdev_name;
65 static bool g_wait_for_tests = false;
66 static struct spdk_jsonrpc_request *g_request = NULL;
67 static bool g_multithread_mode = false;
68 static int g_timeout_in_sec;
69 static struct spdk_conf *g_bdevperf_conf = NULL;
70 static const char *g_bdevperf_conf_file = NULL;
71 static double g_zipf_theta;
72 static bool g_random_map = false;
73 
74 static struct spdk_cpuset g_all_cpuset;
75 static struct spdk_poller *g_perf_timer = NULL;
76 
77 static void bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task);
78 static void rpc_perform_tests_cb(void);
79 
80 static uint32_t g_bdev_count = 0;
81 static uint32_t g_latency_display_level;
82 
83 static bool g_one_thread_per_lcore = false;
84 
85 static const double g_latency_cutoffs[] = {
86 	0.01,
87 	0.10,
88 	0.25,
89 	0.50,
90 	0.75,
91 	0.90,
92 	0.95,
93 	0.98,
94 	0.99,
95 	0.995,
96 	0.999,
97 	0.9999,
98 	0.99999,
99 	0.999999,
100 	0.9999999,
101 	-1,
102 };
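
/*
 * Editor's note (descriptive, derived from check_cutoff() below): each entry is a
 * cumulative fraction of completed I/Os (0.50 = median, 0.99 = 99th percentile),
 * and the trailing -1 is a sentinel that stops the walk, since the iteration
 * only advances while the current cutoff is positive.
 */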
103 
104 struct latency_info {
105 	uint64_t	min;
106 	uint64_t	max;
107 	uint64_t	total;
108 };
109 
110 struct bdevperf_job {
111 	char				*name;
112 	struct spdk_bdev		*bdev;
113 	struct spdk_bdev_desc		*bdev_desc;
114 	struct spdk_io_channel		*ch;
115 	TAILQ_ENTRY(bdevperf_job)	link;
116 	struct spdk_thread		*thread;
117 
118 	const char			*workload_type;
119 	int				io_size;
120 	int				rw_percentage;
121 	bool				is_random;
122 	bool				verify;
123 	bool				reset;
124 	bool				continue_on_failure;
125 	bool				unmap;
126 	bool				write_zeroes;
127 	bool				flush;
128 	bool				abort;
129 	int				queue_depth;
130 	unsigned int			seed;
131 
132 	uint64_t			io_completed;
133 	uint64_t			io_failed;
134 	uint64_t			io_timeout;
135 	uint64_t			prev_io_completed;
136 	double				ema_io_per_second;
137 	int				current_queue_depth;
138 	uint64_t			size_in_ios;
139 	uint64_t			ios_base;
140 	uint64_t			offset_in_ios;
141 	uint64_t			io_size_blocks;
142 	uint64_t			buf_size;
143 	uint32_t			dif_check_flags;
144 	bool				is_draining;
145 	struct spdk_poller		*run_timer;
146 	struct spdk_poller		*reset_timer;
147 	struct spdk_bit_array		*outstanding;
148 	struct spdk_zipf		*zipf;
149 	TAILQ_HEAD(, bdevperf_task)	task_list;
150 	uint64_t			run_time_in_usec;
151 
152 	/* keep the channel's histogram data before the channel is destroyed */
153 	struct spdk_histogram_data	*histogram;
154 	struct spdk_bit_array		*random_map;
155 };
156 
157 struct spdk_bdevperf {
158 	TAILQ_HEAD(, bdevperf_job)	jobs;
159 	uint32_t			running_jobs;
160 };
161 
162 static struct spdk_bdevperf g_bdevperf = {
163 	.jobs = TAILQ_HEAD_INITIALIZER(g_bdevperf.jobs),
164 	.running_jobs = 0,
165 };
166 
167 enum job_config_rw {
168 	JOB_CONFIG_RW_READ = 0,
169 	JOB_CONFIG_RW_WRITE,
170 	JOB_CONFIG_RW_RANDREAD,
171 	JOB_CONFIG_RW_RANDWRITE,
172 	JOB_CONFIG_RW_RW,
173 	JOB_CONFIG_RW_RANDRW,
174 	JOB_CONFIG_RW_VERIFY,
175 	JOB_CONFIG_RW_RESET,
176 	JOB_CONFIG_RW_UNMAP,
177 	JOB_CONFIG_RW_FLUSH,
178 	JOB_CONFIG_RW_WRITE_ZEROES,
179 };
180 
181 /* Storing values from a section of job config file */
182 struct job_config {
183 	const char			*name;
184 	const char			*filename;
185 	struct spdk_cpuset		cpumask;
186 	int				bs;
187 	int				iodepth;
188 	int				rwmixread;
189 	uint32_t			lcore;
190 	int64_t				offset;
191 	uint64_t			length;
192 	enum job_config_rw		rw;
193 	TAILQ_ENTRY(job_config)	link;
194 };
195 
196 TAILQ_HEAD(, job_config) job_config_list
197 	= TAILQ_HEAD_INITIALIZER(job_config_list);
198 
199 static bool g_performance_dump_active = false;
200 
201 struct bdevperf_aggregate_stats {
202 	struct bdevperf_job		*current_job;
203 	uint64_t			io_time_in_usec;
204 	uint64_t			ema_period;
205 	double				total_io_per_second;
206 	double				total_mb_per_second;
207 	double				total_failed_per_second;
208 	double				total_timeout_per_second;
209 	double				min_latency;
210 	double				max_latency;
211 	uint64_t			total_io_completed;
212 	uint64_t			total_tsc;
213 };
214 
215 static struct bdevperf_aggregate_stats g_stats = {.min_latency = (double)UINT64_MAX};
216 
217 struct lcore_thread {
218 	struct spdk_thread		*thread;
219 	uint32_t			lcore;
220 	TAILQ_ENTRY(lcore_thread)	link;
221 };
222 
223 TAILQ_HEAD(, lcore_thread) g_lcore_thread_list
224 	= TAILQ_HEAD_INITIALIZER(g_lcore_thread_list);
225 
226 /*
227  * Cumulative Moving Average (CMA): the average of all data points up to the current one
228  * Exponential Moving Average (EMA): a weighted mean of the previous n data points, with more weight given to recent ones
229  * Simple Moving Average (SMA): an unweighted mean of the previous n data points
230  *
231  * Bdevperf supports CMA and EMA.
232  */
233 static double
234 get_cma_io_per_second(struct bdevperf_job *job, uint64_t io_time_in_usec)
235 {
236 	return (double)job->io_completed * SPDK_SEC_TO_USEC / io_time_in_usec;
237 }
238 
239 static double
240 get_ema_io_per_second(struct bdevperf_job *job, uint64_t ema_period)
241 {
242 	double io_completed, io_per_second;
243 
244 	io_completed = job->io_completed;
245 	io_per_second = (double)(io_completed - job->prev_io_completed) * SPDK_SEC_TO_USEC
246 			/ g_show_performance_period_in_usec;
247 	job->prev_io_completed = io_completed;
248 
249 	job->ema_io_per_second += (io_per_second - job->ema_io_per_second) * 2
250 				  / (ema_period + 1);
251 	return job->ema_io_per_second;
252 }
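
/*
 * Worked example (illustrative numbers, not from a real run): with
 * ema_period = 10 the smoothing factor is 2 / (10 + 1) ~= 0.18.  If the
 * previous EMA was 1000 IOPS and the latest period measured 1100 IOPS,
 * the update above yields
 *
 *     1000 + (1100 - 1000) * 2 / 11 ~= 1018.2 IOPS
 *
 * so one fast period nudges the average instead of replacing it, while the
 * CMA above weights every period equally over the whole run.
 */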
253 
254 static void
255 get_avg_latency(void *ctx, uint64_t start, uint64_t end, uint64_t count,
256 		uint64_t total, uint64_t so_far)
257 {
258 	struct latency_info *latency_info = ctx;
259 
260 	if (count == 0) {
261 		return;
262 	}
263 
264 	latency_info->total += (start + end) / 2 * count;
265 
266 	if (so_far == count) {
267 		latency_info->min = start;
268 	}
269 
270 	if (so_far == total) {
271 		latency_info->max = end;
272 	}
273 }
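
/*
 * Illustrative walk-through: for a histogram bucket spanning [100, 200) ticks
 * with count = 4, the callback above approximates those I/Os at the bucket
 * midpoint and adds (100 + 200) / 2 * 4 = 600 ticks to the total.  The first
 * non-empty bucket (so_far == count) supplies the minimum and the bucket that
 * completes the walk (so_far == total) supplies the maximum.
 */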
274 
275 static void
276 performance_dump_job(struct bdevperf_aggregate_stats *stats, struct bdevperf_job *job)
277 {
278 	double io_per_second, mb_per_second, failed_per_second, timeout_per_second;
279 	double average_latency = 0.0, min_latency, max_latency;
280 	uint64_t time_in_usec;
281 	uint64_t tsc_rate;
282 	uint64_t total_io;
283 	struct latency_info latency_info = {};
284 
285 	printf("\r Job: %s (Core Mask 0x%s)\n", job->name,
286 	       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));
287 
288 	if (job->io_failed > 0 && !job->reset && !job->continue_on_failure) {
289 		printf("\r Job: %s ended in about %.2f seconds with error\n",
290 		       job->name, (double)job->run_time_in_usec / SPDK_SEC_TO_USEC);
291 	}
292 	if (job->verify) {
293 		printf("\t Verification LBA range: start 0x%" PRIx64 " length 0x%" PRIx64 "\n",
294 		       job->ios_base, job->size_in_ios);
295 	}
296 
297 	if (g_performance_dump_active == true) {
298 		/* Use the job's actual run time if the job has already ended */
299 		if (job->io_failed > 0 && !job->continue_on_failure) {
300 			time_in_usec = job->run_time_in_usec;
301 		} else {
302 			time_in_usec = stats->io_time_in_usec;
303 		}
304 	} else {
305 		time_in_usec = job->run_time_in_usec;
306 	}
307 
308 	if (stats->ema_period == 0) {
309 		io_per_second = get_cma_io_per_second(job, time_in_usec);
310 	} else {
311 		io_per_second = get_ema_io_per_second(job, stats->ema_period);
312 	}
313 
314 	tsc_rate = spdk_get_ticks_hz();
315 	mb_per_second = io_per_second * job->io_size / (1024 * 1024);
316 
317 	spdk_histogram_data_iterate(job->histogram, get_avg_latency, &latency_info);
318 
319 	total_io = job->io_completed + job->io_failed;
320 	if (total_io != 0) {
321 		average_latency = (double)latency_info.total / total_io * SPDK_SEC_TO_USEC / tsc_rate;
322 	}
323 	min_latency = (double)latency_info.min * SPDK_SEC_TO_USEC / tsc_rate;
324 	max_latency = (double)latency_info.max * SPDK_SEC_TO_USEC / tsc_rate;
325 
326 	failed_per_second = (double)job->io_failed * SPDK_SEC_TO_USEC / time_in_usec;
327 	timeout_per_second = (double)job->io_timeout * SPDK_SEC_TO_USEC / time_in_usec;
328 
329 	printf("\t %-20s: %10.2f %10.2f %10.2f",
330 	       job->name, (float)time_in_usec / SPDK_SEC_TO_USEC, io_per_second, mb_per_second);
331 	printf(" %10.2f %8.2f",
332 	       failed_per_second, timeout_per_second);
333 	printf(" %10.2f %10.2f %10.2f\n",
334 	       average_latency, min_latency, max_latency);
335 
336 	stats->total_io_per_second += io_per_second;
337 	stats->total_mb_per_second += mb_per_second;
338 	stats->total_failed_per_second += failed_per_second;
339 	stats->total_timeout_per_second += timeout_per_second;
340 	stats->total_io_completed += job->io_completed + job->io_failed;
341 	stats->total_tsc += latency_info.total;
342 	if (min_latency < stats->min_latency) {
343 		stats->min_latency = min_latency;
344 	}
345 	if (max_latency > stats->max_latency) {
346 		stats->max_latency = max_latency;
347 	}
348 }
349 
350 static void
351 generate_data(void *buf, int buf_len, int block_size, void *md_buf, int md_size,
352 	      int num_blocks)
353 {
354 	int offset_blocks = 0, md_offset, data_block_size, inner_offset;
355 
356 	if (buf_len < num_blocks * block_size) {
357 		return;
358 	}
359 
360 	if (md_buf == NULL) {
361 		data_block_size = block_size - md_size;
362 		md_buf = (char *)buf + data_block_size;
363 		md_offset = block_size;
364 	} else {
365 		data_block_size = block_size;
366 		md_offset = md_size;
367 	}
368 
369 	while (offset_blocks < num_blocks) {
370 		inner_offset = 0;
371 		while (inner_offset < data_block_size) {
372 			*(uint32_t *)buf = offset_blocks + inner_offset;
373 			inner_offset += sizeof(uint32_t);
374 			buf += sizeof(uint32_t);
375 		}
376 		memset(md_buf, offset_blocks, md_size);
377 		md_buf += md_offset;
378 		offset_blocks++;
379 	}
380 }
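
/*
 * Illustrative layout (assuming an interleaved 4096 + 8 format, i.e.
 * md_buf == NULL, block_size = 4104, md_size = 8): each block receives
 * data_block_size = 4096 bytes of uint32_t values derived from the block
 * index and byte offset, followed by 8 metadata bytes memset() to the block
 * index, and md_offset = 4104 advances the metadata cursor one whole
 * extended block at a time.
 */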
381 
382 static bool
383 copy_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
384 	  void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks)
385 {
386 	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
387 		return false;
388 	}
389 
390 	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));
391 
392 	memcpy(wr_buf, rd_buf, block_size * num_blocks);
393 
394 	if (wr_md_buf != NULL) {
395 		memcpy(wr_md_buf, rd_md_buf, md_size * num_blocks);
396 	}
397 
398 	return true;
399 }
400 
401 static bool
402 verify_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
403 	    void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks, bool md_check)
404 {
405 	int offset_blocks = 0, md_offset, data_block_size;
406 
407 	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
408 		return false;
409 	}
410 
411 	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));
412 
413 	if (wr_md_buf == NULL) {
414 		data_block_size = block_size - md_size;
415 		wr_md_buf = (char *)wr_buf + data_block_size;
416 		rd_md_buf = (char *)rd_buf + data_block_size;
417 		md_offset = block_size;
418 	} else {
419 		data_block_size = block_size;
420 		md_offset = md_size;
421 	}
422 
423 	while (offset_blocks < num_blocks) {
424 		if (memcmp(wr_buf, rd_buf, data_block_size) != 0) {
425 			return false;
426 		}
427 
428 		wr_buf += block_size;
429 		rd_buf += block_size;
430 
431 		if (md_check) {
432 			if (memcmp(wr_md_buf, rd_md_buf, md_size) != 0) {
433 				return false;
434 			}
435 
436 			wr_md_buf += md_offset;
437 			rd_md_buf += md_offset;
438 		}
439 
440 		offset_blocks++;
441 	}
442 
443 	return true;
444 }
445 
446 static void
447 free_job_config(void)
448 {
449 	struct job_config *config, *tmp;
450 
451 	spdk_conf_free(g_bdevperf_conf);
452 	g_bdevperf_conf = NULL;
453 
454 	TAILQ_FOREACH_SAFE(config, &job_config_list, link, tmp) {
455 		TAILQ_REMOVE(&job_config_list, config, link);
456 		free(config);
457 	}
458 }
459 
460 static void
461 bdevperf_job_free(struct bdevperf_job *job)
462 {
463 	spdk_histogram_data_free(job->histogram);
464 	spdk_bit_array_free(&job->outstanding);
465 	spdk_bit_array_free(&job->random_map);
466 	spdk_zipf_free(&job->zipf);
467 	free(job->name);
468 	free(job);
469 }
470 
471 static void
472 job_thread_exit(void *ctx)
473 {
474 	spdk_thread_exit(spdk_get_thread());
475 }
476 
477 static void
478 check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
479 	     uint64_t total, uint64_t so_far)
480 {
481 	double so_far_pct;
482 	double **cutoff = ctx;
483 	uint64_t tsc_rate;
484 
485 	if (count == 0) {
486 		return;
487 	}
488 
489 	tsc_rate = spdk_get_ticks_hz();
490 	so_far_pct = (double)so_far / total;
491 	while (so_far_pct >= **cutoff && **cutoff > 0) {
492 		printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * SPDK_SEC_TO_USEC / tsc_rate);
493 		(*cutoff)++;
494 	}
495 }
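
/*
 * The double indirection above lets a single histogram pass consume the whole
 * cutoff table.  A sketch of the call site (mirroring its use in
 * bdevperf_test_done() below):
 *
 *     const double *cutoff = g_latency_cutoffs;
 *
 *     spdk_histogram_data_iterate(job->histogram, check_cutoff, &cutoff);
 *
 * Every bucket that crosses one or more cutoffs prints them and advances the
 * cursor, so each percentile is emitted exactly once.
 */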
496 
497 static void
498 print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
499 	     uint64_t total, uint64_t so_far)
500 {
501 	double so_far_pct;
502 	uint64_t tsc_rate;
503 
504 	if (count == 0) {
505 		return;
506 	}
507 
508 	tsc_rate = spdk_get_ticks_hz();
509 	so_far_pct = (double)so_far * 100 / total;
510 	printf("%9.3f - %9.3f: %9.4f%%  (%9ju)\n",
511 	       (double)start * SPDK_SEC_TO_USEC / tsc_rate,
512 	       (double)end * SPDK_SEC_TO_USEC / tsc_rate,
513 	       so_far_pct, count);
514 }
515 
516 static void
517 bdevperf_test_done(void *ctx)
518 {
519 	struct bdevperf_job *job, *jtmp;
520 	struct bdevperf_task *task, *ttmp;
521 	struct lcore_thread *lthread, *lttmp;
522 	double average_latency = 0.0;
523 	uint64_t time_in_usec;
524 	int rc;
525 
526 	if (g_time_in_usec) {
527 		g_stats.io_time_in_usec = g_time_in_usec;
528 
529 		if (!g_run_rc && g_performance_dump_active) {
530 			spdk_thread_send_msg(spdk_get_thread(), bdevperf_test_done, NULL);
531 			return;
532 		}
533 	}
534 
535 	if (g_show_performance_real_time) {
536 		spdk_poller_unregister(&g_perf_timer);
537 	}
538 
539 	if (g_shutdown) {
540 		g_shutdown_tsc = spdk_get_ticks() - g_start_tsc;
541 		time_in_usec = g_shutdown_tsc * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
542 		g_time_in_usec = (g_time_in_usec > time_in_usec) ? time_in_usec : g_time_in_usec;
543 		printf("Received shutdown signal, test time was about %.6f seconds\n",
544 		       (double)g_time_in_usec / SPDK_SEC_TO_USEC);
545 	}
546 
547 	printf("\n%*s\n", 107, "Latency(us)");
548 	printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n",
549 	       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max");
550 
551 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
552 		performance_dump_job(&g_stats, job);
553 	}
554 
555 	printf("\r =================================================================================="
556 	       "=================================\n");
557 	printf("\r %-28s: %10s %10.2f %10.2f",
558 	       "Total", "", g_stats.total_io_per_second, g_stats.total_mb_per_second);
559 	printf(" %10.2f %8.2f",
560 	       g_stats.total_failed_per_second, g_stats.total_timeout_per_second);
561 
562 	if (g_stats.total_io_completed != 0) {
563 		average_latency = ((double)g_stats.total_tsc / g_stats.total_io_completed) * SPDK_SEC_TO_USEC /
564 				  spdk_get_ticks_hz();
565 	}
566 	printf(" %10.2f %10.2f %10.2f\n", average_latency, g_stats.min_latency, g_stats.max_latency);
567 
568 	fflush(stdout);
569 
570 	if (g_latency_display_level == 0 || g_stats.total_io_completed == 0) {
571 		goto clean;
572 	}
573 
574 	printf("\n Latency summary\n");
575 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
576 		printf("\r =============================================\n");
577 		printf("\r Job: %s (Core Mask 0x%s)\n", job->name,
578 		       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));
579 
580 		const double *cutoff = g_latency_cutoffs;
581 
582 		spdk_histogram_data_iterate(job->histogram, check_cutoff, &cutoff);
583 
584 		printf("\n");
585 	}
586 
587 	if (g_latency_display_level == 1) {
588 		goto clean;
589 	}
590 
591 	printf("\r Latency histogram\n");
592 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
593 		printf("\r =============================================\n");
594 		printf("\r Job: %s (Core Mask 0x%s)\n", job->name,
595 		       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));
596 
597 		spdk_histogram_data_iterate(job->histogram, print_bucket, NULL);
598 		printf("\n");
599 	}
600 
601 clean:
602 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
603 		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
604 
605 		if (!g_one_thread_per_lcore) {
606 			spdk_thread_send_msg(job->thread, job_thread_exit, NULL);
607 		}
608 
609 		TAILQ_FOREACH_SAFE(task, &job->task_list, link, ttmp) {
610 			TAILQ_REMOVE(&job->task_list, task, link);
611 			spdk_free(task->buf);
612 			spdk_free(task->md_buf);
613 			free(task);
614 		}
615 
616 		bdevperf_job_free(job);
617 	}
618 
619 	if (g_one_thread_per_lcore) {
620 		TAILQ_FOREACH_SAFE(lthread, &g_lcore_thread_list, link, lttmp) {
621 			TAILQ_REMOVE(&g_lcore_thread_list, lthread, link);
622 			spdk_thread_send_msg(lthread->thread, job_thread_exit, NULL);
623 			free(lthread);
624 		}
625 	}
626 
627 	rc = g_run_rc;
628 	if (g_request && !g_shutdown) {
629 		rpc_perform_tests_cb();
630 		if (rc != 0) {
631 			spdk_app_stop(rc);
632 		}
633 	} else {
634 		spdk_app_stop(rc);
635 	}
636 }
637 
638 static void
639 bdevperf_job_end(void *ctx)
640 {
641 	assert(g_main_thread == spdk_get_thread());
642 
643 	if (--g_bdevperf.running_jobs == 0) {
644 		bdevperf_test_done(NULL);
645 	}
646 }
647 
648 static void
649 bdevperf_channel_get_histogram_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
650 {
651 	struct spdk_histogram_data *job_hist = cb_arg;
652 
653 	if (status == 0) {
654 		spdk_histogram_data_merge(job_hist, histogram);
655 	}
656 }
657 
658 static void
659 bdevperf_job_empty(struct bdevperf_job *job)
660 {
661 	uint64_t end_tsc = 0;
662 
663 	end_tsc = spdk_get_ticks() - g_start_tsc;
664 	job->run_time_in_usec = end_tsc * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
665 	/* keep histogram info before channel is destroyed */
666 	spdk_bdev_channel_get_histogram(job->ch, bdevperf_channel_get_histogram_cb,
667 					job->histogram);
668 	spdk_put_io_channel(job->ch);
669 	spdk_bdev_close(job->bdev_desc);
670 	spdk_thread_send_msg(g_main_thread, bdevperf_job_end, NULL);
671 }
672 
673 static void
674 bdevperf_end_task(struct bdevperf_task *task)
675 {
676 	struct bdevperf_job     *job = task->job;
677 
678 	TAILQ_INSERT_TAIL(&job->task_list, task, link);
679 	if (job->is_draining) {
680 		if (job->current_queue_depth == 0) {
681 			bdevperf_job_empty(job);
682 		}
683 	}
684 }
685 
686 static void
687 bdevperf_queue_io_wait_with_cb(struct bdevperf_task *task, spdk_bdev_io_wait_cb cb_fn)
688 {
689 	struct bdevperf_job	*job = task->job;
690 
691 	task->bdev_io_wait.bdev = job->bdev;
692 	task->bdev_io_wait.cb_fn = cb_fn;
693 	task->bdev_io_wait.cb_arg = task;
694 	spdk_bdev_queue_io_wait(job->bdev, job->ch, &task->bdev_io_wait);
695 }
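
/*
 * This helper implements the standard SPDK flow-control idiom: when a
 * submission path gets -ENOMEM back from the bdev layer, it parks the task on
 * the channel's wait queue with itself as the retry callback, e.g. (as done
 * in bdevperf_verify_submit_read() below):
 *
 *     rc = spdk_bdev_read_blocks_with_md(...);
 *     if (rc == -ENOMEM) {
 *             bdevperf_queue_io_wait_with_cb(task, bdevperf_verify_submit_read);
 *     }
 *
 * spdk_bdev_queue_io_wait() then invokes cb_fn once an I/O slot frees up.
 */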
696 
697 static int
698 bdevperf_job_drain(void *ctx)
699 {
700 	struct bdevperf_job *job = ctx;
701 
702 	spdk_poller_unregister(&job->run_timer);
703 	if (job->reset) {
704 		spdk_poller_unregister(&job->reset_timer);
705 	}
706 
707 	job->is_draining = true;
708 
709 	return -1;
710 }
711 
712 static int
713 bdevperf_job_drain_timer(void *ctx)
714 {
715 	struct bdevperf_job *job = ctx;
716 
717 	bdevperf_job_drain(ctx);
718 	if (job->current_queue_depth == 0) {
719 		bdevperf_job_empty(job);
720 	}
721 
722 	return SPDK_POLLER_BUSY;
723 }
724 
725 static void
726 bdevperf_abort_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
727 {
728 	struct bdevperf_task	*task = cb_arg;
729 	struct bdevperf_job	*job = task->job;
730 
731 	job->current_queue_depth--;
732 
733 	if (success) {
734 		job->io_completed++;
735 	} else {
736 		job->io_failed++;
737 		if (!job->continue_on_failure) {
738 			bdevperf_job_drain(job);
739 			g_run_rc = -1;
740 		}
741 	}
742 
743 	spdk_bdev_free_io(bdev_io);
744 	bdevperf_end_task(task);
745 }
746 
747 static int
748 bdevperf_verify_dif(struct bdevperf_task *task, struct iovec *iovs, int iovcnt)
749 {
750 	struct bdevperf_job	*job = task->job;
751 	struct spdk_bdev	*bdev = job->bdev;
752 	struct spdk_dif_ctx	dif_ctx;
753 	struct spdk_dif_error	err_blk = {};
754 	int			rc;
755 	struct spdk_dif_ctx_init_ext_opts dif_opts;
756 
757 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
758 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
759 	rc = spdk_dif_ctx_init(&dif_ctx,
760 			       spdk_bdev_get_block_size(bdev),
761 			       spdk_bdev_get_md_size(bdev),
762 			       spdk_bdev_is_md_interleaved(bdev),
763 			       spdk_bdev_is_dif_head_of_md(bdev),
764 			       spdk_bdev_get_dif_type(bdev),
765 			       job->dif_check_flags,
766 			       task->offset_blocks, 0, 0, 0, 0, &dif_opts);
767 	if (rc != 0) {
768 		fprintf(stderr, "Initialization of DIF context failed\n");
769 		return rc;
770 	}
771 
772 	if (spdk_bdev_is_md_interleaved(bdev)) {
773 		rc = spdk_dif_verify(iovs, iovcnt, job->io_size_blocks, &dif_ctx, &err_blk);
774 	} else {
775 		struct iovec md_iov = {
776 			.iov_base	= task->md_buf,
777 			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
778 		};
779 
780 		rc = spdk_dix_verify(iovs, iovcnt, &md_iov, job->io_size_blocks, &dif_ctx, &err_blk);
781 	}
782 
783 	if (rc != 0) {
784 		fprintf(stderr, "DIF/DIX error detected. type=%d, offset=%" PRIu32 "\n",
785 			err_blk.err_type, err_blk.err_offset);
786 	}
787 
788 	return rc;
789 }
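
/*
 * Note on the two branches above: DIF (spdk_dif_verify) checks protection
 * information interleaved into each extended block, while DIX
 * (spdk_dix_verify) checks protection information held in the separate
 * task->md_buf buffer.  bdevperf_generate_dif() below mirrors the same split
 * on the write path.
 */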
790 
791 static void
792 bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
793 {
794 	struct bdevperf_job	*job;
795 	struct bdevperf_task	*task = cb_arg;
796 	struct iovec		*iovs;
797 	int			iovcnt;
798 	bool			md_check;
799 	uint64_t		offset_in_ios;
800 	int			rc;
801 
802 	job = task->job;
803 	md_check = spdk_bdev_get_dif_type(job->bdev) == SPDK_DIF_DISABLE;
804 
805 	if (g_error_to_exit == true) {
806 		bdevperf_job_drain(job);
807 	} else if (!success) {
808 		if (!job->reset && !job->continue_on_failure) {
809 			bdevperf_job_drain(job);
810 			g_run_rc = -1;
811 			g_error_to_exit = true;
812 			printf("task offset: %" PRIu64 " on job bdev=%s fails\n",
813 			       task->offset_blocks, job->name);
814 		}
815 	} else if (job->verify || job->reset) {
816 		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
817 		assert(iovcnt == 1);
818 		assert(iovs != NULL);
819 		if (!verify_data(task->buf, job->buf_size, iovs[0].iov_base, iovs[0].iov_len,
820 				 spdk_bdev_get_block_size(job->bdev),
821 				 task->md_buf, spdk_bdev_io_get_md_buf(bdev_io),
822 				 spdk_bdev_get_md_size(job->bdev),
823 				 job->io_size_blocks, md_check)) {
824 			printf("Buffer mismatch! Target: %s Disk Offset: %" PRIu64 "\n", job->name, task->offset_blocks);
825 			printf("   First dword expected 0x%x got 0x%x\n", *(int *)task->buf, *(int *)iovs[0].iov_base);
826 			bdevperf_job_drain(job);
827 			g_run_rc = -1;
828 		}
829 	} else if (job->dif_check_flags != 0) {
830 		if (task->io_type == SPDK_BDEV_IO_TYPE_READ && spdk_bdev_get_md_size(job->bdev) != 0) {
831 			spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
832 			assert(iovcnt == 1);
833 			assert(iovs != NULL);
834 			rc = bdevperf_verify_dif(task, iovs, iovcnt);
835 			if (rc != 0) {
836 				printf("DIF error detected. task offset: %" PRIu64 " on job bdev=%s\n",
837 				       task->offset_blocks, job->name);
838 
839 				success = false;
840 				if (!job->reset && !job->continue_on_failure) {
841 					bdevperf_job_drain(job);
842 					g_run_rc = -1;
843 					g_error_to_exit = true;
844 				}
845 			}
846 		}
847 	}
848 
849 	job->current_queue_depth--;
850 
851 	if (success) {
852 		job->io_completed++;
853 	} else {
854 		job->io_failed++;
855 	}
856 
857 	if (job->verify) {
858 		assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
859 		offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;
860 
861 		assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
862 		spdk_bit_array_clear(job->outstanding, offset_in_ios);
863 	}
864 
865 	spdk_bdev_free_io(bdev_io);
866 
867 	/*
868 	 * is_draining indicates when time has expired for the test run
869 	 * and we are just waiting for the previously submitted I/O
870 	 * to complete.  In this case, do not submit a new I/O to replace
871 	 * the one just completed.
872 	 */
873 	if (!job->is_draining) {
874 		bdevperf_submit_single(job, task);
875 	} else {
876 		bdevperf_end_task(task);
877 	}
878 }
879 
880 static void
881 bdevperf_verify_submit_read(void *cb_arg)
882 {
883 	struct bdevperf_job	*job;
884 	struct bdevperf_task	*task = cb_arg;
885 	int			rc;
886 
887 	job = task->job;
888 
889 	/* Read the data back in */
890 	rc = spdk_bdev_read_blocks_with_md(job->bdev_desc, job->ch, NULL, NULL,
891 					   task->offset_blocks, job->io_size_blocks,
892 					   bdevperf_complete, task);
893 
894 	if (rc == -ENOMEM) {
895 		bdevperf_queue_io_wait_with_cb(task, bdevperf_verify_submit_read);
896 	} else if (rc != 0) {
897 		printf("Failed to submit read: %d\n", rc);
898 		bdevperf_job_drain(job);
899 		g_run_rc = rc;
900 	}
901 }
902 
903 static void
904 bdevperf_verify_write_complete(struct spdk_bdev_io *bdev_io, bool success,
905 			       void *cb_arg)
906 {
907 	if (success) {
908 		spdk_bdev_free_io(bdev_io);
909 		bdevperf_verify_submit_read(cb_arg);
910 	} else {
911 		bdevperf_complete(bdev_io, success, cb_arg);
912 	}
913 }
914 
915 static void
916 bdevperf_zcopy_populate_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
917 {
918 	if (!success) {
919 		bdevperf_complete(bdev_io, success, cb_arg);
920 		return;
921 	}
922 
923 	spdk_bdev_zcopy_end(bdev_io, false, bdevperf_complete, cb_arg);
924 }
925 
926 static int
927 bdevperf_generate_dif(struct bdevperf_task *task)
928 {
929 	struct bdevperf_job	*job = task->job;
930 	struct spdk_bdev	*bdev = job->bdev;
931 	struct spdk_dif_ctx	dif_ctx;
932 	int			rc;
933 	struct spdk_dif_ctx_init_ext_opts dif_opts;
934 
935 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
936 	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
937 	rc = spdk_dif_ctx_init(&dif_ctx,
938 			       spdk_bdev_get_block_size(bdev),
939 			       spdk_bdev_get_md_size(bdev),
940 			       spdk_bdev_is_md_interleaved(bdev),
941 			       spdk_bdev_is_dif_head_of_md(bdev),
942 			       spdk_bdev_get_dif_type(bdev),
943 			       job->dif_check_flags,
944 			       task->offset_blocks, 0, 0, 0, 0, &dif_opts);
945 	if (rc != 0) {
946 		fprintf(stderr, "Initialization of DIF context failed\n");
947 		return rc;
948 	}
949 
950 	if (spdk_bdev_is_md_interleaved(bdev)) {
951 		rc = spdk_dif_generate(&task->iov, 1, job->io_size_blocks, &dif_ctx);
952 	} else {
953 		struct iovec md_iov = {
954 			.iov_base	= task->md_buf,
955 			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
956 		};
957 
958 		rc = spdk_dix_generate(&task->iov, 1, &md_iov, job->io_size_blocks, &dif_ctx);
959 	}
960 
961 	if (rc != 0) {
962 		fprintf(stderr, "Generation of DIF/DIX failed\n");
963 	}
964 
965 	return rc;
966 }
967 
968 static void
969 bdevperf_submit_task(void *arg)
970 {
971 	struct bdevperf_task	*task = arg;
972 	struct bdevperf_job	*job = task->job;
973 	struct spdk_bdev_desc	*desc;
974 	struct spdk_io_channel	*ch;
975 	spdk_bdev_io_completion_cb cb_fn;
976 	uint64_t		offset_in_ios;
977 	int			rc = 0;
978 
979 	desc = job->bdev_desc;
980 	ch = job->ch;
981 
982 	switch (task->io_type) {
983 	case SPDK_BDEV_IO_TYPE_WRITE:
984 		if (spdk_bdev_get_md_size(job->bdev) != 0 && job->dif_check_flags != 0) {
985 			rc = bdevperf_generate_dif(task);
986 		}
987 		if (rc == 0) {
988 			cb_fn = (job->verify || job->reset) ? bdevperf_verify_write_complete : bdevperf_complete;
989 
990 			if (g_zcopy) {
991 				spdk_bdev_zcopy_end(task->bdev_io, true, cb_fn, task);
992 				return;
993 			} else {
994 				rc = spdk_bdev_writev_blocks_with_md(desc, ch, &task->iov, 1,
995 								     task->md_buf,
996 								     task->offset_blocks,
997 								     job->io_size_blocks,
998 								     cb_fn, task);
999 			}
1000 		}
1001 		break;
1002 	case SPDK_BDEV_IO_TYPE_FLUSH:
1003 		rc = spdk_bdev_flush_blocks(desc, ch, task->offset_blocks,
1004 					    job->io_size_blocks, bdevperf_complete, task);
1005 		break;
1006 	case SPDK_BDEV_IO_TYPE_UNMAP:
1007 		rc = spdk_bdev_unmap_blocks(desc, ch, task->offset_blocks,
1008 					    job->io_size_blocks, bdevperf_complete, task);
1009 		break;
1010 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1011 		rc = spdk_bdev_write_zeroes_blocks(desc, ch, task->offset_blocks,
1012 						   job->io_size_blocks, bdevperf_complete, task);
1013 		break;
1014 	case SPDK_BDEV_IO_TYPE_READ:
1015 		if (g_zcopy) {
1016 			rc = spdk_bdev_zcopy_start(desc, ch, NULL, 0, task->offset_blocks, job->io_size_blocks,
1017 						   true, bdevperf_zcopy_populate_complete, task);
1018 		} else {
1019 			rc = spdk_bdev_read_blocks_with_md(desc, ch, task->buf, task->md_buf,
1020 							   task->offset_blocks,
1021 							   job->io_size_blocks,
1022 							   bdevperf_complete, task);
1023 		}
1024 		break;
1025 	case SPDK_BDEV_IO_TYPE_ABORT:
1026 		rc = spdk_bdev_abort(desc, ch, task->task_to_abort, bdevperf_abort_complete, task);
1027 		break;
1028 	default:
1029 		assert(false);
1030 		rc = -EINVAL;
1031 		break;
1032 	}
1033 
1034 	if (rc == -ENOMEM) {
1035 		bdevperf_queue_io_wait_with_cb(task, bdevperf_submit_task);
1036 		return;
1037 	} else if (rc != 0) {
1038 		printf("Failed to submit bdev_io: %d\n", rc);
1039 		if (job->verify) {
1040 			assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
1041 			offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;
1042 
1043 			assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
1044 			spdk_bit_array_clear(job->outstanding, offset_in_ios);
1045 		}
1046 		bdevperf_job_drain(job);
1047 		g_run_rc = rc;
1048 		return;
1049 	}
1050 
1051 	job->current_queue_depth++;
1052 }
1053 
1054 static void
1055 bdevperf_zcopy_get_buf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
1056 {
1057 	struct bdevperf_task	*task = cb_arg;
1058 	struct bdevperf_job	*job = task->job;
1059 	struct iovec		*iovs;
1060 	int			iovcnt;
1061 
1062 	if (!success) {
1063 		bdevperf_job_drain(job);
1064 		g_run_rc = -1;
1065 		return;
1066 	}
1067 
1068 	task->bdev_io = bdev_io;
1069 	task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
1070 
1071 	if (job->verify || job->reset) {
1072 		/* When job->verify or job->reset is enabled, task->buf is used for
1073 		 *  verification of read after write.  For write I/O, when zcopy APIs
1074 		 *  are used, task->buf cannot be used, and data must be written to
1075 		 *  the data buffer allocated beneath the bdev layer instead.
1076 		 *  Hence we copy task->buf to the allocated data buffer here.
1077 		 */
1078 		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
1079 		assert(iovcnt == 1);
1080 		assert(iovs != NULL);
1081 
1082 		copy_data(iovs[0].iov_base, iovs[0].iov_len, task->buf, job->buf_size,
1083 			  spdk_bdev_get_block_size(job->bdev),
1084 			  spdk_bdev_io_get_md_buf(bdev_io), task->md_buf,
1085 			  spdk_bdev_get_md_size(job->bdev), job->io_size_blocks);
1086 	}
1087 
1088 	bdevperf_submit_task(task);
1089 }
1090 
1091 static void
1092 bdevperf_prep_zcopy_write_task(void *arg)
1093 {
1094 	struct bdevperf_task	*task = arg;
1095 	struct bdevperf_job	*job = task->job;
1096 	int			rc;
1097 
1098 	rc = spdk_bdev_zcopy_start(job->bdev_desc, job->ch, NULL, 0,
1099 				   task->offset_blocks, job->io_size_blocks,
1100 				   false, bdevperf_zcopy_get_buf_complete, task);
1101 	if (rc != 0) {
1102 		assert(rc == -ENOMEM);
1103 		bdevperf_queue_io_wait_with_cb(task, bdevperf_prep_zcopy_write_task);
1104 		return;
1105 	}
1106 
1107 	job->current_queue_depth++;
1108 }
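
/*
 * Zcopy write lifecycle, as implemented above and in
 * bdevperf_zcopy_get_buf_complete()/bdevperf_submit_task():
 *
 *   1. spdk_bdev_zcopy_start(populate = false) asks the bdev for a write
 *      buffer without populating it with any existing data.
 *   2. The get-buf callback copies the task's prepared data into the
 *      bdev-owned iovec (needed for verify/reset jobs).
 *   3. spdk_bdev_zcopy_end(commit = true) commits the buffer, completing
 *      the write without routing the data through task->buf.
 */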
1109 
1110 static struct bdevperf_task *
1111 bdevperf_job_get_task(struct bdevperf_job *job)
1112 {
1113 	struct bdevperf_task *task;
1114 
1115 	task = TAILQ_FIRST(&job->task_list);
1116 	if (!task) {
1117 		printf("Task allocation failed\n");
1118 		abort();
1119 	}
1120 
1121 	TAILQ_REMOVE(&job->task_list, task, link);
1122 	return task;
1123 }
1124 
1125 static void
1126 bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task)
1127 {
1128 	uint64_t offset_in_ios;
1129 	uint64_t rand_value;
1130 	uint32_t first_clear;
1131 
1132 	if (job->zipf) {
1133 		offset_in_ios = spdk_zipf_generate(job->zipf);
1134 	} else if (job->is_random) {
1135 		/* RAND_MAX is only INT32_MAX, so use 2 calls to rand_r to
1136 		 * get a large enough value to ensure we are issuing I/O
1137 		 * uniformly across the whole bdev.
1138 		 */
1139 		rand_value = (uint64_t)rand_r(&job->seed) * RAND_MAX + rand_r(&job->seed);
1140 		offset_in_ios = rand_value % job->size_in_ios;
1141 
1142 		if (g_random_map) {
1143 			/* Make sure that the offset does not exceed the maximum size
1144 			 * of the bit array (verified during job creation).
1145 			 */
1146 			assert(offset_in_ios < UINT32_MAX);
1147 
1148 			first_clear = spdk_bit_array_find_first_clear(job->random_map, (uint32_t)offset_in_ios);
1149 
1150 			if (first_clear == UINT32_MAX) {
1151 				first_clear = spdk_bit_array_find_first_clear(job->random_map, 0);
1152 
1153 				if (first_clear == UINT32_MAX) {
1154 					/* If there are no clear bits left in the array, clear the whole
1155 					 * map, start a new pass, and reuse the originally generated random value.
1156 					 */
1157 					spdk_bit_array_clear_mask(job->random_map);
1158 					first_clear = (uint32_t)offset_in_ios;
1159 				}
1160 			}
1161 
1162 			spdk_bit_array_set(job->random_map, first_clear);
1163 
1164 			offset_in_ios = first_clear;
1165 		}
1166 	} else {
1167 		offset_in_ios = job->offset_in_ios++;
1168 		if (job->offset_in_ios == job->size_in_ios) {
1169 			job->offset_in_ios = 0;
1170 		}
1171 
1172 		/* Increment offset_in_ios again if there's already an outstanding I/O
1173 		 * to that location. We only need this for job->verify, as random
1174 		 * offsets are not supported with job->verify at this time.
1175 		 */
1176 		if (job->verify) {
1177 			assert(spdk_bit_array_find_first_clear(job->outstanding, 0) != UINT32_MAX);
1178 
1179 			while (spdk_bit_array_get(job->outstanding, offset_in_ios)) {
1180 				offset_in_ios = job->offset_in_ios++;
1181 				if (job->offset_in_ios == job->size_in_ios) {
1182 					job->offset_in_ios = 0;
1183 				}
1184 			}
1185 			spdk_bit_array_set(job->outstanding, offset_in_ios);
1186 		}
1187 	}
1188 
1189 	/* When multiple threads target the same bdev, offset_in_ios is relative
1190 	 * to the LBA range assigned to this job, while task->offset_blocks
1191 	 * is absolute (within the entire bdev LBA range).
1192 	 */
1193 	task->offset_blocks = (offset_in_ios + job->ios_base) * job->io_size_blocks;
1194 
1195 	if (job->verify || job->reset) {
1196 		generate_data(task->buf, job->buf_size,
1197 			      spdk_bdev_get_block_size(job->bdev),
1198 			      task->md_buf, spdk_bdev_get_md_size(job->bdev),
1199 			      job->io_size_blocks);
1200 		if (g_zcopy) {
1201 			bdevperf_prep_zcopy_write_task(task);
1202 			return;
1203 		} else {
1204 			task->iov.iov_base = task->buf;
1205 			task->iov.iov_len = job->buf_size;
1206 			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
1207 		}
1208 	} else if (job->flush) {
1209 		task->io_type = SPDK_BDEV_IO_TYPE_FLUSH;
1210 	} else if (job->unmap) {
1211 		task->io_type = SPDK_BDEV_IO_TYPE_UNMAP;
1212 	} else if (job->write_zeroes) {
1213 		task->io_type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
1214 	} else if ((job->rw_percentage == 100) ||
1215 		   (job->rw_percentage != 0 && ((rand_r(&job->seed) % 100) < job->rw_percentage))) {
1216 		task->io_type = SPDK_BDEV_IO_TYPE_READ;
1217 	} else {
1218 		if (g_zcopy) {
1219 			bdevperf_prep_zcopy_write_task(task);
1220 			return;
1221 		} else {
1222 			task->iov.iov_base = task->buf;
1223 			task->iov.iov_len = job->buf_size;
1224 			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
1225 		}
1226 	}
1227 
1228 	bdevperf_submit_task(task);
1229 }
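
/*
 * Worked example of the random-map path above (illustrative): with
 * size_in_ios = 8 and bits {1, 3} already set, a generated offset of 3 lands
 * on a set bit, so the search returns the first clear bit at or after it
 * (bit 4), the I/O is issued there, and the bit is set.  Once all 8 bits are
 * set, the map is cleared and a new pass begins, so every offset is hit
 * exactly once per pass.
 */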
1230 
1231 static int reset_job(void *arg);
1232 
1233 static void
1234 reset_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
1235 {
1236 	struct bdevperf_task	*task = cb_arg;
1237 	struct bdevperf_job	*job = task->job;
1238 
1239 	if (!success) {
1240 		printf("Reset blockdev=%s failed\n", spdk_bdev_get_name(job->bdev));
1241 		bdevperf_job_drain(job);
1242 		g_run_rc = -1;
1243 	}
1244 
1245 	TAILQ_INSERT_TAIL(&job->task_list, task, link);
1246 	spdk_bdev_free_io(bdev_io);
1247 
1248 	job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
1249 						10 * SPDK_SEC_TO_USEC);
1250 }
1251 
1252 static int
1253 reset_job(void *arg)
1254 {
1255 	struct bdevperf_job *job = arg;
1256 	struct bdevperf_task *task;
1257 	int rc;
1258 
1259 	spdk_poller_unregister(&job->reset_timer);
1260 
1261 	/* Do reset. */
1262 	task = bdevperf_job_get_task(job);
1263 	rc = spdk_bdev_reset(job->bdev_desc, job->ch,
1264 			     reset_cb, task);
1265 	if (rc) {
1266 		printf("Reset failed: %d\n", rc);
1267 		bdevperf_job_drain(job);
1268 		g_run_rc = -1;
1269 	}
1270 
1271 	return -1;
1272 }
1273 
1274 static void
1275 bdevperf_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
1276 {
1277 	struct bdevperf_job *job = cb_arg;
1278 	struct bdevperf_task *task;
1279 
1280 	job->io_timeout++;
1281 
1282 	if (job->is_draining || !job->abort ||
1283 	    !spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
1284 		return;
1285 	}
1286 
1287 	task = bdevperf_job_get_task(job);
1288 	if (task == NULL) {
1289 		return;
1290 	}
1291 
1292 	task->task_to_abort = spdk_bdev_io_get_cb_arg(bdev_io);
1293 	task->io_type = SPDK_BDEV_IO_TYPE_ABORT;
1294 
1295 	bdevperf_submit_task(task);
1296 }
1297 
1298 static void
1299 bdevperf_job_run(void *ctx)
1300 {
1301 	struct bdevperf_job *job = ctx;
1302 	struct bdevperf_task *task;
1303 	int i;
1304 
1305 	/* Submit initial I/O for this job. Each time one
1306 	 * completes, another will be submitted. */
1307 
1308 	/* Start a timer to stop this I/O chain when the run is over */
1309 	job->run_timer = SPDK_POLLER_REGISTER(bdevperf_job_drain_timer, job, g_time_in_usec);
1310 	if (job->reset) {
1311 		job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
1312 							10 * SPDK_SEC_TO_USEC);
1313 	}
1314 
1315 	spdk_bdev_set_timeout(job->bdev_desc, g_timeout_in_sec, bdevperf_timeout_cb, job);
1316 
1317 	for (i = 0; i < job->queue_depth; i++) {
1318 		task = bdevperf_job_get_task(job);
1319 		bdevperf_submit_single(job, task);
1320 	}
1321 }
1322 
1323 static void
1324 _performance_dump_done(void *ctx)
1325 {
1326 	struct bdevperf_aggregate_stats *stats = ctx;
1327 	double average_latency;
1328 
1329 	printf("\r =================================================================================="
1330 	       "=================================\n");
1331 	printf("\r %-28s: %10s %10.2f %10.2f",
1332 	       "Total", "", stats->total_io_per_second, stats->total_mb_per_second);
1333 	printf(" %10.2f %8.2f",
1334 	       stats->total_failed_per_second, stats->total_timeout_per_second);
1335 
1336 	average_latency = stats->total_io_completed != 0 ?
1337 			  ((double)stats->total_tsc / stats->total_io_completed) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz() : 0.0;
1338 	printf(" %10.2f %10.2f %10.2f\n", average_latency, stats->min_latency, stats->max_latency);
1339 	printf("\n");
1340 
1341 	fflush(stdout);
1342 
1343 	g_performance_dump_active = false;
1344 
1345 	free(stats);
1346 }
1347 
1348 static void
1349 _performance_dump(void *ctx)
1350 {
1351 	struct bdevperf_aggregate_stats *stats = ctx;
1352 
1353 	performance_dump_job(stats, stats->current_job);
1354 
1355 	/* This assumes the jobs list is static after start-up time.
1356 	 * That's true right now, but if that ever changes, this will need a lock. */
1357 	stats->current_job = TAILQ_NEXT(stats->current_job, link);
1358 	if (stats->current_job == NULL) {
1359 		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
1360 	} else {
1361 		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
1362 	}
1363 }
1364 
1365 static int
1366 performance_statistics_thread(void *arg)
1367 {
1368 	struct bdevperf_aggregate_stats *stats;
1369 
1370 	if (g_performance_dump_active) {
1371 		return -1;
1372 	}
1373 
1374 	g_performance_dump_active = true;
1375 
1376 	stats = calloc(1, sizeof(*stats));
1377 	if (stats == NULL) {
1378 		return -1;
1379 	}
1380 
1381 	stats->min_latency = (double)UINT64_MAX;
1382 
1383 	g_show_performance_period_num++;
1384 
1385 	stats->io_time_in_usec = g_show_performance_period_num * g_show_performance_period_in_usec;
1386 	stats->ema_period = g_show_performance_ema_period;
1387 
1388 	/* Iterate all of the jobs to gather stats.
1389 	 * These jobs will not get removed here until a final performance dump is run,
1390 	 * so this should be safe without locking.
1391 	 */
1392 	stats->current_job = TAILQ_FIRST(&g_bdevperf.jobs);
1393 	if (stats->current_job == NULL) {
1394 		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
1395 	} else {
1396 		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
1397 	}
1398 
1399 	return -1;
1400 }
1401 
1402 static void
1403 bdevperf_test(void)
1404 {
1405 	struct bdevperf_job *job;
1406 
1407 	printf("Running I/O for %" PRIu64 " seconds...\n", g_time_in_usec / (uint64_t)SPDK_SEC_TO_USEC);
1408 	fflush(stdout);
1409 
1410 	/* Start a timer to dump performance numbers */
1411 	g_start_tsc = spdk_get_ticks();
1412 	if (g_show_performance_real_time && !g_perf_timer) {
1413 		printf("%*s\n", 107, "Latency(us)");
1414 		printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n",
1415 		       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max");
1416 
1417 		g_perf_timer = SPDK_POLLER_REGISTER(performance_statistics_thread, NULL,
1418 						    g_show_performance_period_in_usec);
1419 	}
1420 
1421 	/* Iterate jobs to start all I/O */
1422 	TAILQ_FOREACH(job, &g_bdevperf.jobs, link) {
1423 		g_bdevperf.running_jobs++;
1424 		spdk_thread_send_msg(job->thread, bdevperf_job_run, job);
1425 	}
1426 }
1427 
1428 static void
1429 bdevperf_bdev_removed(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
1430 {
1431 	struct bdevperf_job *job = event_ctx;
1432 
1433 	if (SPDK_BDEV_EVENT_REMOVE == type) {
1434 		bdevperf_job_drain(job);
1435 	}
1436 }
1437 
1438 static void
1439 bdevperf_histogram_status_cb(void *cb_arg, int status)
1440 {
1441 	if (status != 0) {
1442 		g_run_rc = status;
1443 		if (g_continue_on_failure == false) {
1444 			g_error_to_exit = true;
1445 		}
1446 	}
1447 
1448 	if (--g_bdev_count == 0) {
1449 		if (g_run_rc == 0) {
1450 			/* Ready to run the test */
1451 			bdevperf_test();
1452 		} else {
1453 			bdevperf_test_done(NULL);
1454 		}
1455 	}
1456 }
1457 
1458 static uint32_t g_construct_job_count = 0;
1459 
1460 static int
1461 _bdevperf_enable_histogram(void *ctx, struct spdk_bdev *bdev)
1462 {
1463 	bool *enable = ctx;
1464 
1465 	g_bdev_count++;
1466 
1467 	spdk_bdev_histogram_enable(bdev, bdevperf_histogram_status_cb, NULL, *enable);
1468 
1469 	return 0;
1470 }
1471 
1472 static void
1473 bdevperf_enable_histogram(bool enable)
1474 {
1475 	struct spdk_bdev *bdev;
1476 	int rc;
1477 
1478 	/* Increment the initial g_bdev_count so that it never reaches 0 in the middle of iteration */
1479 	g_bdev_count = 1;
1480 
1481 	if (g_job_bdev_name != NULL) {
1482 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
1483 		if (bdev) {
1484 			rc = _bdevperf_enable_histogram(&enable, bdev);
1485 		} else {
1486 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
1487 			rc = -1;
1488 		}
1489 	} else {
1490 		rc = spdk_for_each_bdev_leaf(&enable, _bdevperf_enable_histogram);
1491 	}
1492 
1493 	bdevperf_histogram_status_cb(NULL, rc);
1494 }
1495 
1496 static void
1497 _bdevperf_construct_job_done(void *ctx)
1498 {
1499 	if (--g_construct_job_count == 0) {
1500 		if (g_run_rc != 0) {
1501 			/* Something failed. */
1502 			bdevperf_test_done(NULL);
1503 			return;
1504 		}
1505 
1506 		/* always enable histogram. */
1507 		bdevperf_enable_histogram(true);
1508 	} else if (g_run_rc != 0) {
1509 		/* Reset the error, as some jobs were constructed correctly */
1510 		g_run_rc = 0;
1511 		if (g_continue_on_failure == false) {
1512 			g_error_to_exit = true;
1513 		}
1514 	}
1515 }
1516 
1517 /* Checkformat does not allow using an inline type,
1518    so this is a workaround */
1519 typedef struct spdk_thread *spdk_thread_t;
1520 
1521 static spdk_thread_t
1522 construct_job_thread(struct spdk_cpuset *cpumask, const char *tag)
1523 {
1524 	struct spdk_cpuset tmp;
1525 
1526 	/* This function runs on the main thread. */
1527 	assert(g_main_thread == spdk_get_thread());
1528 
1529 	/* Handle default mask */
1530 	if (spdk_cpuset_count(cpumask) == 0) {
1531 		cpumask = &g_all_cpuset;
1532 	}
1533 
1534 	/* Warn user that mask might need to be changed */
1535 	spdk_cpuset_copy(&tmp, cpumask);
1536 	spdk_cpuset_or(&tmp, &g_all_cpuset);
1537 	if (!spdk_cpuset_equal(&tmp, &g_all_cpuset)) {
1538 		fprintf(stderr, "cpumask for '%s' is too big\n", tag);
1539 	}
1540 
1541 	return spdk_thread_create(tag, cpumask);
1542 }
1543 
1544 static uint32_t
1545 _get_next_core(void)
1546 {
1547 	static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY;
1548 
1549 	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
1550 		current_core = spdk_env_get_first_core();
1551 		return current_core;
1552 	}
1553 
1554 	current_core = spdk_env_get_next_core(current_core);
1555 	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
1556 		current_core = spdk_env_get_first_core();
1557 	}
1558 
1559 	return current_core;
1560 }
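
/*
 * Illustrative behavior: with an environment offering lcores {0, 2, 3},
 * successive calls return 0, 2, 3, 0, 2, 3, ... so make_cli_job_config()
 * below spreads CLI-created jobs round-robin across the available cores.
 */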
1561 
1562 static void
1563 _bdevperf_construct_job(void *ctx)
1564 {
1565 	struct bdevperf_job *job = ctx;
1566 	int rc;
1567 
1568 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(job->bdev), true, bdevperf_bdev_removed, job,
1569 				&job->bdev_desc);
1570 	if (rc != 0) {
1571 		SPDK_ERRLOG("Could not open leaf bdev %s, error=%d\n", spdk_bdev_get_name(job->bdev), rc);
1572 		g_run_rc = -EINVAL;
1573 		goto end;
1574 	}
1575 
1576 	if (g_zcopy) {
1577 		if (!spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
1578 			printf("Test requires ZCOPY but bdev module does not support ZCOPY\n");
1579 			g_run_rc = -ENOTSUP;
1580 			goto end;
1581 		}
1582 	}
1583 
1584 	job->ch = spdk_bdev_get_io_channel(job->bdev_desc);
1585 	if (!job->ch) {
1586 		SPDK_ERRLOG("Could not get io_channel for device %s\n",
1587 			    spdk_bdev_get_name(job->bdev));
1588 		spdk_bdev_close(job->bdev_desc);
1589 		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
1590 		g_run_rc = -ENOMEM;
1591 		goto end;
1592 	}
1593 
1594 end:
1595 	spdk_thread_send_msg(g_main_thread, _bdevperf_construct_job_done, NULL);
1596 }
1597 
1598 static void
1599 job_init_rw(struct bdevperf_job *job, enum job_config_rw rw)
1600 {
1601 	switch (rw) {
1602 	case JOB_CONFIG_RW_READ:
1603 		job->rw_percentage = 100;
1604 		break;
1605 	case JOB_CONFIG_RW_WRITE:
1606 		job->rw_percentage = 0;
1607 		break;
1608 	case JOB_CONFIG_RW_RANDREAD:
1609 		job->is_random = true;
1610 		job->rw_percentage = 100;
1611 		job->seed = rand();
1612 		break;
1613 	case JOB_CONFIG_RW_RANDWRITE:
1614 		job->is_random = true;
1615 		job->rw_percentage = 0;
1616 		job->seed = rand();
1617 		break;
1618 	case JOB_CONFIG_RW_RW:
1619 		job->is_random = false;
1620 		break;
1621 	case JOB_CONFIG_RW_RANDRW:
1622 		job->is_random = true;
1623 		job->seed = rand();
1624 		break;
1625 	case JOB_CONFIG_RW_VERIFY:
1626 		job->verify = true;
1627 		job->rw_percentage = 50;
1628 		break;
1629 	case JOB_CONFIG_RW_RESET:
1630 		job->reset = true;
1631 		job->verify = true;
1632 		job->rw_percentage = 50;
1633 		break;
1634 	case JOB_CONFIG_RW_UNMAP:
1635 		job->unmap = true;
1636 		break;
1637 	case JOB_CONFIG_RW_FLUSH:
1638 		job->flush = true;
1639 		break;
1640 	case JOB_CONFIG_RW_WRITE_ZEROES:
1641 		job->write_zeroes = true;
1642 		break;
1643 	}
1644 }
1645 
1646 static int
1647 bdevperf_construct_job(struct spdk_bdev *bdev, struct job_config *config,
1648 		       struct spdk_thread *thread)
1649 {
1650 	struct bdevperf_job *job;
1651 	struct bdevperf_task *task;
1652 	int block_size, data_block_size;
1653 	int rc;
1654 	int task_num, n;
1655 
1656 	block_size = spdk_bdev_get_block_size(bdev);
1657 	data_block_size = spdk_bdev_get_data_block_size(bdev);
1658 
1659 	job = calloc(1, sizeof(struct bdevperf_job));
1660 	if (!job) {
1661 		fprintf(stderr, "Unable to allocate memory for new job.\n");
1662 		return -ENOMEM;
1663 	}
1664 
1665 	job->name = strdup(spdk_bdev_get_name(bdev));
1666 	if (!job->name) {
1667 		fprintf(stderr, "Unable to allocate memory for job name.\n");
1668 		bdevperf_job_free(job);
1669 		return -ENOMEM;
1670 	}
1671 
1672 	job->workload_type = g_workload_type;
1673 	job->io_size = config->bs;
1674 	job->rw_percentage = config->rwmixread;
1675 	job->continue_on_failure = g_continue_on_failure;
1676 	job->queue_depth = config->iodepth;
1677 	job->bdev = bdev;
1678 	job->io_size_blocks = job->io_size / data_block_size;
1679 	job->buf_size = job->io_size_blocks * block_size;
1680 	job->abort = g_abort;
1681 	job_init_rw(job, config->rw);
1682 
1683 	if ((job->io_size % data_block_size) != 0) {
1684 		SPDK_ERRLOG("IO size (%d) is not multiples of data block size of bdev %s (%"PRIu32")\n",
1685 			    job->io_size, spdk_bdev_get_name(bdev), data_block_size);
1686 		bdevperf_job_free(job);
1687 		return -ENOTSUP;
1688 	}
1689 
1690 	if (job->unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1691 		printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev));
1692 		bdevperf_job_free(job);
1693 		return -ENOTSUP;
1694 	}
1695 
1696 	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) {
1697 		job->dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK;
1698 	}
1699 	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
1700 		job->dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK;
1701 	}
1702 
1703 	job->offset_in_ios = 0;
1704 
1705 	if (config->length != 0) {
1706 		/* Use subset of disk */
1707 		job->size_in_ios = config->length / job->io_size_blocks;
1708 		job->ios_base = config->offset / job->io_size_blocks;
1709 	} else {
1710 		/* Use whole disk */
1711 		job->size_in_ios = spdk_bdev_get_num_blocks(bdev) / job->io_size_blocks;
1712 		job->ios_base = 0;
1713 	}
1714 
1715 	if (job->is_random && g_zipf_theta > 0) {
1716 		job->zipf = spdk_zipf_create(job->size_in_ios, g_zipf_theta, 0);
1717 	}
1718 
1719 	if (job->verify) {
1720 		if (job->size_in_ios >= UINT32_MAX) {
1721 			SPDK_ERRLOG("Due to constraints of verify operation, the job storage capacity is too large\n");
1722 			bdevperf_job_free(job);
1723 			return -ENOMEM;
1724 		}
1725 		job->outstanding = spdk_bit_array_create(job->size_in_ios);
1726 		if (job->outstanding == NULL) {
1727 			SPDK_ERRLOG("Could not create outstanding array bitmap for bdev %s\n",
1728 				    spdk_bdev_get_name(bdev));
1729 			bdevperf_job_free(job);
1730 			return -ENOMEM;
1731 		}
1732 		if (job->queue_depth > (int)job->size_in_ios) {
1733 			SPDK_WARNLOG("Due to constraints of verify job, queue depth (-q, %d) can't exceed the number of IO "
1734 				     "requests which can be submitted to the bdev %s simultaneously (%"PRIu64"). "
1735 				     "Queue depth is limited to %"PRIu64"\n",
1736 				     job->queue_depth, job->name, job->size_in_ios, job->size_in_ios);
1737 			job->queue_depth = (int)job->size_in_ios;
1738 		}
1739 	}
1740 
1741 	job->histogram = spdk_histogram_data_alloc();
1742 	if (job->histogram == NULL) {
1743 		fprintf(stderr, "Failed to allocate histogram\n");
1744 		bdevperf_job_free(job);
1745 		return -ENOMEM;
1746 	}
1747 
1748 	TAILQ_INIT(&job->task_list);
1749 
1750 	if (g_random_map) {
1751 		if (job->size_in_ios >= UINT32_MAX) {
1752 			SPDK_ERRLOG("Due to constraints of the random map, the job storage capacity is too large\n");
1753 			bdevperf_job_free(job);
1754 			return -ENOMEM;
1755 		}
1756 		job->random_map = spdk_bit_array_create(job->size_in_ios);
1757 		if (job->random_map == NULL) {
1758 			SPDK_ERRLOG("Could not create random_map array bitmap for bdev %s\n",
1759 				    spdk_bdev_get_name(bdev));
1760 			bdevperf_job_free(job);
1761 			return -ENOMEM;
1762 		}
1763 	}
1764 
1765 	task_num = job->queue_depth;
1766 	if (job->reset) {
1767 		task_num += 1;
1768 	}
1769 	if (job->abort) {
1770 		task_num += job->queue_depth;
1771 	}
1772 
1773 	TAILQ_INSERT_TAIL(&g_bdevperf.jobs, job, link);
1774 
1775 	for (n = 0; n < task_num; n++) {
1776 		task = calloc(1, sizeof(struct bdevperf_task));
1777 		if (!task) {
1778 			fprintf(stderr, "Failed to allocate memory for task\n");
1779 			spdk_zipf_free(&job->zipf);
1780 			return -ENOMEM;
1781 		}
1782 
1783 		task->buf = spdk_zmalloc(job->buf_size, spdk_bdev_get_buf_align(job->bdev), NULL,
1784 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1785 		if (!task->buf) {
1786 			fprintf(stderr, "Cannot allocate buf for task=%p\n", task);
1787 			spdk_zipf_free(&job->zipf);
1788 			free(task);
1789 			return -ENOMEM;
1790 		}
1791 
1792 		if (spdk_bdev_is_md_separate(job->bdev)) {
1793 			task->md_buf = spdk_zmalloc(job->io_size_blocks *
1794 						    spdk_bdev_get_md_size(job->bdev), 0, NULL,
1795 						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1796 			if (!task->md_buf) {
1797 				fprintf(stderr, "Cannot allocate md buf for task=%p\n", task);
1798 				spdk_zipf_free(&job->zipf);
1799 				spdk_free(task->buf);
1800 				free(task);
1801 				return -ENOMEM;
1802 			}
1803 		}
1804 
1805 		task->job = job;
1806 		TAILQ_INSERT_TAIL(&job->task_list, task, link);
1807 	}
1808 
1809 	job->thread = thread;
1810 
1811 	g_construct_job_count++;
1812 
1813 	rc = spdk_thread_send_msg(thread, _bdevperf_construct_job, job);
1814 	assert(rc == 0);
1815 
1816 	return rc;
1817 }
1818 
1819 static int
1820 parse_rw(const char *str, enum job_config_rw ret)
1821 {
1822 	if (str == NULL) {
1823 		return ret;
1824 	}
1825 
1826 	if (!strcmp(str, "read")) {
1827 		ret = JOB_CONFIG_RW_READ;
1828 	} else if (!strcmp(str, "randread")) {
1829 		ret = JOB_CONFIG_RW_RANDREAD;
1830 	} else if (!strcmp(str, "write")) {
1831 		ret = JOB_CONFIG_RW_WRITE;
1832 	} else if (!strcmp(str, "randwrite")) {
1833 		ret = JOB_CONFIG_RW_RANDWRITE;
1834 	} else if (!strcmp(str, "verify")) {
1835 		ret = JOB_CONFIG_RW_VERIFY;
1836 	} else if (!strcmp(str, "reset")) {
1837 		ret = JOB_CONFIG_RW_RESET;
1838 	} else if (!strcmp(str, "unmap")) {
1839 		ret = JOB_CONFIG_RW_UNMAP;
1840 	} else if (!strcmp(str, "write_zeroes")) {
1841 		ret = JOB_CONFIG_RW_WRITE_ZEROES;
1842 	} else if (!strcmp(str, "flush")) {
1843 		ret = JOB_CONFIG_RW_FLUSH;
1844 	} else if (!strcmp(str, "rw")) {
1845 		ret = JOB_CONFIG_RW_RW;
1846 	} else if (!strcmp(str, "randrw")) {
1847 		ret = JOB_CONFIG_RW_RANDRW;
1848 	} else {
1849 		fprintf(stderr, "rw must be one of\n"
1850 			"(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, write_zeroes, flush)\n");
1851 		ret = BDEVPERF_CONFIG_ERROR;
1852 	}
1853 
1854 	return ret;
1855 }
1856 
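/* Copy the next ':'-separated bdev name from 'filename' into 'out', stripping
 * spaces and tabs, and return a pointer just past the consumed token.  For
 * example, "Malloc0:Malloc1" yields "Malloc0" on the first call, and feeding
 * the returned pointer back in yields "Malloc1".
 */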
1857 static const char *
1858 config_filename_next(const char *filename, char *out)
1859 {
1860 	int i, k;
1861 
1862 	if (filename == NULL) {
1863 		out[0] = '\0';
1864 		return NULL;
1865 	}
1866 
1867 	if (filename[0] == ':') {
1868 		filename++;
1869 	}
1870 
1871 	for (i = 0, k = 0;
1872 	     filename[i] != '\0' &&
1873 	     filename[i] != ':' &&
1874 	     i < BDEVPERF_CONFIG_MAX_FILENAME &&
1875 	     k < (BDEVPERF_CONFIG_MAX_FILENAME - 1);
1876 	     i++) {
1877 		if (filename[i] == ' ' || filename[i] == '\t') {
1878 			continue;
1879 		}
1880 
1881 		out[k++] = filename[i];
1882 	}
1883 	out[k] = 0;
1884 	out[k] = '\0';
1885 	return filename + i;
1886 }
1887 
1888 static struct spdk_thread *
1889 get_lcore_thread(uint32_t lcore)
1890 {
1891 	struct lcore_thread *lthread;
1892 
1893 	TAILQ_FOREACH(lthread, &g_lcore_thread_list, link) {
1894 		if (lthread->lcore == lcore) {
1895 			return lthread->thread;
1896 		}
1897 	}
1898 
1899 	return NULL;
1900 }
1901 
1902 static void
1903 bdevperf_construct_jobs(void)
1904 {
1905 	char filename[BDEVPERF_CONFIG_MAX_FILENAME];
1906 	struct spdk_thread *thread;
1907 	struct job_config *config;
1908 	struct spdk_bdev *bdev;
1909 	const char *filenames;
1910 	int rc;
1911 
1912 	TAILQ_FOREACH(config, &job_config_list, link) {
1913 		filenames = config->filename;
1914 
1915 		if (!g_one_thread_per_lcore) {
1916 			thread = construct_job_thread(&config->cpumask, config->name);
1917 		} else {
1918 			thread = get_lcore_thread(config->lcore);
1919 		}
1920 		assert(thread);
1921 
1922 		while (filenames) {
1923 			filenames = config_filename_next(filenames, filename);
1924 			if (strlen(filename) == 0) {
1925 				break;
1926 			}
1927 
1928 			bdev = spdk_bdev_get_by_name(filename);
1929 			if (!bdev) {
1930 				fprintf(stderr, "Unable to find bdev '%s'\n", filename);
1931 				g_run_rc = -EINVAL;
1932 				return;
1933 			}
1934 
1935 			rc = bdevperf_construct_job(bdev, config, thread);
1936 			if (rc < 0) {
1937 				g_run_rc = rc;
1938 				return;
1939 			}
1940 		}
1941 	}
1942 }
1943 
1944 static int
1945 make_cli_job_config(const char *filename, int64_t offset, uint64_t range)
1946 {
1947 	struct job_config *config = calloc(1, sizeof(*config));
1948 
1949 	if (config == NULL) {
1950 		fprintf(stderr, "Unable to allocate memory for job config\n");
1951 		return -ENOMEM;
1952 	}
1953 
1954 	config->name = filename;
1955 	config->filename = filename;
1956 	config->lcore = _get_next_core();
1957 	spdk_cpuset_zero(&config->cpumask);
1958 	spdk_cpuset_set_cpu(&config->cpumask, config->lcore, true);
1959 	config->bs = g_io_size;
1960 	config->iodepth = g_queue_depth;
1961 	config->rwmixread = g_rw_percentage;
1962 	config->offset = offset;
1963 	config->length = range;
1964 	config->rw = parse_rw(g_workload_type, BDEVPERF_CONFIG_ERROR);
1965 	if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
1966 		free(config);
1967 		return -EINVAL;
1968 	}
1969 
1970 	TAILQ_INSERT_TAIL(&job_config_list, config, link);
1971 	return 0;
1972 }
1973 
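/* In multithread (-C) mode, each bdev is split evenly across all cores, with
 * one job per core.  The per-job range is the integer quotient, so e.g. a
 * 1000-block bdev on 3 cores gets jobs of 333 blocks at offsets 0, 333 and
 * 666, and the remainder (here the final block) is left unused.
 */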
1974 static int
1975 bdevperf_construct_multithread_job_config(void *ctx, struct spdk_bdev *bdev)
1976 {
1977 	uint32_t *num_cores = ctx;
1978 	uint32_t i;
1979 	uint64_t blocks_per_job;
1980 	int64_t offset;
1981 	int rc;
1982 
1983 	blocks_per_job = spdk_bdev_get_num_blocks(bdev) / *num_cores;
1984 	offset = 0;
1985 
1986 	SPDK_ENV_FOREACH_CORE(i) {
1987 		rc = make_cli_job_config(spdk_bdev_get_name(bdev), offset, blocks_per_job);
1988 		if (rc) {
1989 			return rc;
1990 		}
1991 
1992 		offset += blocks_per_job;
1993 	}
1994 
1995 	return 0;
1996 }
1997 
1998 static void
1999 bdevperf_construct_multithread_job_configs(void)
2000 {
2001 	struct spdk_bdev *bdev;
2002 	uint32_t i;
2003 	uint32_t num_cores;
2004 
2005 	num_cores = 0;
2006 	SPDK_ENV_FOREACH_CORE(i) {
2007 		num_cores++;
2008 	}
2009 
2010 	if (num_cores == 0) {
2011 		g_run_rc = -EINVAL;
2012 		return;
2013 	}
2014 
2015 	if (g_job_bdev_name != NULL) {
2016 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
2017 		if (!bdev) {
2018 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
			g_run_rc = -EINVAL;
2019 			return;
2020 		}
2021 		g_run_rc = bdevperf_construct_multithread_job_config(&num_cores, bdev);
2022 	} else {
2023 		g_run_rc = spdk_for_each_bdev_leaf(&num_cores, bdevperf_construct_multithread_job_config);
2024 	}
2025 
2026 }
2027 
2028 static int
2029 bdevperf_construct_job_config(void *ctx, struct spdk_bdev *bdev)
2030 {
2031 	/* Construct the job */
2032 	return make_cli_job_config(spdk_bdev_get_name(bdev), 0, 0);
2033 }
2034 
2035 static void
2036 create_lcore_thread(uint32_t lcore)
2037 {
2038 	struct lcore_thread *lthread;
2039 	struct spdk_cpuset cpumask = {};
2040 	char name[32];
2041 
2042 	lthread = calloc(1, sizeof(*lthread));
2043 	assert(lthread != NULL);
2044 
2045 	lthread->lcore = lcore;
2046 
2047 	snprintf(name, sizeof(name), "lcore_%u", lcore);
2048 	spdk_cpuset_set_cpu(&cpumask, lcore, true);
2049 
2050 	lthread->thread = spdk_thread_create(name, &cpumask);
2051 	assert(lthread->thread != NULL);
2052 
2053 	TAILQ_INSERT_TAIL(&g_lcore_thread_list, lthread, link);
2054 }
2055 
2056 static void
2057 bdevperf_construct_job_configs(void)
2058 {
2059 	struct spdk_bdev *bdev;
2060 	uint32_t i;
2061 
2062 	/* There are three different modes for allocating jobs. Standard mode
2063 	 * (the default) creates one spdk_thread per bdev and runs the I/O job there.
2064 	 *
2065 	 * The -C flag places bdevperf into "multithread" mode, meaning it creates
2066 	 * one spdk_thread per bdev PER CORE and runs a copy of the job on each,
2067 	 * effectively driving each bdev from multiple threads.
2068 	 *
2069 	 * The -j flag selects "FIO" mode, which mimics the semantics of FIO jobs.
2070 	 * In "FIO" mode, threads are spawned per-job instead of per-bdev.
2071 	 * Each FIO job can be parameterized individually by filename, cpu mask, etc.,
2072 	 * unlike the other modes, which only support global options.
2073 	 *
2074 	 * For both standard and "multithread" mode, if the -E flag is specified,
2075 	 * one spdk_thread is created PER CORE, and on each core that thread is
2076 	 * shared by all jobs running there.
2077 	 */
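	/* For example (with a hypothetical bdev named "Malloc0"):
	 *   bdevperf -q 32 -o 4096 -w randread -t 10 -T Malloc0      standard mode
	 *   bdevperf -q 32 -o 4096 -w randread -t 10 -T Malloc0 -C   multithread mode
	 *   bdevperf -j jobs.conf -t 10                               "FIO" mode
	 *   bdevperf -q 32 -o 4096 -w randread -t 10 -E               one thread per core
	 */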
2078 
2079 	if (g_bdevperf_conf) {
2080 		goto end;
2081 	}
2082 
2083 	if (g_one_thread_per_lcore) {
2084 		SPDK_ENV_FOREACH_CORE(i) {
2085 			create_lcore_thread(i);
2086 		}
2087 	}
2088 
2089 	if (g_multithread_mode) {
2090 		bdevperf_construct_multithread_job_configs();
2091 	} else if (g_job_bdev_name != NULL) {
2092 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
2093 		if (bdev) {
2094 			/* Construct the job */
2095 			g_run_rc = make_cli_job_config(g_job_bdev_name, 0, 0);
2096 		} else {
2097 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
			g_run_rc = -EINVAL;
2098 		}
2099 	} else {
2100 		g_run_rc = spdk_for_each_bdev_leaf(NULL, bdevperf_construct_job_config);
2101 	}
2102 
2103 end:
2104 	/* Increment initial construct_jobs count so that it will never reach 0 in the middle
2105 	 * of iteration.
2106 	 */
2107 	g_construct_job_count = 1;
2108 
2109 	if (g_run_rc == 0) {
2110 		bdevperf_construct_jobs();
2111 	}
2112 
2113 	_bdevperf_construct_job_done(NULL);
2114 }
2115 
2116 static int
2117 parse_uint_option(struct spdk_conf_section *s, const char *name, int def)
2118 {
2119 	const char *job_name;
2120 	int tmp;
2121 
2122 	tmp = spdk_conf_section_get_intval(s, name);
2123 	if (tmp == -1) {
2124 		/* The field was not found, so fall back to the default value.
2125 		 * In the [global] section it is ok to have undefined values,
2126 		 * but for other sections it is not. */
2127 		if (def == BDEVPERF_CONFIG_UNDEFINED) {
2128 			job_name = spdk_conf_section_get_name(s);
2129 			if (strcmp(job_name, "global") == 0) {
2130 				return def;
2131 			}
2132 
2133 			fprintf(stderr,
2134 				"Job '%s' has no '%s' assigned\n",
2135 				job_name, name);
2136 			return BDEVPERF_CONFIG_ERROR;
2137 		}
2138 		return def;
2139 	}
2140 
2141 	/* NOTE: get_intval returns nonnegative on success */
2142 	if (tmp < 0) {
2143 		fprintf(stderr, "Job '%s' has bad '%s' value.\n",
2144 			spdk_conf_section_get_name(s), name);
2145 		return BDEVPERF_CONFIG_ERROR;
2146 	}
2147 
2148 	return tmp;
2149 }
2150 
2151 /* CLI arguments override parameters for global sections */
2152 static void
2153 config_set_cli_args(struct job_config *config)
2154 {
2155 	if (g_job_bdev_name) {
2156 		config->filename = g_job_bdev_name;
2157 	}
2158 	if (g_io_size > 0) {
2159 		config->bs = g_io_size;
2160 	}
2161 	if (g_queue_depth > 0) {
2162 		config->iodepth = g_queue_depth;
2163 	}
2164 	if (g_rw_percentage > 0) {
2165 		config->rwmixread = g_rw_percentage;
2166 	}
2167 	if (g_workload_type) {
2168 		config->rw = parse_rw(g_workload_type, config->rw);
2169 	}
2170 }
2171 
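/* Read the INI-style job config file given with -j.  The section and key
 * names below are the ones this function parses; the bdev name "Malloc0" is
 * only an illustration:
 *
 *   [global]
 *   bs=4096
 *   iodepth=32
 *   rw=randrw
 *   rwmixread=70
 *
 *   [job0]
 *   filename=Malloc0
 *   offset=0
 *   length=0
 */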
2172 static int
2173 read_job_config(void)
2174 {
2175 	struct job_config global_default_config;
2176 	struct job_config global_config;
2177 	struct spdk_conf_section *s;
2178 	struct job_config *config;
2179 	const char *cpumask;
2180 	const char *rw;
2181 	bool is_global;
2182 	int n = 0;
2183 	int val;
2184 
2185 	if (g_bdevperf_conf_file == NULL) {
2186 		return 0;
2187 	}
2188 
2189 	g_bdevperf_conf = spdk_conf_allocate();
2190 	if (g_bdevperf_conf == NULL) {
2191 		fprintf(stderr, "Could not allocate job config structure\n");
2192 		return 1;
2193 	}
2194 
2195 	spdk_conf_disable_sections_merge(g_bdevperf_conf);
2196 	if (spdk_conf_read(g_bdevperf_conf, g_bdevperf_conf_file)) {
2197 		fprintf(stderr, "Invalid job config file %s\n", g_bdevperf_conf_file);
2198 		return 1;
2199 	}
2200 
2201 	/* Initialize global defaults */
2202 	global_default_config.filename = NULL;
2203 	/* Zero mask is the same as g_all_cpuset
2204 	 * The g_all_cpuset is not initialized yet,
2205 	 * so use zero mask as the default instead */
2206 	spdk_cpuset_zero(&global_default_config.cpumask);
2207 	global_default_config.bs = BDEVPERF_CONFIG_UNDEFINED;
2208 	global_default_config.iodepth = BDEVPERF_CONFIG_UNDEFINED;
2209 	/* bdevperf has no default for -M option but in FIO the default is 50 */
2210 	global_default_config.rwmixread = 50;
2211 	global_default_config.offset = 0;
2212 	/* length 0 means 100% */
2213 	global_default_config.length = 0;
2214 	global_default_config.rw = BDEVPERF_CONFIG_UNDEFINED;
2215 	config_set_cli_args(&global_default_config);
2216 
2217 	if ((int)global_default_config.rw == BDEVPERF_CONFIG_ERROR) {
2218 		return 1;
2219 	}
2220 
2221 	/* There is only a single instance of global job_config
2222 	 * We just reset its value when we encounter new [global] section */
2223 	global_config = global_default_config;
2224 
2225 	for (s = spdk_conf_first_section(g_bdevperf_conf);
2226 	     s != NULL;
2227 	     s = spdk_conf_next_section(s)) {
2228 		config = calloc(1, sizeof(*config));
2229 		if (config == NULL) {
2230 			fprintf(stderr, "Unable to allocate memory for job config\n");
2231 			return 1;
2232 		}
2233 
2234 		config->name = spdk_conf_section_get_name(s);
2235 		is_global = strcmp(config->name, "global") == 0;
2236 
2237 		if (is_global) {
2238 			global_config = global_default_config;
2239 		}
2240 
2241 		config->filename = spdk_conf_section_get_val(s, "filename");
2242 		if (config->filename == NULL) {
2243 			config->filename = global_config.filename;
2244 		}
2245 		if (!is_global) {
2246 			if (config->filename == NULL) {
2247 				fprintf(stderr, "Job '%s' expects 'filename' parameter\n", config->name);
2248 				goto error;
2249 			} else if (strnlen(config->filename, BDEVPERF_CONFIG_MAX_FILENAME)
2250 				   >= BDEVPERF_CONFIG_MAX_FILENAME) {
2251 				fprintf(stderr,
2252 					"filename for '%s' job is too long. Max length is %d\n",
2253 					config->name, BDEVPERF_CONFIG_MAX_FILENAME);
2254 				goto error;
2255 			}
2256 		}
2257 
2258 		cpumask = spdk_conf_section_get_val(s, "cpumask");
2259 		if (cpumask == NULL) {
2260 			config->cpumask = global_config.cpumask;
2261 		} else if (spdk_cpuset_parse(&config->cpumask, cpumask)) {
2262 			fprintf(stderr, "Job '%s' has bad 'cpumask' value\n", config->name);
2263 			goto error;
2264 		}
2265 
2266 		config->bs = parse_uint_option(s, "bs", global_config.bs);
2267 		if (config->bs == BDEVPERF_CONFIG_ERROR) {
2268 			goto error;
2269 		} else if (config->bs == 0) {
2270 			fprintf(stderr, "'bs' of job '%s' must be greater than 0\n", config->name);
2271 			goto error;
2272 		}
2273 
2274 		config->iodepth = parse_uint_option(s, "iodepth", global_config.iodepth);
2275 		if (config->iodepth == BDEVPERF_CONFIG_ERROR) {
2276 			goto error;
2277 		} else if (config->iodepth == 0) {
2278 			fprintf(stderr,
2279 				"'iodepth' of job '%s' must be greater than 0\n",
2280 				config->name);
2281 			goto error;
2282 		}
2283 
2284 		config->rwmixread = parse_uint_option(s, "rwmixread", global_config.rwmixread);
2285 		if (config->rwmixread == BDEVPERF_CONFIG_ERROR) {
2286 			goto error;
2287 		} else if (config->rwmixread > 100) {
2288 			fprintf(stderr,
2289 				"'rwmixread' value of '%s' job is not in 0-100 range\n",
2290 				config->name);
2291 			goto error;
2292 		}
2293 
2294 		config->offset = parse_uint_option(s, "offset", global_config.offset);
2295 		if (config->offset == BDEVPERF_CONFIG_ERROR) {
2296 			goto error;
2297 		}
2298 
2299 		val = parse_uint_option(s, "length", global_config.length);
2300 		if (val == BDEVPERF_CONFIG_ERROR) {
2301 			goto error;
2302 		}
2303 		config->length = val;
2304 
2305 		rw = spdk_conf_section_get_val(s, "rw");
2306 		config->rw = parse_rw(rw, global_config.rw);
2307 		if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
2308 			fprintf(stderr, "Job '%s' has bad 'rw' value\n", config->name);
2309 			goto error;
2310 		} else if (!is_global && (int)config->rw == BDEVPERF_CONFIG_UNDEFINED) {
2311 			fprintf(stderr, "Job '%s' has no 'rw' assigned\n", config->name);
2312 			goto error;
2313 		}
2314 
2315 		if (is_global) {
2316 			config_set_cli_args(config);
2317 			global_config = *config;
2318 			free(config);
2319 		} else {
2320 			TAILQ_INSERT_TAIL(&job_config_list, config, link);
2321 			n++;
2322 		}
2323 	}
2324 
2325 	printf("Using job config with %d jobs\n", n);
2326 	return 0;
2327 error:
2328 	free(config);
2329 	return 1;
2330 }
2331 
2332 static void
2333 bdevperf_run(void *arg1)
2334 {
2335 	uint32_t i;
2336 
2337 	g_main_thread = spdk_get_thread();
2338 
2339 	spdk_cpuset_zero(&g_all_cpuset);
2340 	SPDK_ENV_FOREACH_CORE(i) {
2341 		spdk_cpuset_set_cpu(&g_all_cpuset, i, true);
2342 	}
2343 
2344 	if (g_wait_for_tests) {
2345 		/* Do not perform any tests until RPC is received */
2346 		return;
2347 	}
2348 
2349 	bdevperf_construct_job_configs();
2350 }
2351 
2352 static void
2353 rpc_perform_tests_reset(void)
2354 {
2355 	/* Reset g_run_rc to 0 for the next test run. */
2356 	g_run_rc = 0;
2357 
2358 	/* Reset g_stats to 0 for the next test run. */
2359 	memset(&g_stats, 0, sizeof(g_stats));
2360 
2361 	/* Reset g_show_performance_period_num to 0 for the next test run. */
2362 	g_show_performance_period_num = 0;
2363 }
2364 
2365 static void
2366 rpc_perform_tests_cb(void)
2367 {
2368 	struct spdk_json_write_ctx *w;
2369 	struct spdk_jsonrpc_request *request = g_request;
2370 
2371 	g_request = NULL;
2372 
2373 	if (g_run_rc == 0) {
2374 		w = spdk_jsonrpc_begin_result(request);
2375 		spdk_json_write_uint32(w, g_run_rc);
2376 		spdk_jsonrpc_end_result(request, w);
2377 	} else {
2378 		spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
2379 						     "bdevperf failed with error %s", spdk_strerror(-g_run_rc));
2380 	}
2381 
2382 	rpc_perform_tests_reset();
2383 }
2384 
2385 static void
2386 rpc_perform_tests(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
2387 {
2388 	if (params != NULL) {
2389 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
2390 						 "perform_tests method requires no parameters");
2391 		return;
2392 	}
2393 	if (g_request != NULL) {
2394 		fprintf(stderr, "Another test is already in progress.\n");
2395 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
2396 						 spdk_strerror(-EINPROGRESS));
2397 		return;
2398 	}
2399 	g_request = request;
2400 
2401 	/* Only construct job configs at the first test run.  */
2402 	if (TAILQ_EMPTY(&job_config_list)) {
2403 		bdevperf_construct_job_configs();
2404 	} else {
2405 		bdevperf_construct_jobs();
2406 	}
2407 }
2408 SPDK_RPC_REGISTER("perform_tests", rpc_perform_tests, SPDK_RPC_RUNTIME)
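/* When bdevperf is started with -z, a test run is kicked off by sending this
 * parameterless JSON-RPC request to the application's RPC socket, e.g.:
 *
 *   { "jsonrpc": "2.0", "method": "perform_tests", "id": 1 }
 *
 * The response is sent once the run completes: 0 on success, or an error
 * describing why the run failed.
 */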
2409 
2410 static void
2411 _bdevperf_job_drain(void *ctx)
2412 {
2413 	bdevperf_job_drain(ctx);
2414 }
2415 
2416 static void
2417 spdk_bdevperf_shutdown_cb(void)
2418 {
2419 	struct bdevperf_job *job, *tmp;
2420 	g_shutdown = true;
2421 
2422 	if (g_bdevperf.running_jobs == 0) {
2423 		bdevperf_test_done(NULL);
2424 		return;
2425 	}
2426 
2427 	/* Iterate jobs to stop all I/O */
2428 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, tmp) {
2429 		spdk_thread_send_msg(job->thread, _bdevperf_job_drain, job);
2430 	}
2431 }
2432 
2433 static int
2434 bdevperf_parse_arg(int ch, char *arg)
2435 {
2436 	long long tmp;
2437 
2438 	if (ch == 'w') {
2439 		g_workload_type = optarg;
2440 	} else if (ch == 'T') {
2441 		g_job_bdev_name = optarg;
2442 	} else if (ch == 'z') {
2443 		g_wait_for_tests = true;
2444 	} else if (ch == 'Z') {
2445 		g_zcopy = true;
2446 	} else if (ch == 'X') {
2447 		g_abort = true;
2448 	} else if (ch == 'C') {
2449 		g_multithread_mode = true;
2450 	} else if (ch == 'f') {
2451 		g_continue_on_failure = true;
2452 	} else if (ch == 'j') {
2453 		g_bdevperf_conf_file = optarg;
2454 	} else if (ch == 'F') {
2455 		char *endptr;
2456 
2457 		errno = 0;
2458 		g_zipf_theta = strtod(optarg, &endptr);
2459 		if (errno || optarg == endptr || g_zipf_theta < 0) {
2460 			fprintf(stderr, "Illegal zipf theta value %s\n", optarg);
2461 			return -EINVAL;
2462 		}
2463 	} else if (ch == 'l') {
2464 		g_latency_display_level++;
2465 	} else if (ch == 'D') {
2466 		g_random_map = true;
2467 	} else if (ch == 'E') {
2468 		g_one_thread_per_lcore = true;
2469 	} else {
2470 		tmp = spdk_strtoll(optarg, 10);
2471 		if (tmp < 0) {
2472 			fprintf(stderr, "Failed to parse the value for option -%c.\n", ch);
2473 			return tmp;
2474 		} else if (tmp >= INT_MAX) {
2475 			fprintf(stderr, "Value for option -%c is too large.\n", ch);
2476 			return -ERANGE;
2477 		}
2478 
2479 		switch (ch) {
2480 		case 'q':
2481 			g_queue_depth = tmp;
2482 			break;
2483 		case 'o':
2484 			g_io_size = tmp;
2485 			break;
2486 		case 't':
2487 			g_time_in_sec = tmp;
2488 			break;
2489 		case 'k':
2490 			g_timeout_in_sec = tmp;
2491 			break;
2492 		case 'M':
2493 			g_rw_percentage = tmp;
2494 			g_mix_specified = true;
2495 			break;
2496 		case 'P':
2497 			g_show_performance_ema_period = tmp;
2498 			break;
2499 		case 'S':
2500 			g_show_performance_real_time = 1;
2501 			g_show_performance_period_in_usec = tmp * SPDK_SEC_TO_USEC;
2502 			break;
2503 		default:
2504 			return -EINVAL;
2505 		}
2506 	}
2507 	return 0;
2508 }
2509 
2510 static void
2511 bdevperf_usage(void)
2512 {
2513 	printf(" -q <depth>                io depth\n");
2514 	printf(" -o <size>                 io size in bytes\n");
2515 	printf(" -w <type>                 io pattern type, must be one of (read, write, randread, randwrite, rw, randrw, verify, reset, unmap, write_zeroes, flush)\n");
2516 	printf(" -t <time>                 time in seconds\n");
2517 	printf(" -k <timeout>              timeout in seconds to detect starved I/O (default is 0 and disabled)\n");
2518 	printf(" -M <percent>              rwmixread (100 for reads, 0 for writes)\n");
2519 	printf(" -P <num>                  moving average period (number of samples used for the EMA)\n");
2520 	printf("\t\t(If set to n, show weighted mean of the previous n IO/s in real time)\n");
2521 	printf("\t\t(Formula: M = 2 / (n + 1), EMA[i+1] = IO/s * M + (1 - M) * EMA[i])\n");
2522 	printf("\t\t(only valid with -S)\n");
2523 	printf(" -S <period>               show performance result in real time every <period> seconds\n");
2524 	printf(" -T <bdev>                 bdev to run against. Default: all available bdevs.\n");
2525 	printf(" -f                        continue processing I/O even after failures\n");
2526 	printf(" -F <zipf theta>           use zipf distribution for random I/O\n");
2527 	printf(" -Z                        enable using zcopy bdev API for read or write I/O\n");
2528 	printf(" -z                        start bdevperf, but wait for RPC to start tests\n");
2529 	printf(" -X                        abort timed out I/O\n");
2530 	printf(" -C                        enable every core to send I/Os to each bdev\n");
2531 	printf(" -j <filename>             use job config file\n");
2532 	printf(" -l                        display latency histogram, default: disable. -l display summary, -ll display details\n");
2533 	printf(" -D                        use a random map for picking offsets not previously read or written (for all jobs)\n");
2534 	printf(" -E                        share per lcore thread among jobs. Available only if -j is not used.\n");
2535 }
2536 
2537 static int
2538 verify_test_params(struct spdk_app_opts *opts)
2539 {
2540 	/* When RPC is used for starting tests and
2541 	 * no rpc_addr was configured for the app,
2542 	 * use the default address. */
2543 	if (g_wait_for_tests && opts->rpc_addr == NULL) {
2544 		opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR;
2545 	}
2546 
2547 	if (!g_bdevperf_conf_file && g_queue_depth <= 0) {
2548 		goto out;
2549 	}
2550 	if (!g_bdevperf_conf_file && g_io_size <= 0) {
2551 		goto out;
2552 	}
2553 	if (!g_bdevperf_conf_file && !g_workload_type) {
2554 		goto out;
2555 	}
2556 	if (g_bdevperf_conf_file && g_one_thread_per_lcore) {
2557 		printf("The -E option (one thread per lcore) cannot be used together with the -j config file option\n");
2558 		goto out;
2559 	}
2560 	if (g_time_in_sec <= 0) {
2561 		goto out;
2562 	}
2563 	g_time_in_usec = g_time_in_sec * SPDK_SEC_TO_USEC;
2564 
2565 	if (g_timeout_in_sec < 0) {
2566 		goto out;
2567 	}
2568 
2569 	if (g_abort && !g_timeout_in_sec) {
2570 		printf("A timeout (-k) must be set when the -X (abort) option is used; ignoring -X\n");
2571 	}
2572 
2573 	if (g_show_performance_ema_period > 0 &&
2574 	    g_show_performance_real_time == 0) {
2575 		fprintf(stderr, "-P option must be specified with -S option\n");
2576 		return 1;
2577 	}
2578 
2579 	if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
2580 		printf("I/O size of %d is greater than zero copy threshold (%d).\n",
2581 		       g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE);
2582 		printf("Zero copy mechanism will not be used.\n");
2583 		g_zcopy = false;
2584 	}
2585 
2586 	if (g_bdevperf_conf_file) {
2587 		/* workload_type verification happens during config file parsing */
2588 		return 0;
2589 	}
2590 
2591 	if (!strcmp(g_workload_type, "verify") ||
2592 	    !strcmp(g_workload_type, "reset")) {
2593 		g_rw_percentage = 50;
2594 		if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
2595 			fprintf(stderr, "I/O size for the verify workload cannot exceed %d (%d provided).\n",
2596 				SPDK_BDEV_LARGE_BUF_MAX_SIZE, g_io_size);
2597 			return 1;
2598 		}
2599 		g_verify = true;
2600 		if (!strcmp(g_workload_type, "reset")) {
2601 			g_reset = true;
2602 		}
2603 	}
2604 
2605 	if (!strcmp(g_workload_type, "read") ||
2606 	    !strcmp(g_workload_type, "randread") ||
2607 	    !strcmp(g_workload_type, "write") ||
2608 	    !strcmp(g_workload_type, "randwrite") ||
2609 	    !strcmp(g_workload_type, "verify") ||
2610 	    !strcmp(g_workload_type, "reset") ||
2611 	    !strcmp(g_workload_type, "unmap") ||
2612 	    !strcmp(g_workload_type, "write_zeroes") ||
2613 	    !strcmp(g_workload_type, "flush")) {
2614 		if (g_mix_specified) {
2615 			fprintf(stderr, "Ignoring -M option... Please use -M option"
2616 				" only when using rw or randrw.\n");
2617 		}
2618 	}
2619 
2620 	if (!strcmp(g_workload_type, "rw") ||
2621 	    !strcmp(g_workload_type, "randrw")) {
2622 		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
2623 			fprintf(stderr,
2624 				"-M must be set to a value from 0 to 100 "
2625 				"for rw or randrw.\n");
2626 			return 1;
2627 		}
2628 	}
2629 
2630 	if (strcmp(g_workload_type, "randread") &&
2631 	    strcmp(g_workload_type, "randwrite") &&
2632 	    strcmp(g_workload_type, "randrw")) {
2633 		if (g_random_map) {
2634 			fprintf(stderr, "The -D option can only be used with the"
2635 				" randread, randwrite or randrw workloads.\n");
2636 			return 1;
2637 		}
2638 	}
2639 
2640 	return 0;
2641 out:
2642 	spdk_app_usage();
2643 	bdevperf_usage();
2644 	return 1;
2645 }
2646 
2647 int
2648 main(int argc, char **argv)
2649 {
2650 	struct spdk_app_opts opts = {};
2651 	int rc;
2652 
2653 	/* Use the runtime PID to set the random seed */
2654 	srand(getpid());
2655 
2656 	spdk_app_opts_init(&opts, sizeof(opts));
2657 	opts.name = "bdevperf";
2658 	opts.rpc_addr = NULL;
2659 	opts.shutdown_cb = spdk_bdevperf_shutdown_cb;
2660 
2661 	if ((rc = spdk_app_parse_args(argc, argv, &opts, "Zzfq:o:t:w:k:CEF:M:P:S:T:Xlj:D", NULL,
2662 				      bdevperf_parse_arg, bdevperf_usage)) !=
2663 	    SPDK_APP_PARSE_ARGS_SUCCESS) {
2664 		return rc;
2665 	}
2666 
2667 	if (read_job_config()) {
2668 		free_job_config();
2669 		return 1;
2670 	}
2671 
2672 	if (verify_test_params(&opts) != 0) {
2673 		free_job_config();
2674 		exit(1);
2675 	}
2676 
2677 	rc = spdk_app_start(&opts, bdevperf_run, NULL);
2678 
2679 	spdk_app_fini();
2680 	free_job_config();
2681 	return rc;
2682 }
2683