/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation.
 *   Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES.
 *   All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"
#include "spdk/accel.h"
#include "spdk/endian.h"
#include "spdk/env.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/util.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/rpc.h"
#include "spdk/bit_array.h"
#include "spdk/conf.h"
#include "spdk/zipf.h"
#include "spdk/histogram_data.h"

#define BDEVPERF_CONFIG_MAX_FILENAME 1024
#define BDEVPERF_CONFIG_UNDEFINED -1
#define BDEVPERF_CONFIG_ERROR -2
#define PATTERN_TYPES_STR "(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush, write_zeroes)"

struct bdevperf_task {
	struct iovec			iov;
	struct bdevperf_job		*job;
	struct spdk_bdev_io		*bdev_io;
	void				*buf;
	void				*md_buf;
	uint64_t			offset_blocks;
	struct bdevperf_task		*task_to_abort;
	enum spdk_bdev_io_type		io_type;
	TAILQ_ENTRY(bdevperf_task)	link;
	struct spdk_bdev_io_wait_entry	bdev_io_wait;
};

static const char *g_workload_type = NULL;
static int g_io_size = 0;
/* Initialize to an invalid value so we can detect whether the user overrides it. */
static int g_rw_percentage = -1;
static bool g_verify = false;
static bool g_reset = false;
static bool g_continue_on_failure = false;
static bool g_abort = false;
static bool g_error_to_exit = false;
static int g_queue_depth = 0;
static uint64_t g_time_in_usec;
static int g_show_performance_real_time = 0;
static uint64_t g_show_performance_period_in_usec = SPDK_SEC_TO_USEC;
static uint64_t g_show_performance_period_num = 0;
static uint64_t g_show_performance_ema_period = 0;
static int g_run_rc = 0;
static bool g_shutdown = false;
static uint64_t g_start_tsc;
static uint64_t g_shutdown_tsc;
static bool g_zcopy = false;
static struct spdk_thread *g_main_thread;
static int g_time_in_sec = 0;
static bool g_mix_specified = false;
static const char *g_job_bdev_name;
static bool g_wait_for_tests = false;
static struct spdk_jsonrpc_request *g_request = NULL;
static bool g_multithread_mode = false;
static int g_timeout_in_sec;
static struct spdk_conf *g_bdevperf_conf = NULL;
static const char *g_bdevperf_conf_file = NULL;
static double g_zipf_theta;
static bool g_random_map = false;

static struct spdk_cpuset g_all_cpuset;
static struct spdk_poller *g_perf_timer = NULL;

static void bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task);
static void rpc_perform_tests_cb(void);

static uint32_t g_bdev_count = 0;
static uint32_t g_latency_display_level;

static bool g_one_thread_per_lcore = false;

static const double g_latency_cutoffs[] = {
	0.01,
	0.10,
	0.25,
	0.50,
	0.75,
	0.90,
	0.95,
	0.98,
	0.99,
	0.995,
	0.999,
	0.9999,
	0.99999,
	0.999999,
	0.9999999,
	-1,
};

static const char *g_rpc_log_file_name = NULL;
static FILE *g_rpc_log_file = NULL;

struct latency_info {
	uint64_t	min;
	uint64_t	max;
	uint64_t	total;
};

struct bdevperf_job {
	char				*name;
	struct spdk_bdev		*bdev;
	struct spdk_bdev_desc		*bdev_desc;
	struct spdk_io_channel		*ch;
	TAILQ_ENTRY(bdevperf_job)	link;
	struct spdk_thread		*thread;

	const char			*workload_type;
	int				io_size;
	int				rw_percentage;
	bool				is_random;
	bool				verify;
	bool				reset;
	bool				continue_on_failure;
	bool				unmap;
	bool				write_zeroes;
	bool				flush;
	bool				abort;
	int				queue_depth;
	unsigned int			seed;

	uint64_t			io_completed;
	uint64_t			io_failed;
	uint64_t			io_timeout;
	uint64_t			prev_io_completed;
	double				ema_io_per_second;
	int				current_queue_depth;
	uint64_t			size_in_ios;
	uint64_t			ios_base;
	uint64_t			offset_in_ios;
	uint64_t			io_size_blocks;
	uint64_t			buf_size;
	uint32_t			dif_check_flags;
	bool				is_draining;
	struct spdk_poller		*run_timer;
	struct spdk_poller		*reset_timer;
	struct spdk_bit_array		*outstanding;
	struct spdk_zipf		*zipf;
	TAILQ_HEAD(, bdevperf_task)	task_list;
	uint64_t			run_time_in_usec;

	/* Keep the channel's histogram data before the channel is destroyed */
	struct spdk_histogram_data	*histogram;
	struct spdk_bit_array		*random_map;
};

struct spdk_bdevperf {
	TAILQ_HEAD(, bdevperf_job)	jobs;
	uint32_t			running_jobs;
};

static struct spdk_bdevperf g_bdevperf = {
	.jobs = TAILQ_HEAD_INITIALIZER(g_bdevperf.jobs),
	.running_jobs = 0,
};

enum job_config_rw {
	JOB_CONFIG_RW_READ = 0,
	JOB_CONFIG_RW_WRITE,
	JOB_CONFIG_RW_RANDREAD,
	JOB_CONFIG_RW_RANDWRITE,
	JOB_CONFIG_RW_RW,
	JOB_CONFIG_RW_RANDRW,
	JOB_CONFIG_RW_VERIFY,
	JOB_CONFIG_RW_RESET,
	JOB_CONFIG_RW_UNMAP,
	JOB_CONFIG_RW_FLUSH,
	JOB_CONFIG_RW_WRITE_ZEROES,
};

/* Stores values from one section of the job config file */
struct job_config {
	const char			*name;
	const char			*filename;
	struct spdk_cpuset		cpumask;
	int				bs;
	int				iodepth;
	int				rwmixread;
	uint32_t			lcore;
	int64_t				offset;
	uint64_t			length;
	enum job_config_rw		rw;
	TAILQ_ENTRY(job_config)	link;
};

TAILQ_HEAD(, job_config) job_config_list
	= TAILQ_HEAD_INITIALIZER(job_config_list);

static bool g_performance_dump_active = false;

struct bdevperf_aggregate_stats {
	struct bdevperf_job		*current_job;
	uint64_t			io_time_in_usec;
	uint64_t			ema_period;
	double				total_io_per_second;
	double				total_mb_per_second;
	double				total_failed_per_second;
	double				total_timeout_per_second;
	double				min_latency;
	double				max_latency;
	uint64_t			total_io_completed;
	uint64_t			total_tsc;
};

static struct bdevperf_aggregate_stats g_stats = {.min_latency = (double)UINT64_MAX};

struct lcore_thread {
	struct spdk_thread		*thread;
	uint32_t			lcore;
	TAILQ_ENTRY(lcore_thread)	link;
};

TAILQ_HEAD(, lcore_thread) g_lcore_thread_list
	= TAILQ_HEAD_INITIALIZER(g_lcore_thread_list);

/*
 * Cumulative Moving Average (CMA): the average of all data points collected so far.
 * Exponential Moving Average (EMA): a weighted mean of the previous n data points,
 * with more weight given to recent ones.
 * Simple Moving Average (SMA): the unweighted mean of the previous n data points.
 *
 * Bdevperf supports CMA and EMA.
 */
static double
get_cma_io_per_second(struct bdevperf_job *job, uint64_t io_time_in_usec)
{
	return (double)job->io_completed * SPDK_SEC_TO_USEC / io_time_in_usec;
}

static double
get_ema_io_per_second(struct bdevperf_job *job, uint64_t ema_period)
{
	double io_completed, io_per_second;

	io_completed = job->io_completed;
	io_per_second = (double)(io_completed - job->prev_io_completed) * SPDK_SEC_TO_USEC
			/ g_show_performance_period_in_usec;
	job->prev_io_completed = io_completed;

	job->ema_io_per_second += (io_per_second - job->ema_io_per_second) * 2
				  / (ema_period + 1);
	return job->ema_io_per_second;
}

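/*
 * Illustration of the EMA update above, which uses the standard smoothing
 * factor alpha = 2 / (ema_period + 1): with ema_period = 9 (alpha = 0.2),
 * a previous EMA of 1000 IOPS and a new period sample of 2000 IOPS yield
 * 1000 + (2000 - 1000) * 0.2 = 1200 IOPS. Larger periods react more slowly
 * to load changes; the numbers here are only an example.
 */

/*
 * Callback for spdk_histogram_data_iterate(). Buckets are visited in
 * increasing latency order: approximate each bucket by its midpoint to
 * accumulate the total, take the lower bound of the first non-empty bucket
 * as the minimum, and the upper bound of the last as the maximum.
 */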
static void
get_avg_latency(void *ctx, uint64_t start, uint64_t end, uint64_t count,
		uint64_t total, uint64_t so_far)
{
	struct latency_info *latency_info = ctx;

	if (count == 0) {
		return;
	}

	latency_info->total += (start + end) / 2 * count;

	if (so_far == count) {
		latency_info->min = start;
	}

	if (so_far == total) {
		latency_info->max = end;
	}
}

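/*
 * Print one job's throughput and latency statistics and fold them into the
 * aggregate totals in *stats.
 */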
static void
performance_dump_job(struct bdevperf_aggregate_stats *stats, struct bdevperf_job *job)
{
	double io_per_second, mb_per_second, failed_per_second, timeout_per_second;
	double average_latency = 0.0, min_latency, max_latency;
	uint64_t time_in_usec;
	uint64_t tsc_rate;
	uint64_t total_io;
	struct latency_info latency_info = {};

	printf("\r Job: %s (Core Mask 0x%s)\n", job->name,
	       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));

	if (job->io_failed > 0 && !job->reset && !job->continue_on_failure) {
		printf("\r Job: %s ended in about %.2f seconds with error\n",
		       job->name, (double)job->run_time_in_usec / SPDK_SEC_TO_USEC);
	}
	if (job->verify) {
		printf("\t Verification LBA range: start 0x%" PRIx64 " length 0x%" PRIx64 "\n",
		       job->ios_base, job->size_in_ios);
	}

	if (g_performance_dump_active == true) {
		/* If the job has already ended (e.g. due to I/O failure), use its actual run time. */
		if (job->io_failed > 0 && !job->continue_on_failure) {
			time_in_usec = job->run_time_in_usec;
		} else {
			time_in_usec = stats->io_time_in_usec;
		}
	} else {
		time_in_usec = job->run_time_in_usec;
	}

	if (stats->ema_period == 0) {
		io_per_second = get_cma_io_per_second(job, time_in_usec);
	} else {
		io_per_second = get_ema_io_per_second(job, stats->ema_period);
	}

	tsc_rate = spdk_get_ticks_hz();
	mb_per_second = io_per_second * job->io_size / (1024 * 1024);

	spdk_histogram_data_iterate(job->histogram, get_avg_latency, &latency_info);

	total_io = job->io_completed + job->io_failed;
	if (total_io != 0) {
		average_latency = (double)latency_info.total / total_io * SPDK_SEC_TO_USEC / tsc_rate;
	}
	min_latency = (double)latency_info.min * SPDK_SEC_TO_USEC / tsc_rate;
	max_latency = (double)latency_info.max * SPDK_SEC_TO_USEC / tsc_rate;

	failed_per_second = (double)job->io_failed * SPDK_SEC_TO_USEC / time_in_usec;
	timeout_per_second = (double)job->io_timeout * SPDK_SEC_TO_USEC / time_in_usec;

	printf("\t %-20s: %10.2f %10.2f %10.2f",
	       job->name, (float)time_in_usec / SPDK_SEC_TO_USEC, io_per_second, mb_per_second);
	printf(" %10.2f %8.2f",
	       failed_per_second, timeout_per_second);
	printf(" %10.2f %10.2f %10.2f\n",
	       average_latency, min_latency, max_latency);

	stats->total_io_per_second += io_per_second;
	stats->total_mb_per_second += mb_per_second;
	stats->total_failed_per_second += failed_per_second;
	stats->total_timeout_per_second += timeout_per_second;
	stats->total_io_completed += job->io_completed + job->io_failed;
	stats->total_tsc += latency_info.total;
	if (min_latency < stats->min_latency) {
		stats->min_latency = min_latency;
	}
	if (max_latency > stats->max_latency) {
		stats->max_latency = max_latency;
	}
}

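/*
 * Fill the data buffer with a deterministic pattern: each block is filled
 * with 32-bit words whose values encode the block index plus the byte offset
 * within the block, and each block's metadata is memset to the block index.
 * Handles both interleaved metadata (md_buf == NULL, metadata trails each
 * block) and separate metadata buffers.
 */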
static void
generate_data(void *buf, int buf_len, int block_size, void *md_buf, int md_size,
	      int num_blocks)
{
	int offset_blocks = 0, md_offset, data_block_size, inner_offset;

	if (buf_len < num_blocks * block_size) {
		return;
	}

	if (md_buf == NULL) {
		data_block_size = block_size - md_size;
		md_buf = (char *)buf + data_block_size;
		md_offset = block_size;
	} else {
		data_block_size = block_size;
		md_offset = md_size;
	}

	while (offset_blocks < num_blocks) {
		inner_offset = 0;
		while (inner_offset < data_block_size) {
			*(uint32_t *)buf = offset_blocks + inner_offset;
			inner_offset += sizeof(uint32_t);
			buf += sizeof(uint32_t);
		}
		memset(md_buf, offset_blocks, md_size);
		md_buf += md_offset;
		offset_blocks++;
	}
}

static bool
copy_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	  void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks)
{
	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	memcpy(wr_buf, rd_buf, block_size * num_blocks);

	if (wr_md_buf != NULL) {
		memcpy(wr_md_buf, rd_md_buf, md_size * num_blocks);
	}

	return true;
}

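/*
 * Compare a write buffer against a read buffer block by block, optionally
 * including per-block metadata (md_check). Like generate_data(), it handles
 * both interleaved and separate metadata layouts. Returns true if the
 * buffers match.
 */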
static bool
verify_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	    void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks, bool md_check)
{
	int offset_blocks = 0, md_offset, data_block_size;

	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	if (wr_md_buf == NULL) {
		data_block_size = block_size - md_size;
		wr_md_buf = (char *)wr_buf + data_block_size;
		rd_md_buf = (char *)rd_buf + data_block_size;
		md_offset = block_size;
	} else {
		data_block_size = block_size;
		md_offset = md_size;
	}

	while (offset_blocks < num_blocks) {
		if (memcmp(wr_buf, rd_buf, data_block_size) != 0) {
			return false;
		}

		wr_buf += block_size;
		rd_buf += block_size;

		if (md_check) {
			if (memcmp(wr_md_buf, rd_md_buf, md_size) != 0) {
				return false;
			}

			wr_md_buf += md_offset;
			rd_md_buf += md_offset;
		}

		offset_blocks++;
	}

	return true;
}

static void
free_job_config(void)
{
	struct job_config *config, *tmp;

	spdk_conf_free(g_bdevperf_conf);
	g_bdevperf_conf = NULL;

	TAILQ_FOREACH_SAFE(config, &job_config_list, link, tmp) {
		TAILQ_REMOVE(&job_config_list, config, link);
		free(config);
	}
}

static void
bdevperf_job_free(struct bdevperf_job *job)
{
	spdk_histogram_data_free(job->histogram);
	spdk_bit_array_free(&job->outstanding);
	spdk_bit_array_free(&job->random_map);
	spdk_zipf_free(&job->zipf);
	free(job->name);
	free(job);
}

static void
job_thread_exit(void *ctx)
{
	spdk_thread_exit(spdk_get_thread());
}

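/*
 * Histogram iteration callback that prints the latency percentiles listed in
 * g_latency_cutoffs. *cutoff walks that array; whenever the cumulative
 * fraction of I/O reaches the next cutoff, the bucket's upper bound is
 * printed as that percentile's latency and the cutoff pointer advances.
 */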
static void
check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
	     uint64_t total, uint64_t so_far)
{
	double so_far_pct;
	double **cutoff = ctx;
	uint64_t tsc_rate;

	if (count == 0) {
		return;
	}

	tsc_rate = spdk_get_ticks_hz();
	so_far_pct = (double)so_far / total;
	while (so_far_pct >= **cutoff && **cutoff > 0) {
		printf("%9.5f%% : %9.3fus\n", **cutoff * 100, (double)end * SPDK_SEC_TO_USEC / tsc_rate);
		(*cutoff)++;
	}
}

static void
print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
	     uint64_t total, uint64_t so_far)
{
	double so_far_pct;
	uint64_t tsc_rate;

	if (count == 0) {
		return;
	}

	tsc_rate = spdk_get_ticks_hz();
	so_far_pct = (double)so_far * 100 / total;
	printf("%9.3f - %9.3f: %9.4f%%  (%9ju)\n",
	       (double)start * SPDK_SEC_TO_USEC / tsc_rate,
	       (double)end * SPDK_SEC_TO_USEC / tsc_rate,
	       so_far_pct, count);
}

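/*
 * Final teardown path: print the aggregate results table and, depending on
 * g_latency_display_level, the per-job latency summaries and histograms,
 * then free all jobs, tasks and worker threads and stop the application
 * (or complete the RPC that started the test).
 */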
static void
bdevperf_test_done(void *ctx)
{
	struct bdevperf_job *job, *jtmp;
	struct bdevperf_task *task, *ttmp;
	struct lcore_thread *lthread, *lttmp;
	double average_latency = 0.0;
	uint64_t time_in_usec;
	int rc;

	if (g_time_in_usec) {
		g_stats.io_time_in_usec = g_time_in_usec;

		if (!g_run_rc && g_performance_dump_active) {
			spdk_thread_send_msg(spdk_get_thread(), bdevperf_test_done, NULL);
			return;
		}
	}

	if (g_show_performance_real_time) {
		spdk_poller_unregister(&g_perf_timer);
	}

	if (g_shutdown) {
		g_shutdown_tsc = spdk_get_ticks() - g_start_tsc;
		time_in_usec = g_shutdown_tsc * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
		g_time_in_usec = (g_time_in_usec > time_in_usec) ? time_in_usec : g_time_in_usec;
		printf("Received shutdown signal, test time was about %.6f seconds\n",
		       (double)g_time_in_usec / SPDK_SEC_TO_USEC);
	}

	printf("\n%*s\n", 107, "Latency(us)");
	printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n",
	       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max");

	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		performance_dump_job(&g_stats, job);
	}

	printf("\r =================================================================================="
	       "=================================\n");
	printf("\r %-28s: %10s %10.2f %10.2f",
	       "Total", "", g_stats.total_io_per_second, g_stats.total_mb_per_second);
	printf(" %10.2f %8.2f",
	       g_stats.total_failed_per_second, g_stats.total_timeout_per_second);

	if (g_stats.total_io_completed != 0) {
		average_latency = ((double)g_stats.total_tsc / g_stats.total_io_completed) * SPDK_SEC_TO_USEC /
				  spdk_get_ticks_hz();
	}
	printf(" %10.2f %10.2f %10.2f\n", average_latency, g_stats.min_latency, g_stats.max_latency);

	fflush(stdout);

	if (g_latency_display_level == 0 || g_stats.total_io_completed == 0) {
		goto clean;
	}

	printf("\n Latency summary\n");
	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		printf("\r =============================================\n");
		printf("\r Job: %s (Core Mask 0x%s)\n", job->name,
		       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));

		const double *cutoff = g_latency_cutoffs;

		spdk_histogram_data_iterate(job->histogram, check_cutoff, &cutoff);

		printf("\n");
	}

	if (g_latency_display_level == 1) {
		goto clean;
	}

	printf("\r Latency histogram\n");
	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		printf("\r =============================================\n");
		printf("\r Job: %s (Core Mask 0x%s)\n", job->name,
		       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));

		spdk_histogram_data_iterate(job->histogram, print_bucket, NULL);
		printf("\n");
	}

clean:
	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);

		if (!g_one_thread_per_lcore) {
			spdk_thread_send_msg(job->thread, job_thread_exit, NULL);
		}

		TAILQ_FOREACH_SAFE(task, &job->task_list, link, ttmp) {
			TAILQ_REMOVE(&job->task_list, task, link);
			spdk_free(task->buf);
			spdk_free(task->md_buf);
			free(task);
		}

		bdevperf_job_free(job);
	}

	if (g_one_thread_per_lcore) {
		TAILQ_FOREACH_SAFE(lthread, &g_lcore_thread_list, link, lttmp) {
			TAILQ_REMOVE(&g_lcore_thread_list, lthread, link);
			spdk_thread_send_msg(lthread->thread, job_thread_exit, NULL);
			free(lthread);
		}
	}

	if (g_bdevperf_conf == NULL) {
		free_job_config();
	}

	rc = g_run_rc;
	if (g_request && !g_shutdown) {
		rpc_perform_tests_cb();
		if (rc != 0) {
			spdk_app_stop(rc);
		}
	} else {
		spdk_app_stop(rc);
	}
}

static void
bdevperf_job_end(void *ctx)
{
	assert(g_main_thread == spdk_get_thread());

	if (--g_bdevperf.running_jobs == 0) {
		bdevperf_test_done(NULL);
	}
}

static void
bdevperf_channel_get_histogram_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
{
	struct spdk_histogram_data *job_hist = cb_arg;

	if (status == 0) {
		spdk_histogram_data_merge(job_hist, histogram);
	}
}

static void
bdevperf_job_empty(struct bdevperf_job *job)
{
	uint64_t end_tsc = 0;

	end_tsc = spdk_get_ticks() - g_start_tsc;
	job->run_time_in_usec = end_tsc * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
	/* Keep the histogram info before the channel is destroyed */
	spdk_bdev_channel_get_histogram(job->ch, bdevperf_channel_get_histogram_cb,
					job->histogram);
	spdk_put_io_channel(job->ch);
	spdk_bdev_close(job->bdev_desc);
	spdk_thread_send_msg(g_main_thread, bdevperf_job_end, NULL);
}

static void
bdevperf_end_task(struct bdevperf_task *task)
{
	struct bdevperf_job     *job = task->job;

	TAILQ_INSERT_TAIL(&job->task_list, task, link);
	if (job->is_draining) {
		if (job->current_queue_depth == 0) {
			bdevperf_job_empty(job);
		}
	}
}

static void
bdevperf_queue_io_wait_with_cb(struct bdevperf_task *task, spdk_bdev_io_wait_cb cb_fn)
{
	struct bdevperf_job	*job = task->job;

	task->bdev_io_wait.bdev = job->bdev;
	task->bdev_io_wait.cb_fn = cb_fn;
	task->bdev_io_wait.cb_arg = task;
	spdk_bdev_queue_io_wait(job->bdev, job->ch, &task->bdev_io_wait);
}

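/*
 * Stop the job's timers and mark it as draining: outstanding I/O is allowed
 * to complete, but no new I/O is submitted in its place.
 */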
static int
bdevperf_job_drain(void *ctx)
{
	struct bdevperf_job *job = ctx;

	spdk_poller_unregister(&job->run_timer);
	if (job->reset) {
		spdk_poller_unregister(&job->reset_timer);
	}

	job->is_draining = true;

	return -1;
}

static int
bdevperf_job_drain_timer(void *ctx)
{
	struct bdevperf_job *job = ctx;

	bdevperf_job_drain(ctx);
	if (job->current_queue_depth == 0) {
		bdevperf_job_empty(job);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdevperf_abort_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_task	*task = cb_arg;
	struct bdevperf_job	*job = task->job;

	job->current_queue_depth--;

	if (success) {
		job->io_completed++;
	} else {
		job->io_failed++;
		if (!job->continue_on_failure) {
			bdevperf_job_drain(job);
			g_run_rc = -1;
		}
	}

	spdk_bdev_free_io(bdev_io);
	bdevperf_end_task(task);
}

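/*
 * Verify the DIF/DIX protection information of a completed read against the
 * job's check flags, covering both interleaved (DIF) and separate-metadata
 * (DIX) layouts.
 */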
static int
bdevperf_verify_dif(struct bdevperf_task *task, struct iovec *iovs, int iovcnt)
{
	struct bdevperf_job	*job = task->job;
	struct spdk_bdev	*bdev = job->bdev;
	struct spdk_dif_ctx	dif_ctx;
	struct spdk_dif_error	err_blk = {};
	int			rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       spdk_bdev_get_block_size(bdev),
			       spdk_bdev_get_md_size(bdev),
			       spdk_bdev_is_md_interleaved(bdev),
			       spdk_bdev_is_dif_head_of_md(bdev),
			       spdk_bdev_get_dif_type(bdev),
			       job->dif_check_flags,
			       task->offset_blocks, 0, 0, 0, 0, &dif_opts);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_verify(iovs, iovcnt, job->io_size_blocks, &dif_ctx, &err_blk);
	} else {
		struct iovec md_iov = {
			.iov_base	= task->md_buf,
			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
		};

		rc = spdk_dix_verify(iovs, iovcnt, &md_iov, job->io_size_blocks, &dif_ctx, &err_blk);
	}

	if (rc != 0) {
		fprintf(stderr, "DIF/DIX error detected. type=%d, offset=%" PRIu32 "\n",
			err_blk.err_type, err_blk.err_offset);
	}

	return rc;
}

static void
bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_job	*job;
	struct bdevperf_task	*task = cb_arg;
	struct iovec		*iovs;
	int			iovcnt;
	bool			md_check;
	uint64_t		offset_in_ios;
	int			rc;

	job = task->job;
	md_check = spdk_bdev_get_dif_type(job->bdev) == SPDK_DIF_DISABLE;

	if (g_error_to_exit == true) {
		bdevperf_job_drain(job);
	} else if (!success) {
		if (!job->reset && !job->continue_on_failure) {
			bdevperf_job_drain(job);
			g_run_rc = -1;
			g_error_to_exit = true;
			printf("task offset: %" PRIu64 " on job bdev=%s fails\n",
			       task->offset_blocks, job->name);
		}
	} else if (job->verify || job->reset) {
		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
		assert(iovcnt == 1);
		assert(iovs != NULL);
		if (!verify_data(task->buf, job->buf_size, iovs[0].iov_base, iovs[0].iov_len,
				 spdk_bdev_get_block_size(job->bdev),
				 task->md_buf, spdk_bdev_io_get_md_buf(bdev_io),
				 spdk_bdev_get_md_size(job->bdev),
				 job->io_size_blocks, md_check)) {
			printf("Buffer mismatch! Target: %s Disk Offset: %" PRIu64 "\n", job->name, task->offset_blocks);
			printf("   First dword expected 0x%x got 0x%x\n", *(int *)task->buf, *(int *)iovs[0].iov_base);
			bdevperf_job_drain(job);
			g_run_rc = -1;
		}
	} else if (job->dif_check_flags != 0) {
		if (task->io_type == SPDK_BDEV_IO_TYPE_READ && spdk_bdev_get_md_size(job->bdev) != 0) {
			spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
			assert(iovcnt == 1);
			assert(iovs != NULL);
			rc = bdevperf_verify_dif(task, iovs, iovcnt);
			if (rc != 0) {
				printf("DIF error detected. task offset: %" PRIu64 " on job bdev=%s\n",
				       task->offset_blocks, job->name);

				success = false;
				if (!job->reset && !job->continue_on_failure) {
					bdevperf_job_drain(job);
					g_run_rc = -1;
					g_error_to_exit = true;
				}
			}
		}
	}

	job->current_queue_depth--;

	if (success) {
		job->io_completed++;
	} else {
		job->io_failed++;
	}

	if (job->verify) {
		assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
		offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;

		assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
		spdk_bit_array_clear(job->outstanding, offset_in_ios);
	}

	spdk_bdev_free_io(bdev_io);

	/*
	 * is_draining indicates when time has expired for the test run
	 * and we are just waiting for the previously submitted I/O
	 * to complete.  In this case, do not submit a new I/O to replace
	 * the one just completed.
	 */
	if (!job->is_draining) {
		bdevperf_submit_single(job, task);
	} else {
		bdevperf_end_task(task);
	}
}

static void
bdevperf_verify_submit_read(void *cb_arg)
{
	struct bdevperf_job	*job;
	struct bdevperf_task	*task = cb_arg;
	int			rc;

	job = task->job;

	/* Read the data back in */
	rc = spdk_bdev_read_blocks_with_md(job->bdev_desc, job->ch, NULL, NULL,
					   task->offset_blocks, job->io_size_blocks,
					   bdevperf_complete, task);

	if (rc == -ENOMEM) {
		bdevperf_queue_io_wait_with_cb(task, bdevperf_verify_submit_read);
	} else if (rc != 0) {
		printf("Failed to submit read: %d\n", rc);
		bdevperf_job_drain(job);
		g_run_rc = rc;
	}
}

static void
bdevperf_verify_write_complete(struct spdk_bdev_io *bdev_io, bool success,
			       void *cb_arg)
{
	if (success) {
		spdk_bdev_free_io(bdev_io);
		bdevperf_verify_submit_read(cb_arg);
	} else {
		bdevperf_complete(bdev_io, success, cb_arg);
	}
}

static void
bdevperf_zcopy_populate_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	if (!success) {
		bdevperf_complete(bdev_io, success, cb_arg);
		return;
	}

	spdk_bdev_zcopy_end(bdev_io, false, bdevperf_complete, cb_arg);
}

static int
bdevperf_generate_dif(struct bdevperf_task *task)
{
	struct bdevperf_job	*job = task->job;
	struct spdk_bdev	*bdev = job->bdev;
	struct spdk_dif_ctx	dif_ctx;
	int			rc;
	struct spdk_dif_ctx_init_ext_opts dif_opts;

	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
	dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16;
	rc = spdk_dif_ctx_init(&dif_ctx,
			       spdk_bdev_get_block_size(bdev),
			       spdk_bdev_get_md_size(bdev),
			       spdk_bdev_is_md_interleaved(bdev),
			       spdk_bdev_is_dif_head_of_md(bdev),
			       spdk_bdev_get_dif_type(bdev),
			       job->dif_check_flags,
			       task->offset_blocks, 0, 0, 0, 0, &dif_opts);
	if (rc != 0) {
		fprintf(stderr, "Initialization of DIF context failed\n");
		return rc;
	}

	if (spdk_bdev_is_md_interleaved(bdev)) {
		rc = spdk_dif_generate(&task->iov, 1, job->io_size_blocks, &dif_ctx);
	} else {
		struct iovec md_iov = {
			.iov_base	= task->md_buf,
			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
		};

		rc = spdk_dix_generate(&task->iov, 1, &md_iov, job->io_size_blocks, &dif_ctx);
	}

	if (rc != 0) {
		fprintf(stderr, "Generation of DIF/DIX failed\n");
	}

	return rc;
}

static void
bdevperf_submit_task(void *arg)
{
	struct bdevperf_task	*task = arg;
	struct bdevperf_job	*job = task->job;
	struct spdk_bdev_desc	*desc;
	struct spdk_io_channel	*ch;
	spdk_bdev_io_completion_cb cb_fn;
	uint64_t		offset_in_ios;
	int			rc = 0;

	desc = job->bdev_desc;
	ch = job->ch;

	switch (task->io_type) {
	case SPDK_BDEV_IO_TYPE_WRITE:
		if (spdk_bdev_get_md_size(job->bdev) != 0 && job->dif_check_flags != 0) {
			rc = bdevperf_generate_dif(task);
		}
		if (rc == 0) {
			cb_fn = (job->verify || job->reset) ? bdevperf_verify_write_complete : bdevperf_complete;

			if (g_zcopy) {
				spdk_bdev_zcopy_end(task->bdev_io, true, cb_fn, task);
				return;
			} else {
				rc = spdk_bdev_writev_blocks_with_md(desc, ch, &task->iov, 1,
								     task->md_buf,
								     task->offset_blocks,
								     job->io_size_blocks,
								     cb_fn, task);
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = spdk_bdev_flush_blocks(desc, ch, task->offset_blocks,
					    job->io_size_blocks, bdevperf_complete, task);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = spdk_bdev_unmap_blocks(desc, ch, task->offset_blocks,
					    job->io_size_blocks, bdevperf_complete, task);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = spdk_bdev_write_zeroes_blocks(desc, ch, task->offset_blocks,
						   job->io_size_blocks, bdevperf_complete, task);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		if (g_zcopy) {
			rc = spdk_bdev_zcopy_start(desc, ch, NULL, 0, task->offset_blocks, job->io_size_blocks,
						   true, bdevperf_zcopy_populate_complete, task);
		} else {
			rc = spdk_bdev_read_blocks_with_md(desc, ch, task->buf, task->md_buf,
							   task->offset_blocks,
							   job->io_size_blocks,
							   bdevperf_complete, task);
		}
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		rc = spdk_bdev_abort(desc, ch, task->task_to_abort, bdevperf_abort_complete, task);
		break;
	default:
		assert(false);
		rc = -EINVAL;
		break;
	}

	if (rc == -ENOMEM) {
		bdevperf_queue_io_wait_with_cb(task, bdevperf_submit_task);
		return;
	} else if (rc != 0) {
		printf("Failed to submit bdev_io: %d\n", rc);
		if (job->verify) {
			assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
			offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;

			assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
			spdk_bit_array_clear(job->outstanding, offset_in_ios);
		}
		bdevperf_job_drain(job);
		g_run_rc = rc;
		return;
	}

	job->current_queue_depth++;
}

static void
bdevperf_zcopy_get_buf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_task	*task = cb_arg;
	struct bdevperf_job	*job = task->job;
	struct iovec		*iovs;
	int			iovcnt;

	if (!success) {
		bdevperf_job_drain(job);
		g_run_rc = -1;
		return;
	}

	task->bdev_io = bdev_io;
	task->io_type = SPDK_BDEV_IO_TYPE_WRITE;

	if (job->verify || job->reset) {
		/* When job->verify or job->reset is enabled, task->buf is used for
		 *  verification of read after write.  For write I/O, when zcopy APIs
		 *  are used, task->buf cannot be used, and data must be written to
		 *  the data buffer allocated underneath bdev layer instead.
		 *  Hence we copy task->buf to the allocated data buffer here.
		 */
		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
		assert(iovcnt == 1);
		assert(iovs != NULL);

		copy_data(iovs[0].iov_base, iovs[0].iov_len, task->buf, job->buf_size,
			  spdk_bdev_get_block_size(job->bdev),
			  spdk_bdev_io_get_md_buf(bdev_io), task->md_buf,
			  spdk_bdev_get_md_size(job->bdev), job->io_size_blocks);
	}

	bdevperf_submit_task(task);
}

static void
bdevperf_prep_zcopy_write_task(void *arg)
{
	struct bdevperf_task	*task = arg;
	struct bdevperf_job	*job = task->job;
	int			rc;

	rc = spdk_bdev_zcopy_start(job->bdev_desc, job->ch, NULL, 0,
				   task->offset_blocks, job->io_size_blocks,
				   false, bdevperf_zcopy_get_buf_complete, task);
	if (rc != 0) {
		assert(rc == -ENOMEM);
		bdevperf_queue_io_wait_with_cb(task, bdevperf_prep_zcopy_write_task);
		return;
	}

	job->current_queue_depth++;
}

static struct bdevperf_task *
bdevperf_job_get_task(struct bdevperf_job *job)
{
	struct bdevperf_task *task;

	task = TAILQ_FIRST(&job->task_list);
	if (!task) {
		printf("Task allocation failed\n");
		abort();
	}

	TAILQ_REMOVE(&job->task_list, task, link);
	return task;
}

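/*
 * Pick the next offset for this task (Zipf-distributed, uniformly random, or
 * sequential), translate it to an absolute block offset, choose the I/O type
 * from the job's workload settings, and submit the task.
 */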
static void
bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task)
{
	uint64_t offset_in_ios;
	uint64_t rand_value;
	uint32_t first_clear;

	if (job->zipf) {
		offset_in_ios = spdk_zipf_generate(job->zipf);
	} else if (job->is_random) {
		/* RAND_MAX is only INT32_MAX, so use 2 calls to rand_r to
		 * get a large enough value to ensure we are issuing I/O
		 * uniformly across the whole bdev.
		 */
		rand_value = (uint64_t)rand_r(&job->seed) * RAND_MAX + rand_r(&job->seed);
		offset_in_ios = rand_value % job->size_in_ios;

		if (g_random_map) {
			/* Make sure that the offset does not exceed the maximum size
			 * of the bit array (verified during job creation).
			 */
			assert(offset_in_ios < UINT32_MAX);

			first_clear = spdk_bit_array_find_first_clear(job->random_map, (uint32_t)offset_in_ios);

			if (first_clear == UINT32_MAX) {
				first_clear = spdk_bit_array_find_first_clear(job->random_map, 0);

				if (first_clear == UINT32_MAX) {
					/* No clear bits remain anywhere in the array, i.e. every
					 * offset has been issued once. Reset the map and start a
					 * new pass with the random value selected above.
					 */
					spdk_bit_array_clear_mask(job->random_map);
					first_clear = (uint32_t)offset_in_ios;
				}
			}

			spdk_bit_array_set(job->random_map, first_clear);

			offset_in_ios = first_clear;
		}
	} else {
		offset_in_ios = job->offset_in_ios++;
		if (job->offset_in_ios == job->size_in_ios) {
			job->offset_in_ios = 0;
		}

		/* Advance offset_in_ios if there's already an outstanding I/O
		 * at that location. We only need this with job->verify, since
		 * random offsets are not supported with job->verify at this time.
		 */
		if (job->verify) {
			assert(spdk_bit_array_find_first_clear(job->outstanding, 0) != UINT32_MAX);

			while (spdk_bit_array_get(job->outstanding, offset_in_ios)) {
				offset_in_ios = job->offset_in_ios++;
				if (job->offset_in_ios == job->size_in_ios) {
					job->offset_in_ios = 0;
				}
			}
			spdk_bit_array_set(job->outstanding, offset_in_ios);
		}
	}

	/* When multiple jobs target the same bdev, offset_in_ios is relative
	 * to the LBA range assigned to this job, while task->offset_blocks
	 * is absolute within the entire bdev LBA range.
	 */
	task->offset_blocks = (offset_in_ios + job->ios_base) * job->io_size_blocks;

	if (job->verify || job->reset) {
		generate_data(task->buf, job->buf_size,
			      spdk_bdev_get_block_size(job->bdev),
			      task->md_buf, spdk_bdev_get_md_size(job->bdev),
			      job->io_size_blocks);
		if (g_zcopy) {
			bdevperf_prep_zcopy_write_task(task);
			return;
		} else {
			task->iov.iov_base = task->buf;
			task->iov.iov_len = job->buf_size;
			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
		}
	} else if (job->flush) {
		task->io_type = SPDK_BDEV_IO_TYPE_FLUSH;
	} else if (job->unmap) {
		task->io_type = SPDK_BDEV_IO_TYPE_UNMAP;
	} else if (job->write_zeroes) {
		task->io_type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
	} else if ((job->rw_percentage == 100) ||
		   (job->rw_percentage != 0 && ((rand_r(&job->seed) % 100) < job->rw_percentage))) {
		task->io_type = SPDK_BDEV_IO_TYPE_READ;
	} else {
		if (g_zcopy) {
			bdevperf_prep_zcopy_write_task(task);
			return;
		} else {
			task->iov.iov_base = task->buf;
			task->iov.iov_len = job->buf_size;
			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
		}
	}

	bdevperf_submit_task(task);
}

static int reset_job(void *arg);

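/*
 * Reset flow for the "reset" workload: reset_job() submits a bdev reset and,
 * on completion, reset_cb() re-arms a 10-second one-shot poller, so the
 * device is reset every 10 seconds for the duration of the run.
 */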
static void
reset_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct bdevperf_task	*task = cb_arg;
	struct bdevperf_job	*job = task->job;

	if (!success) {
		printf("Reset blockdev=%s failed\n", spdk_bdev_get_name(job->bdev));
		bdevperf_job_drain(job);
		g_run_rc = -1;
	}

	TAILQ_INSERT_TAIL(&job->task_list, task, link);
	spdk_bdev_free_io(bdev_io);

	job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
						10 * SPDK_SEC_TO_USEC);
}

static int
reset_job(void *arg)
{
	struct bdevperf_job *job = arg;
	struct bdevperf_task *task;
	int rc;

	spdk_poller_unregister(&job->reset_timer);

	/* Do reset. */
	task = bdevperf_job_get_task(job);
	rc = spdk_bdev_reset(job->bdev_desc, job->ch,
			     reset_cb, task);
	if (rc) {
		printf("Reset failed: %d\n", rc);
		bdevperf_job_drain(job);
		g_run_rc = -1;
	}

	return -1;
}

static void
bdevperf_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
{
	struct bdevperf_job *job = cb_arg;
	struct bdevperf_task *task;

	job->io_timeout++;

	if (job->is_draining || !job->abort ||
	    !spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
		return;
	}

	task = bdevperf_job_get_task(job);
	if (task == NULL) {
		return;
	}

	task->task_to_abort = spdk_bdev_io_get_cb_arg(bdev_io);
	task->io_type = SPDK_BDEV_IO_TYPE_ABORT;

	bdevperf_submit_task(task);
}

static void
bdevperf_job_run(void *ctx)
{
	struct bdevperf_job *job = ctx;
	struct bdevperf_task *task;
	int i;

	/* Submit initial I/O for this job. Each time one
	 * completes, another will be submitted. */

	/* Start a timer to stop this I/O chain when the run is over */
	job->run_timer = SPDK_POLLER_REGISTER(bdevperf_job_drain_timer, job, g_time_in_usec);
	if (job->reset) {
		job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
							10 * SPDK_SEC_TO_USEC);
	}

	spdk_bdev_set_timeout(job->bdev_desc, g_timeout_in_sec, bdevperf_timeout_cb, job);

	for (i = 0; i < job->queue_depth; i++) {
		task = bdevperf_job_get_task(job);
		bdevperf_submit_single(job, task);
	}
}

static void
_performance_dump_done(void *ctx)
{
	struct bdevperf_aggregate_stats *stats = ctx;
	double average_latency;

	printf("\r =================================================================================="
	       "=================================\n");
	printf("\r %-28s: %10s %10.2f %10.2f",
	       "Total", "", stats->total_io_per_second, stats->total_mb_per_second);
	printf(" %10.2f %8.2f",
	       stats->total_failed_per_second, stats->total_timeout_per_second);

	average_latency = ((double)stats->total_tsc / stats->total_io_completed) * SPDK_SEC_TO_USEC /
			  spdk_get_ticks_hz();
	printf(" %10.2f %10.2f %10.2f\n", average_latency, stats->min_latency, stats->max_latency);
	printf("\n");

	fflush(stdout);

	g_performance_dump_active = false;

	free(stats);
}

static void
_performance_dump(void *ctx)
{
	struct bdevperf_aggregate_stats *stats = ctx;

	performance_dump_job(stats, stats->current_job);

	/* This assumes the jobs list is static after start up time.
	 * That's true right now, but if that ever changed this would need a lock. */
	stats->current_job = TAILQ_NEXT(stats->current_job, link);
	if (stats->current_job == NULL) {
		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
	} else {
		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
	}
}

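/*
 * Periodic poller behind the real-time performance display: allocate a stats
 * snapshot, then hop from job thread to job thread via spdk_thread_send_msg(),
 * accumulating each job's numbers before printing the totals back on the
 * main thread.
 */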
static int
performance_statistics_thread(void *arg)
{
	struct bdevperf_aggregate_stats *stats;

	if (g_performance_dump_active) {
		return -1;
	}

	g_performance_dump_active = true;

	stats = calloc(1, sizeof(*stats));
	if (stats == NULL) {
		return -1;
	}

	stats->min_latency = (double)UINT64_MAX;

	g_show_performance_period_num++;

	stats->io_time_in_usec = g_show_performance_period_num * g_show_performance_period_in_usec;
	stats->ema_period = g_show_performance_ema_period;

	/* Iterate all of the jobs to gather stats. These jobs will not be
	 * removed until a final performance dump is run, so this should be
	 * safe without locking.
	 */
	stats->current_job = TAILQ_FIRST(&g_bdevperf.jobs);
	if (stats->current_job == NULL) {
		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
	} else {
		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
	}

	return -1;
}

static void
bdevperf_test(void)
{
	struct bdevperf_job *job;

	printf("Running I/O for %" PRIu64 " seconds...\n", g_time_in_usec / (uint64_t)SPDK_SEC_TO_USEC);
	fflush(stdout);

	/* Start a timer to dump performance numbers */
	g_start_tsc = spdk_get_ticks();
	if (g_show_performance_real_time && !g_perf_timer) {
		printf("%*s\n", 107, "Latency(us)");
		printf("\r %-*s: %10s %10s %10s %10s %8s %10s %10s %10s\n",
		       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s", "Average", "min", "max");

		g_perf_timer = SPDK_POLLER_REGISTER(performance_statistics_thread, NULL,
						    g_show_performance_period_in_usec);
	}

	/* Iterate jobs to start all I/O */
	TAILQ_FOREACH(job, &g_bdevperf.jobs, link) {
		g_bdevperf.running_jobs++;
		spdk_thread_send_msg(job->thread, bdevperf_job_run, job);
	}
}

static void
bdevperf_bdev_removed(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	struct bdevperf_job *job = event_ctx;

	if (SPDK_BDEV_EVENT_REMOVE == type) {
		bdevperf_job_drain(job);
	}
}

static void
bdevperf_histogram_status_cb(void *cb_arg, int status)
{
	if (status != 0) {
		g_run_rc = status;
		if (g_continue_on_failure == false) {
			g_error_to_exit = true;
		}
	}

	if (--g_bdev_count == 0) {
		if (g_run_rc == 0) {
			/* Ready to run the test */
			bdevperf_test();
		} else {
			bdevperf_test_done(NULL);
		}
	}
}

static uint32_t g_construct_job_count = 0;

static int
_bdevperf_enable_histogram(void *ctx, struct spdk_bdev *bdev)
{
	bool *enable = ctx;

	g_bdev_count++;

	spdk_bdev_histogram_enable(bdev, bdevperf_histogram_status_cb, NULL, *enable);

	return 0;
}

static void
bdevperf_enable_histogram(bool enable)
{
	struct spdk_bdev *bdev;
	int rc;

	/* Increment g_bdev_count up front so that it can never reach 0 in the middle of the iteration */
	g_bdev_count = 1;

	if (g_job_bdev_name != NULL) {
		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
		if (bdev) {
			rc = _bdevperf_enable_histogram(&enable, bdev);
		} else {
			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
			rc = -1;
		}
	} else {
		rc = spdk_for_each_bdev_leaf(&enable, _bdevperf_enable_histogram);
	}

	bdevperf_histogram_status_cb(NULL, rc);
}

static void
_bdevperf_construct_job_done(void *ctx)
{
	if (--g_construct_job_count == 0) {
		if (g_run_rc != 0) {
			/* Something failed. */
			bdevperf_test_done(NULL);
			return;
		}

		/* Always enable the histogram. */
		bdevperf_enable_histogram(true);
	} else if (g_run_rc != 0) {
		/* Reset the error since at least some jobs were constructed successfully. */
		g_run_rc = 0;
		if (g_continue_on_failure == false) {
			g_error_to_exit = true;
		}
	}
}

/* Checkformat does not allow using an inlined type;
   this is a workaround */
typedef struct spdk_thread *spdk_thread_t;

static spdk_thread_t
construct_job_thread(struct spdk_cpuset *cpumask, const char *tag)
{
	struct spdk_cpuset tmp;

	/* This function runs on the main thread. */
	assert(g_main_thread == spdk_get_thread());

	/* Handle default mask */
	if (spdk_cpuset_count(cpumask) == 0) {
		cpumask = &g_all_cpuset;
	}

	/* Warn user that mask might need to be changed */
	spdk_cpuset_copy(&tmp, cpumask);
	spdk_cpuset_or(&tmp, &g_all_cpuset);
	if (!spdk_cpuset_equal(&tmp, &g_all_cpuset)) {
		fprintf(stderr, "cpumask for '%s' is too big\n", tag);
	}

	return spdk_thread_create(tag, cpumask);
}

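/*
 * Hand out cores in round-robin order, starting from the first core in the
 * app's core mask and wrapping around at the end.
 */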
static uint32_t
_get_next_core(void)
{
	static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY;

	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
		current_core = spdk_env_get_first_core();
		return current_core;
	}

	current_core = spdk_env_get_next_core(current_core);
	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
		current_core = spdk_env_get_first_core();
	}

	return current_core;
}

static void
_bdevperf_construct_job(void *ctx)
{
	struct bdevperf_job *job = ctx;
	int rc;

	rc = spdk_bdev_open_ext(spdk_bdev_get_name(job->bdev), true, bdevperf_bdev_removed, job,
				&job->bdev_desc);
	if (rc != 0) {
		SPDK_ERRLOG("Could not open leaf bdev %s, error=%d\n", spdk_bdev_get_name(job->bdev), rc);
		g_run_rc = -EINVAL;
		goto end;
	}

	if (g_zcopy) {
		if (!spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
			printf("Test requires ZCOPY but bdev module does not support ZCOPY\n");
			g_run_rc = -ENOTSUP;
			goto end;
		}
	}

	job->ch = spdk_bdev_get_io_channel(job->bdev_desc);
	if (!job->ch) {
		SPDK_ERRLOG("Could not get io_channel for device %s, error=%d\n", spdk_bdev_get_name(job->bdev),
			    rc);
		spdk_bdev_close(job->bdev_desc);
		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
		g_run_rc = -ENOMEM;
		goto end;
	}

end:
	spdk_thread_send_msg(g_main_thread, _bdevperf_construct_job_done, NULL);
}

static void
job_init_rw(struct bdevperf_job *job, enum job_config_rw rw)
{
	switch (rw) {
	case JOB_CONFIG_RW_READ:
		job->rw_percentage = 100;
		break;
	case JOB_CONFIG_RW_WRITE:
		job->rw_percentage = 0;
		break;
	case JOB_CONFIG_RW_RANDREAD:
		job->is_random = true;
		job->rw_percentage = 100;
		job->seed = rand();
		break;
	case JOB_CONFIG_RW_RANDWRITE:
		job->is_random = true;
		job->rw_percentage = 0;
		job->seed = rand();
		break;
	case JOB_CONFIG_RW_RW:
		job->is_random = false;
		break;
	case JOB_CONFIG_RW_RANDRW:
		job->is_random = true;
		job->seed = rand();
		break;
	case JOB_CONFIG_RW_VERIFY:
		job->verify = true;
		job->rw_percentage = 50;
		break;
	case JOB_CONFIG_RW_RESET:
		job->reset = true;
		job->verify = true;
		job->rw_percentage = 50;
		break;
	case JOB_CONFIG_RW_UNMAP:
		job->unmap = true;
		break;
	case JOB_CONFIG_RW_FLUSH:
		job->flush = true;
		break;
	case JOB_CONFIG_RW_WRITE_ZEROES:
		job->write_zeroes = true;
		break;
	}
}

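/*
 * Allocate and initialize one job for the given bdev: validate the I/O size
 * against the data block size, set up the LBA range and the verify/random
 * bitmaps as needed, preallocate the task pool (queue_depth tasks, plus one
 * for reset and queue_depth more for abort), and kick off opening the bdev
 * on the job's thread.
 */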
static int
bdevperf_construct_job(struct spdk_bdev *bdev, struct job_config *config,
		       struct spdk_thread *thread)
{
	struct bdevperf_job *job;
	struct bdevperf_task *task;
	int block_size, data_block_size;
	int rc;
	int task_num, n;

	block_size = spdk_bdev_get_block_size(bdev);
	data_block_size = spdk_bdev_get_data_block_size(bdev);

	job = calloc(1, sizeof(struct bdevperf_job));
	if (!job) {
		fprintf(stderr, "Unable to allocate memory for new job.\n");
		return -ENOMEM;
	}

	job->name = strdup(spdk_bdev_get_name(bdev));
	if (!job->name) {
		fprintf(stderr, "Unable to allocate memory for job name.\n");
		bdevperf_job_free(job);
		return -ENOMEM;
	}

	job->workload_type = g_workload_type;
	job->io_size = config->bs;
	job->rw_percentage = config->rwmixread;
	job->continue_on_failure = g_continue_on_failure;
	job->queue_depth = config->iodepth;
	job->bdev = bdev;
	job->io_size_blocks = job->io_size / data_block_size;
	job->buf_size = job->io_size_blocks * block_size;
	job->abort = g_abort;
	job_init_rw(job, config->rw);

	if ((job->io_size % data_block_size) != 0) {
		SPDK_ERRLOG("IO size (%d) is not a multiple of the data block size of bdev %s (%"PRIu32")\n",
			    job->io_size, spdk_bdev_get_name(bdev), data_block_size);
		bdevperf_job_free(job);
		return -ENOTSUP;
	}

	if (job->unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev));
		bdevperf_job_free(job);
		return -ENOTSUP;
	}

	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) {
		job->dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK;
	}
	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
		job->dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK;
	}

	job->offset_in_ios = 0;

	if (config->length != 0) {
		/* Use subset of disk */
		job->size_in_ios = config->length / job->io_size_blocks;
		job->ios_base = config->offset / job->io_size_blocks;
	} else {
		/* Use whole disk */
		job->size_in_ios = spdk_bdev_get_num_blocks(bdev) / job->io_size_blocks;
		job->ios_base = 0;
	}

	if (job->is_random && g_zipf_theta > 0) {
		job->zipf = spdk_zipf_create(job->size_in_ios, g_zipf_theta, 0);
	}

	if (job->verify) {
		if (job->size_in_ios >= UINT32_MAX) {
			SPDK_ERRLOG("Due to constraints of the verify operation, the job storage capacity is too large\n");
1730 			bdevperf_job_free(job);
1731 			return -ENOMEM;
1732 		}
1733 		job->outstanding = spdk_bit_array_create(job->size_in_ios);
1734 		if (job->outstanding == NULL) {
1735 			SPDK_ERRLOG("Could not create outstanding array bitmap for bdev %s\n",
1736 				    spdk_bdev_get_name(bdev));
1737 			bdevperf_job_free(job);
1738 			return -ENOMEM;
1739 		}
1740 		if (job->queue_depth > (int)job->size_in_ios) {
1741 			SPDK_WARNLOG("Due to constraints of verify job, queue depth (-q, %d) can't exceed the number of IO "
1742 				     "requests which can be submitted to the bdev %s simultaneously (%"PRIu64"). "
1743 				     "Queue depth is limited to %"PRIu64"\n",
1744 				     job->queue_depth, job->name, job->size_in_ios, job->size_in_ios);
1745 			job->queue_depth = (int)job->size_in_ios;
1746 		}
1747 	}
1748 
1749 	job->histogram = spdk_histogram_data_alloc();
1750 	if (job->histogram == NULL) {
1751 		fprintf(stderr, "Failed to allocate histogram\n");
1752 		bdevperf_job_free(job);
1753 		return -ENOMEM;
1754 	}
1755 
1756 	TAILQ_INIT(&job->task_list);
1757 
1758 	if (g_random_map) {
1759 		if (job->size_in_ios >= UINT32_MAX) {
1760 			SPDK_ERRLOG("Due to constraints of the random map, the job storage capacity is too large\n");
1761 			bdevperf_job_free(job);
1762 			return -ENOMEM;
1763 		}
1764 		job->random_map = spdk_bit_array_create(job->size_in_ios);
1765 		if (job->random_map == NULL) {
1766 			SPDK_ERRLOG("Could not create random_map array bitmap for bdev %s\n",
1767 				    spdk_bdev_get_name(bdev));
1768 			bdevperf_job_free(job);
1769 			return -ENOMEM;
1770 		}
1771 	}
1772 
1773 	task_num = job->queue_depth;
1774 	if (job->reset) {
1775 		task_num += 1;
1776 	}
1777 	if (job->abort) {
1778 		task_num += job->queue_depth;
1779 	}
1780 
1781 	TAILQ_INSERT_TAIL(&g_bdevperf.jobs, job, link);
1782 
1783 	for (n = 0; n < task_num; n++) {
1784 		task = calloc(1, sizeof(struct bdevperf_task));
1785 		if (!task) {
1786 			fprintf(stderr, "Failed to allocate task from memory\n");
1787 			spdk_zipf_free(&job->zipf);
1788 			return -ENOMEM;
1789 		}
1790 
1791 		task->buf = spdk_zmalloc(job->buf_size, spdk_bdev_get_buf_align(job->bdev), NULL,
1792 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1793 		if (!task->buf) {
1794 			fprintf(stderr, "Cannot allocate buf for task=%p\n", task);
1795 			spdk_zipf_free(&job->zipf);
1796 			free(task);
1797 			return -ENOMEM;
1798 		}
1799 
1800 		if (spdk_bdev_is_md_separate(job->bdev)) {
1801 			task->md_buf = spdk_zmalloc(job->io_size_blocks *
1802 						    spdk_bdev_get_md_size(job->bdev), 0, NULL,
1803 						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1804 			if (!task->md_buf) {
1805 				fprintf(stderr, "Cannot allocate md buf for task=%p\n", task);
1806 				spdk_zipf_free(&job->zipf);
1807 				spdk_free(task->buf);
1808 				free(task);
1809 				return -ENOMEM;
1810 			}
1811 		}
1812 
1813 		task->job = job;
1814 		TAILQ_INSERT_TAIL(&job->task_list, task, link);
1815 	}
1816 
1817 	job->thread = thread;
1818 
1819 	g_construct_job_count++;
1820 
1821 	rc = spdk_thread_send_msg(thread, _bdevperf_construct_job, job);
1822 	assert(rc == 0);
1823 
1824 	return rc;
1825 }
1826 
1827 static int
1828 parse_rw(const char *str, enum job_config_rw ret)
1829 {
1830 	if (str == NULL) {
1831 		return ret;
1832 	}
1833 
1834 	if (!strcmp(str, "read")) {
1835 		ret = JOB_CONFIG_RW_READ;
1836 	} else if (!strcmp(str, "randread")) {
1837 		ret = JOB_CONFIG_RW_RANDREAD;
1838 	} else if (!strcmp(str, "write")) {
1839 		ret = JOB_CONFIG_RW_WRITE;
1840 	} else if (!strcmp(str, "randwrite")) {
1841 		ret = JOB_CONFIG_RW_RANDWRITE;
1842 	} else if (!strcmp(str, "verify")) {
1843 		ret = JOB_CONFIG_RW_VERIFY;
1844 	} else if (!strcmp(str, "reset")) {
1845 		ret = JOB_CONFIG_RW_RESET;
1846 	} else if (!strcmp(str, "unmap")) {
1847 		ret = JOB_CONFIG_RW_UNMAP;
1848 	} else if (!strcmp(str, "write_zeroes")) {
1849 		ret = JOB_CONFIG_RW_WRITE_ZEROES;
1850 	} else if (!strcmp(str, "flush")) {
1851 		ret = JOB_CONFIG_RW_FLUSH;
1852 	} else if (!strcmp(str, "rw")) {
1853 		ret = JOB_CONFIG_RW_RW;
1854 	} else if (!strcmp(str, "randrw")) {
1855 		ret = JOB_CONFIG_RW_RANDRW;
1856 	} else {
1857 		fprintf(stderr, "rw must be one of\n"
1858 			PATTERN_TYPES_STR "\n");
1859 		ret = BDEVPERF_CONFIG_ERROR;
1860 	}
1861 
1862 	return ret;
1863 }
1864 
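/*
 * Copy the next colon-separated name from 'filename' into 'out', skipping
 * spaces and tabs, and return a pointer just past the consumed portion.
 * Illustrative example (hypothetical bdev names): for "Malloc0:Malloc1",
 * the first call yields "Malloc0", the second "Malloc1", and 'out' becomes
 * the empty string once the list is exhausted.
 */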
1865 static const char *
1866 config_filename_next(const char *filename, char *out)
1867 {
1868 	int i, k;
1869 
1870 	if (filename == NULL) {
1871 		out[0] = '\0';
1872 		return NULL;
1873 	}
1874 
1875 	if (filename[0] == ':') {
1876 		filename++;
1877 	}
1878 
1879 	for (i = 0, k = 0;
1880 	     filename[i] != '\0' &&
1881 	     filename[i] != ':' &&
1882 	     i < BDEVPERF_CONFIG_MAX_FILENAME &&
1883 	     k < (BDEVPERF_CONFIG_MAX_FILENAME - 1);
1884 	     i++) {
1885 		if (filename[i] == ' ' || filename[i] == '\t') {
1886 			continue;
1887 		}
1888 
1889 		out[k++] = filename[i];
1890 	}
1891 	out[k] = 0;
1892 
1893 	return filename + i;
1894 }
1895 
1896 static struct spdk_thread *
1897 get_lcore_thread(uint32_t lcore)
1898 {
1899 	struct lcore_thread *lthread;
1900 
1901 	TAILQ_FOREACH(lthread, &g_lcore_thread_list, link) {
1902 		if (lthread->lcore == lcore) {
1903 			return lthread->thread;
1904 		}
1905 	}
1906 
1907 	return NULL;
1908 }
1909 
1910 static void
1911 create_lcore_thread(uint32_t lcore)
1912 {
1913 	struct lcore_thread *lthread;
1914 	struct spdk_cpuset cpumask = {};
1915 	char name[32];
1916 
1917 	lthread = calloc(1, sizeof(*lthread));
1918 	assert(lthread != NULL);
1919 
1920 	lthread->lcore = lcore;
1921 
1922 	snprintf(name, sizeof(name), "lcore_%u", lcore);
1923 	spdk_cpuset_set_cpu(&cpumask, lcore, true);
1924 
1925 	lthread->thread = spdk_thread_create(name, &cpumask);
1926 	assert(lthread->thread != NULL);
1927 
1928 	TAILQ_INSERT_TAIL(&g_lcore_thread_list, lthread, link);
1929 }
1930 
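/*
 * Instantiate one bdevperf_job per (job config, bdev) pair. A config's
 * filename may name several bdevs separated by colons (e.g.
 * "Malloc0:Malloc1", names hypothetical); each named bdev gets its own job
 * running on the config's thread.
 */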
1931 static void
1932 bdevperf_construct_jobs(void)
1933 {
1934 	char filename[BDEVPERF_CONFIG_MAX_FILENAME];
1935 	struct spdk_thread *thread;
1936 	struct job_config *config;
1937 	struct spdk_bdev *bdev;
1938 	const char *filenames;
1939 	uint32_t i;
1940 	int rc;
1941 
1942 	if (g_one_thread_per_lcore) {
1943 		SPDK_ENV_FOREACH_CORE(i) {
1944 			create_lcore_thread(i);
1945 		}
1946 	}
1947 
1948 	TAILQ_FOREACH(config, &job_config_list, link) {
1949 		filenames = config->filename;
1950 
1951 		if (!g_one_thread_per_lcore) {
1952 			thread = construct_job_thread(&config->cpumask, config->name);
1953 		} else {
1954 			thread = get_lcore_thread(config->lcore);
1955 		}
1956 		assert(thread);
1957 
1958 		while (filenames) {
1959 			filenames = config_filename_next(filenames, filename);
1960 			if (strlen(filename) == 0) {
1961 				break;
1962 			}
1963 
1964 			bdev = spdk_bdev_get_by_name(filename);
1965 			if (!bdev) {
1966 				fprintf(stderr, "Unable to find bdev '%s'\n", filename);
1967 				g_run_rc = -EINVAL;
1968 				return;
1969 			}
1970 
1971 			rc = bdevperf_construct_job(bdev, config, thread);
1972 			if (rc < 0) {
1973 				g_run_rc = rc;
1974 				return;
1975 			}
1976 		}
1977 	}
1978 }
1979 
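/*
 * Build a job_config for a single bdev from the global CLI options, pinned
 * to the core returned by _get_next_core(). 'offset' and 'range' are in
 * blocks; a range of 0 means the whole bdev.
 */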
1980 static int
1981 make_cli_job_config(const char *filename, int64_t offset, uint64_t range)
1982 {
1983 	struct job_config *config = calloc(1, sizeof(*config));
1984 
1985 	if (config == NULL) {
1986 		fprintf(stderr, "Unable to allocate memory for job config\n");
1987 		return -ENOMEM;
1988 	}
1989 
1990 	config->name = filename;
1991 	config->filename = filename;
1992 	config->lcore = _get_next_core();
1993 	spdk_cpuset_zero(&config->cpumask);
1994 	spdk_cpuset_set_cpu(&config->cpumask, config->lcore, true);
1995 	config->bs = g_io_size;
1996 	config->iodepth = g_queue_depth;
1997 	config->rwmixread = g_rw_percentage;
1998 	config->offset = offset;
1999 	config->length = range;
2000 	config->rw = parse_rw(g_workload_type, BDEVPERF_CONFIG_ERROR);
2001 	if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
2002 		free(config);
2003 		return -EINVAL;
2004 	}
2005 
2006 	TAILQ_INSERT_TAIL(&job_config_list, config, link);
2007 	return 0;
2008 }
2009 
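/*
 * In multithread (-C) mode, split the bdev evenly across all cores: each
 * core gets a job covering num_blocks / num_cores blocks at consecutive
 * offsets. Any remainder blocks past the last whole share are not assigned
 * to any job.
 */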
2010 static int
2011 bdevperf_construct_multithread_job_config(void *ctx, struct spdk_bdev *bdev)
2012 {
2013 	uint32_t *num_cores = ctx;
2014 	uint32_t i;
2015 	uint64_t blocks_per_job;
2016 	int64_t offset;
2017 	int rc;
2018 
2019 	blocks_per_job = spdk_bdev_get_num_blocks(bdev) / *num_cores;
2020 	offset = 0;
2021 
2022 	SPDK_ENV_FOREACH_CORE(i) {
2023 		rc = make_cli_job_config(spdk_bdev_get_name(bdev), offset, blocks_per_job);
2024 		if (rc) {
2025 			return rc;
2026 		}
2027 
2028 		offset += blocks_per_job;
2029 	}
2030 
2031 	return 0;
2032 }
2033 
2034 static void
2035 bdevperf_construct_multithread_job_configs(void)
2036 {
2037 	struct spdk_bdev *bdev;
2038 	uint32_t i;
2039 	uint32_t num_cores;
2040 
2041 	num_cores = 0;
2042 	SPDK_ENV_FOREACH_CORE(i) {
2043 		num_cores++;
2044 	}
2045 
2046 	if (num_cores == 0) {
2047 		g_run_rc = -EINVAL;
2048 		return;
2049 	}
2050 
2051 	if (g_job_bdev_name != NULL) {
2052 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
2053 		if (!bdev) {
			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
			g_run_rc = -EINVAL;
			return;
2056 		}
2057 		g_run_rc = bdevperf_construct_multithread_job_config(&num_cores, bdev);
2058 	} else {
2059 		g_run_rc = spdk_for_each_bdev_leaf(&num_cores, bdevperf_construct_multithread_job_config);
2060 	}
2061 
2063 
2064 static int
2065 bdevperf_construct_job_config(void *ctx, struct spdk_bdev *bdev)
2066 {
2067 	/* Construct the job */
2068 	return make_cli_job_config(spdk_bdev_get_name(bdev), 0, 0);
2069 }
2070 
2071 static void
2072 bdevperf_construct_job_configs(void)
2073 {
2074 	struct spdk_bdev *bdev;
2075 
2076 	/* There are three different modes for allocating jobs. Standard mode
2077 	 * (the default) creates one spdk_thread per bdev and runs the I/O job there.
2078 	 *
2079 	 * The -C flag places bdevperf into "multithread" mode, meaning it creates
2080 	 * one spdk_thread per bdev PER CORE, and runs a copy of the job on each.
2081 	 * This runs multiple threads per bdev, effectively.
2082 	 *
	 * The -j flag implies "FIO" mode, which tries to mimic the semantics of FIO jobs.
	 * In "FIO" mode, threads are spawned per job instead of per bdev.
	 * Each FIO job can be parameterized individually (filename, cpumask, etc.),
	 * unlike the other modes, which only support global options.
	 *
	 * In both standard mode and "multithread" mode, the -E flag creates one
	 * spdk_thread PER CORE. On each core, that single spdk_thread is shared by
	 * all of the core's jobs.
2091 	 */
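
	/*
	 * Illustrative invocations for each mode (bdev names and option
	 * values are hypothetical):
	 *
	 *   bdevperf -q 32 -o 4096 -w randread -t 10          (standard)
	 *   bdevperf -q 32 -o 4096 -w randread -t 10 -C       (multithread)
	 *   bdevperf -j jobs.conf -t 10                       ("FIO" mode)
	 *   bdevperf -q 32 -o 4096 -w randread -t 10 -E       (thread per lcore)
	 */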
2092 
2093 	if (g_bdevperf_conf) {
2094 		goto end;
2095 	}
2096 
2097 	if (g_multithread_mode) {
2098 		bdevperf_construct_multithread_job_configs();
2099 	} else if (g_job_bdev_name != NULL) {
2100 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
2101 		if (bdev) {
2102 			/* Construct the job */
2103 			g_run_rc = make_cli_job_config(g_job_bdev_name, 0, 0);
2104 		} else {
2105 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
2106 		}
2107 	} else {
2108 		g_run_rc = spdk_for_each_bdev_leaf(NULL, bdevperf_construct_job_config);
2109 	}
2110 
2111 end:
2112 	/* Increment initial construct_jobs count so that it will never reach 0 in the middle
2113 	 * of iteration.
2114 	 */
2115 	g_construct_job_count = 1;
2116 
2117 	if (g_run_rc == 0) {
2118 		bdevperf_construct_jobs();
2119 	}
2120 
2121 	_bdevperf_construct_job_done(NULL);
2122 }
2123 
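/*
 * Parse a nonnegative integer option from config section 's'. Returns 'def'
 * when the key is absent (a missing key without a defined default is only
 * acceptable in the [global] section), and BDEVPERF_CONFIG_ERROR for a
 * missing required key or a negative value.
 */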
2124 static int
2125 parse_uint_option(struct spdk_conf_section *s, const char *name, int def)
2126 {
2127 	const char *job_name;
2128 	int tmp;
2129 
2130 	tmp = spdk_conf_section_get_intval(s, name);
2131 	if (tmp == -1) {
		/* Field was not found, so fall back to the default value.
		 * Undefined values are allowed in the [global] section,
		 * but not in any other section. */
2135 		if (def == BDEVPERF_CONFIG_UNDEFINED) {
2136 			job_name = spdk_conf_section_get_name(s);
2137 			if (strcmp(job_name, "global") == 0) {
2138 				return def;
2139 			}
2140 
2141 			fprintf(stderr,
2142 				"Job '%s' has no '%s' assigned\n",
2143 				job_name, name);
2144 			return BDEVPERF_CONFIG_ERROR;
2145 		}
2146 		return def;
2147 	}
2148 
	/* Negative values in the config file are invalid for all options parsed here */
2150 	if (tmp < 0) {
2151 		fprintf(stderr, "Job '%s' has bad '%s' value.\n",
2152 			spdk_conf_section_get_name(s), name);
2153 		return BDEVPERF_CONFIG_ERROR;
2154 	}
2155 
2156 	return tmp;
2157 }
2158 
2159 /* CLI arguments override parameters for global sections */
2160 static void
2161 config_set_cli_args(struct job_config *config)
2162 {
2163 	if (g_job_bdev_name) {
2164 		config->filename = g_job_bdev_name;
2165 	}
2166 	if (g_io_size > 0) {
2167 		config->bs = g_io_size;
2168 	}
2169 	if (g_queue_depth > 0) {
2170 		config->iodepth = g_queue_depth;
2171 	}
2172 	if (g_rw_percentage > 0) {
2173 		config->rwmixread = g_rw_percentage;
2174 	}
2175 	if (g_workload_type) {
2176 		config->rw = parse_rw(g_workload_type, config->rw);
2177 	}
2178 }
2179 
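/*
 * Read the INI-style job config file given with -j. A minimal illustrative
 * example (section names and values are hypothetical; the keys match the
 * parser below):
 *
 *   [global]
 *   filename=Malloc0
 *   bs=4096
 *   iodepth=32
 *   rw=randrw
 *   rwmixread=70
 *
 *   [job0]
 *   cpumask=[0]
 *   offset=0
 *   length=0
 *
 * Keys missing from a job section fall back to the most recent [global]
 * section, which in turn can be overridden by CLI arguments (see
 * config_set_cli_args()). A length of 0 means 100% of the bdev.
 */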
2180 static int
2181 read_job_config(void)
2182 {
2183 	struct job_config global_default_config;
2184 	struct job_config global_config;
2185 	struct spdk_conf_section *s;
2186 	struct job_config *config = NULL;
2187 	const char *cpumask;
2188 	const char *rw;
2189 	bool is_global;
2190 	int n = 0;
2191 	int val;
2192 
	/* Open the RPC log file (-J) before the early return below, so that
	 * -J also takes effect when no job config file is used. */
	if (g_rpc_log_file_name != NULL) {
		g_rpc_log_file = fopen(g_rpc_log_file_name, "a");
		if (g_rpc_log_file == NULL) {
			fprintf(stderr, "Failed to open %s\n", g_rpc_log_file_name);
			return 1;
		}
	}

	if (g_bdevperf_conf_file == NULL) {
		return 0;
	}
2196 
2197 	g_bdevperf_conf = spdk_conf_allocate();
2198 	if (g_bdevperf_conf == NULL) {
2199 		fprintf(stderr, "Could not allocate job config structure\n");
2200 		return 1;
2201 	}
2202 
2203 	spdk_conf_disable_sections_merge(g_bdevperf_conf);
2204 	if (spdk_conf_read(g_bdevperf_conf, g_bdevperf_conf_file)) {
		fprintf(stderr, "Invalid job config\n");
2206 		return 1;
2207 	}
2208 
2209 	/* Initialize global defaults */
2210 	global_default_config.filename = NULL;
	/* An empty mask is treated the same as g_all_cpuset.
	 * g_all_cpuset is not initialized yet,
	 * so use the empty mask as the default instead. */
2214 	spdk_cpuset_zero(&global_default_config.cpumask);
2215 	global_default_config.bs = BDEVPERF_CONFIG_UNDEFINED;
2216 	global_default_config.iodepth = BDEVPERF_CONFIG_UNDEFINED;
2217 	/* bdevperf has no default for -M option but in FIO the default is 50 */
2218 	global_default_config.rwmixread = 50;
2219 	global_default_config.offset = 0;
2220 	/* length 0 means 100% */
2221 	global_default_config.length = 0;
2222 	global_default_config.rw = BDEVPERF_CONFIG_UNDEFINED;
2223 	config_set_cli_args(&global_default_config);
2224 
2225 	if ((int)global_default_config.rw == BDEVPERF_CONFIG_ERROR) {
2226 		return 1;
2227 	}
2228 
2229 	/* There is only a single instance of global job_config
2230 	 * We just reset its value when we encounter new [global] section */
2231 	global_config = global_default_config;
2232 
2233 	for (s = spdk_conf_first_section(g_bdevperf_conf);
2234 	     s != NULL;
2235 	     s = spdk_conf_next_section(s)) {
2236 		config = calloc(1, sizeof(*config));
2237 		if (config == NULL) {
2238 			fprintf(stderr, "Unable to allocate memory for job config\n");
2239 			return 1;
2240 		}
2241 
2242 		config->name = spdk_conf_section_get_name(s);
2243 		is_global = strcmp(config->name, "global") == 0;
2244 
2245 		if (is_global) {
2246 			global_config = global_default_config;
2247 		}
2248 
2249 		config->filename = spdk_conf_section_get_val(s, "filename");
2250 		if (config->filename == NULL) {
2251 			config->filename = global_config.filename;
2252 		}
2253 		if (!is_global) {
2254 			if (config->filename == NULL) {
2255 				fprintf(stderr, "Job '%s' expects 'filename' parameter\n", config->name);
2256 				goto error;
2257 			} else if (strnlen(config->filename, BDEVPERF_CONFIG_MAX_FILENAME)
2258 				   >= BDEVPERF_CONFIG_MAX_FILENAME) {
2259 				fprintf(stderr,
2260 					"filename for '%s' job is too long. Max length is %d\n",
2261 					config->name, BDEVPERF_CONFIG_MAX_FILENAME);
2262 				goto error;
2263 			}
2264 		}
2265 
2266 		cpumask = spdk_conf_section_get_val(s, "cpumask");
2267 		if (cpumask == NULL) {
2268 			config->cpumask = global_config.cpumask;
2269 		} else if (spdk_cpuset_parse(&config->cpumask, cpumask)) {
2270 			fprintf(stderr, "Job '%s' has bad 'cpumask' value\n", config->name);
2271 			goto error;
2272 		}
2273 
2274 		config->bs = parse_uint_option(s, "bs", global_config.bs);
2275 		if (config->bs == BDEVPERF_CONFIG_ERROR) {
2276 			goto error;
2277 		} else if (config->bs == 0) {
2278 			fprintf(stderr, "'bs' of job '%s' must be greater than 0\n", config->name);
2279 			goto error;
2280 		}
2281 
2282 		config->iodepth = parse_uint_option(s, "iodepth", global_config.iodepth);
2283 		if (config->iodepth == BDEVPERF_CONFIG_ERROR) {
2284 			goto error;
2285 		} else if (config->iodepth == 0) {
2286 			fprintf(stderr,
2287 				"'iodepth' of job '%s' must be greater than 0\n",
2288 				config->name);
2289 			goto error;
2290 		}
2291 
2292 		config->rwmixread = parse_uint_option(s, "rwmixread", global_config.rwmixread);
2293 		if (config->rwmixread == BDEVPERF_CONFIG_ERROR) {
2294 			goto error;
2295 		} else if (config->rwmixread > 100) {
2296 			fprintf(stderr,
2297 				"'rwmixread' value of '%s' job is not in 0-100 range\n",
2298 				config->name);
2299 			goto error;
2300 		}
2301 
2302 		config->offset = parse_uint_option(s, "offset", global_config.offset);
2303 		if (config->offset == BDEVPERF_CONFIG_ERROR) {
2304 			goto error;
2305 		}
2306 
2307 		val = parse_uint_option(s, "length", global_config.length);
2308 		if (val == BDEVPERF_CONFIG_ERROR) {
2309 			goto error;
2310 		}
2311 		config->length = val;
2312 
2313 		rw = spdk_conf_section_get_val(s, "rw");
2314 		config->rw = parse_rw(rw, global_config.rw);
2315 		if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
2316 			fprintf(stderr, "Job '%s' has bad 'rw' value\n", config->name);
2317 			goto error;
2318 		} else if (!is_global && (int)config->rw == BDEVPERF_CONFIG_UNDEFINED) {
2319 			fprintf(stderr, "Job '%s' has no 'rw' assigned\n", config->name);
2320 			goto error;
2321 		}
2322 
2323 		if (is_global) {
2324 			config_set_cli_args(config);
2325 			global_config = *config;
2326 			free(config);
2327 			config = NULL;
2328 		} else {
2329 			TAILQ_INSERT_TAIL(&job_config_list, config, link);
2330 			n++;
2331 		}
2332 	}
2333 
2342 	printf("Using job config with %d jobs\n", n);
2343 	return 0;
2344 error:
2345 	free(config);
2346 	return 1;
2347 }
2348 
2349 static void
2350 bdevperf_run(void *arg1)
2351 {
2352 	uint32_t i;
2353 
2354 	g_main_thread = spdk_get_thread();
2355 
2356 	spdk_cpuset_zero(&g_all_cpuset);
2357 	SPDK_ENV_FOREACH_CORE(i) {
2358 		spdk_cpuset_set_cpu(&g_all_cpuset, i, true);
2359 	}
2360 
2361 	if (g_wait_for_tests) {
2362 		/* Do not perform any tests until RPC is received */
2363 		return;
2364 	}
2365 
2366 	bdevperf_construct_job_configs();
2367 }
2368 
2369 static void
2370 rpc_perform_tests_reset(void)
2371 {
2372 	/* Reset g_run_rc to 0 for the next test run. */
2373 	g_run_rc = 0;
2374 
2375 	/* Reset g_stats to 0 for the next test run. */
2376 	memset(&g_stats, 0, sizeof(g_stats));
2377 
2378 	/* Reset g_show_performance_period_num to 0 for the next test run. */
2379 	g_show_performance_period_num = 0;
2380 }
2381 
2382 static void
2383 rpc_perform_tests_cb(void)
2384 {
2385 	struct spdk_json_write_ctx *w;
2386 	struct spdk_jsonrpc_request *request = g_request;
2387 
2388 	g_request = NULL;
2389 
2390 	if (g_run_rc == 0) {
2391 		w = spdk_jsonrpc_begin_result(request);
2392 		spdk_json_write_uint32(w, g_run_rc);
2393 		spdk_jsonrpc_end_result(request, w);
2394 	} else {
2395 		spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
2396 						     "bdevperf failed with error %s", spdk_strerror(-g_run_rc));
2397 	}
2398 
2399 	rpc_perform_tests_reset();
2400 }
2401 
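/*
 * Handler for the "perform_tests" RPC, used together with the -z flag: the
 * app starts idle and each RPC triggers one full test run. Illustrative
 * usage (the path to rpc.py may differ):
 *
 *   ./bdevperf -z -q 32 -o 4096 -w randread -t 10 &
 *   ./scripts/rpc.py perform_tests
 */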
2402 static void
2403 rpc_perform_tests(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
2404 {
2405 	if (params != NULL) {
2406 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
2407 						 "perform_tests method requires no parameters");
2408 		return;
2409 	}
2410 	if (g_request != NULL) {
2411 		fprintf(stderr, "Another test is already in progress.\n");
2412 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
2413 						 spdk_strerror(-EINPROGRESS));
2414 		return;
2415 	}
2416 	g_request = request;
2417 
	/* Only construct job configs on the first test run. */
2419 	if (TAILQ_EMPTY(&job_config_list)) {
2420 		bdevperf_construct_job_configs();
2421 	} else {
2422 		bdevperf_construct_jobs();
2423 	}
2424 }
2425 SPDK_RPC_REGISTER("perform_tests", rpc_perform_tests, SPDK_RPC_RUNTIME)
2426 
2427 static void
2428 _bdevperf_job_drain(void *ctx)
2429 {
2430 	bdevperf_job_drain(ctx);
2431 }
2432 
2433 static void
2434 spdk_bdevperf_shutdown_cb(void)
2435 {
	struct bdevperf_job *job, *tmp;

	g_shutdown = true;
2438 
2439 	if (g_bdevperf.running_jobs == 0) {
2440 		bdevperf_test_done(NULL);
2441 		return;
2442 	}
2443 
2444 	/* Iterate jobs to stop all I/O */
2445 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, tmp) {
2446 		spdk_thread_send_msg(job->thread, _bdevperf_job_drain, job);
2447 	}
2448 }
2449 
2450 static int
2451 bdevperf_parse_arg(int ch, char *arg)
2452 {
2453 	long long tmp;
2454 
2455 	if (ch == 'w') {
2456 		g_workload_type = optarg;
2457 	} else if (ch == 'T') {
2458 		g_job_bdev_name = optarg;
2459 	} else if (ch == 'z') {
2460 		g_wait_for_tests = true;
2461 	} else if (ch == 'Z') {
2462 		g_zcopy = true;
2463 	} else if (ch == 'X') {
2464 		g_abort = true;
2465 	} else if (ch == 'C') {
2466 		g_multithread_mode = true;
2467 	} else if (ch == 'f') {
2468 		g_continue_on_failure = true;
2469 	} else if (ch == 'j') {
2470 		g_bdevperf_conf_file = optarg;
2471 	} else if (ch == 'F') {
2472 		char *endptr;
2473 
2474 		errno = 0;
2475 		g_zipf_theta = strtod(optarg, &endptr);
2476 		if (errno || optarg == endptr || g_zipf_theta < 0) {
2477 			fprintf(stderr, "Illegal zipf theta value %s\n", optarg);
2478 			return -EINVAL;
2479 		}
2480 	} else if (ch == 'l') {
2481 		g_latency_display_level++;
2482 	} else if (ch == 'D') {
2483 		g_random_map = true;
2484 	} else if (ch == 'E') {
2485 		g_one_thread_per_lcore = true;
2486 	} else if (ch == 'J') {
2487 		g_rpc_log_file_name = optarg;
2488 	} else {
2489 		tmp = spdk_strtoll(optarg, 10);
2490 		if (tmp < 0) {
2491 			fprintf(stderr, "Parse failed for the option %c.\n", ch);
			fprintf(stderr, "Failed to parse the value for option -%c.\n", ch);
			return tmp;
		} else if (tmp >= INT_MAX) {
			fprintf(stderr, "Value for option -%c is too large.\n", ch);
2496 		}
2497 
2498 		switch (ch) {
2499 		case 'q':
2500 			g_queue_depth = tmp;
2501 			break;
2502 		case 'o':
2503 			g_io_size = tmp;
2504 			break;
2505 		case 't':
2506 			g_time_in_sec = tmp;
2507 			break;
2508 		case 'k':
2509 			g_timeout_in_sec = tmp;
2510 			break;
2511 		case 'M':
2512 			g_rw_percentage = tmp;
2513 			g_mix_specified = true;
2514 			break;
2515 		case 'P':
2516 			g_show_performance_ema_period = tmp;
2517 			break;
2518 		case 'S':
2519 			g_show_performance_real_time = 1;
2520 			g_show_performance_period_in_usec = tmp * SPDK_SEC_TO_USEC;
2521 			break;
2522 		default:
2523 			return -EINVAL;
2524 		}
2525 	}
2526 	return 0;
2527 }
2528 
2529 static void
2530 bdevperf_usage(void)
2531 {
2532 	printf(" -q <depth>                io depth\n");
2533 	printf(" -o <size>                 io size in bytes\n");
2534 	printf(" -w <type>                 io pattern type, must be one of " PATTERN_TYPES_STR "\n");
2535 	printf(" -t <time>                 time in seconds\n");
2536 	printf(" -k <timeout>              timeout in seconds to detect starved I/O (default is 0 and disabled)\n");
2537 	printf(" -M <percent>              rwmixread (100 for reads, 0 for writes)\n");
	printf(" -P <num>                  number of moving average periods\n");
2539 	printf("\t\t(If set to n, show weighted mean of the previous n IO/s in real time)\n");
2540 	printf("\t\t(Formula: M = 2 / (n + 1), EMA[i+1] = IO/s * M + (1 - M) * EMA[i])\n");
2541 	printf("\t\t(only valid with -S)\n");
2542 	printf(" -S <period>               show performance result in real time every <period> seconds\n");
2543 	printf(" -T <bdev>                 bdev to run against. Default: all available bdevs.\n");
2544 	printf(" -f                        continue processing I/O even after failures\n");
2545 	printf(" -F <zipf theta>           use zipf distribution for random I/O\n");
2546 	printf(" -Z                        enable using zcopy bdev API for read or write I/O\n");
2547 	printf(" -z                        start bdevperf, but wait for RPC to start tests\n");
2548 	printf(" -X                        abort timed out I/O\n");
2549 	printf(" -C                        enable every core to send I/Os to each bdev\n");
2550 	printf(" -j <filename>             use job config file\n");
2551 	printf(" -l                        display latency histogram, default: disable. -l display summary, -ll display details\n");
2552 	printf(" -D                        use a random map for picking offsets not previously read or written (for all jobs)\n");
	printf(" -E                        share one thread per lcore among that core's jobs. Available only if -j is not used.\n");
	printf(" -J <filename>             log JSON-RPC calls to the given file, opened in append mode.\n");
2555 }
2556 
2557 static void
2558 bdevperf_fini(void)
2559 {
2560 	free_job_config();
2561 
2562 	if (g_rpc_log_file != NULL) {
2563 		fclose(g_rpc_log_file);
2564 		g_rpc_log_file = NULL;
2565 	}
2566 }
2567 
2568 static int
2569 verify_test_params(struct spdk_app_opts *opts)
2570 {
2571 	/* When RPC is used for starting tests and
2572 	 * no rpc_addr was configured for the app,
2573 	 * use the default address. */
2574 	if (g_wait_for_tests && opts->rpc_addr == NULL) {
2575 		opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR;
2576 	}
2577 
2578 	if (g_rpc_log_file != NULL) {
2579 		opts->rpc_log_file = g_rpc_log_file;
2580 	}
2581 
2582 	if (!g_bdevperf_conf_file && g_queue_depth <= 0) {
2583 		goto out;
2584 	}
2585 	if (!g_bdevperf_conf_file && g_io_size <= 0) {
2586 		goto out;
2587 	}
2588 	if (!g_bdevperf_conf_file && !g_workload_type) {
2589 		goto out;
2590 	}
2591 	if (g_bdevperf_conf_file && g_one_thread_per_lcore) {
		printf("Per-lcore threads (-E) cannot be used together with a bdevperf config file (-j)\n");
2593 		goto out;
2594 	}
2595 	if (g_time_in_sec <= 0) {
2596 		goto out;
2597 	}
2598 	g_time_in_usec = g_time_in_sec * SPDK_SEC_TO_USEC;
2599 
2600 	if (g_timeout_in_sec < 0) {
2601 		goto out;
2602 	}
2603 
2604 	if (g_abort && !g_timeout_in_sec) {
		printf("Timeout (-k) must be set to use the abort (-X) option; ignoring abort\n");
2606 	}
2607 
2608 	if (g_show_performance_ema_period > 0 &&
2609 	    g_show_performance_real_time == 0) {
2610 		fprintf(stderr, "-P option must be specified with -S option\n");
2611 		return 1;
2612 	}
2613 
2614 	if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
2615 		printf("I/O size of %d is greater than zero copy threshold (%d).\n",
2616 		       g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE);
2617 		printf("Zero copy mechanism will not be used.\n");
2618 		g_zcopy = false;
2619 	}
2620 
2621 	if (g_bdevperf_conf_file) {
2622 		/* workload_type verification happens during config file parsing */
2623 		return 0;
2624 	}
2625 
2626 	if (!strcmp(g_workload_type, "verify") ||
2627 	    !strcmp(g_workload_type, "reset")) {
2628 		g_rw_percentage = 50;
2629 		if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
			fprintf(stderr, "I/O size (%d provided) may not exceed %d for verify.\n",
				g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE);
2632 			return 1;
2633 		}
2634 		g_verify = true;
2635 		if (!strcmp(g_workload_type, "reset")) {
2636 			g_reset = true;
2637 		}
2638 	}
2639 
2640 	if (!strcmp(g_workload_type, "read") ||
2641 	    !strcmp(g_workload_type, "randread") ||
2642 	    !strcmp(g_workload_type, "write") ||
2643 	    !strcmp(g_workload_type, "randwrite") ||
2644 	    !strcmp(g_workload_type, "verify") ||
2645 	    !strcmp(g_workload_type, "reset") ||
2646 	    !strcmp(g_workload_type, "unmap") ||
2647 	    !strcmp(g_workload_type, "write_zeroes") ||
2648 	    !strcmp(g_workload_type, "flush")) {
2649 		if (g_mix_specified) {
2650 			fprintf(stderr, "Ignoring -M option... Please use -M option"
2651 				" only when using rw or randrw.\n");
2652 		}
2653 	}
2654 
2655 	if (!strcmp(g_workload_type, "rw") ||
2656 	    !strcmp(g_workload_type, "randrw")) {
2657 		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
2658 			fprintf(stderr,
				"-M must be set to a value from 0 to 100 "
2660 				"for rw or randrw.\n");
2661 			return 1;
2662 		}
2663 	}
2664 
2665 	if (strcmp(g_workload_type, "randread") &&
2666 	    strcmp(g_workload_type, "randwrite") &&
2667 	    strcmp(g_workload_type, "randrw")) {
2668 		if (g_random_map) {
			fprintf(stderr, "The -D (random map) option is only valid with"
				" randread, randwrite or randrw workloads.\n");
2671 			return 1;
2672 		}
2673 	}
2674 
2675 	return 0;
2676 out:
2677 	spdk_app_usage();
2678 	bdevperf_usage();
2679 	return 1;
2680 }
2681 
2682 int
2683 main(int argc, char **argv)
2684 {
2685 	struct spdk_app_opts opts = {};
2686 	int rc;
2687 
2688 	/* Use the runtime PID to set the random seed */
2689 	srand(getpid());
2690 
2691 	spdk_app_opts_init(&opts, sizeof(opts));
2692 	opts.name = "bdevperf";
2693 	opts.rpc_addr = NULL;
2694 	opts.shutdown_cb = spdk_bdevperf_shutdown_cb;
2695 
2696 	if ((rc = spdk_app_parse_args(argc, argv, &opts, "Zzfq:o:t:w:k:CEF:J:M:P:S:T:Xlj:D", NULL,
2697 				      bdevperf_parse_arg, bdevperf_usage)) !=
2698 	    SPDK_APP_PARSE_ARGS_SUCCESS) {
2699 		return rc;
2700 	}
2701 
2702 	if (read_job_config()) {
2703 		bdevperf_fini();
2704 		return 1;
2705 	}
2706 
2707 	if (verify_test_params(&opts) != 0) {
2708 		bdevperf_fini();
		return 1;
2710 	}
2711 
2712 	rc = spdk_app_start(&opts, bdevperf_run, NULL);
2713 
2714 	spdk_app_fini();
2715 	bdevperf_fini();
2716 	return rc;
2717 }
2718