xref: /spdk/examples/bdev/bdevperf/bdevperf.c (revision 588dfe314bb83d86effdf67ec42837b11c2620bf)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation.
3  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
4  *   All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 #include "spdk/accel.h"
11 #include "spdk/endian.h"
12 #include "spdk/env.h"
13 #include "spdk/event.h"
14 #include "spdk/log.h"
15 #include "spdk/util.h"
16 #include "spdk/thread.h"
17 #include "spdk/string.h"
18 #include "spdk/rpc.h"
19 #include "spdk/bit_array.h"
20 #include "spdk/conf.h"
21 #include "spdk/zipf.h"
22 
/* Limit and sentinel values for job-configuration handling. Their users are
 * not visible in this part of the file (presumably the job config parser --
 * TODO confirm). */
#define BDEVPERF_CONFIG_MAX_FILENAME 1024
#define BDEVPERF_CONFIG_UNDEFINED -1
#define BDEVPERF_CONFIG_ERROR -2
26 
/* One I/O slot of a job. Idle tasks sit on the owning job's task_list; a task
 * is taken off that list while its request is in flight. */
struct bdevperf_task {
	/* Describes buf for vectored writes and DIF/DIX generation. */
	struct iovec			iov;
	/* Job this task belongs to. */
	struct bdevperf_job		*job;
	/* bdev_io handed back by a zcopy start; consumed when the zcopy write is ended. */
	struct spdk_bdev_io		*bdev_io;
	/* Data buffer (released with spdk_free() when the test finishes). */
	void				*buf;
	/* Separate metadata buffer; the data helpers treat NULL as "metadata is
	 * interleaved inside buf". */
	void				*md_buf;
	/* Absolute starting block on the bdev of the current I/O. */
	uint64_t			offset_blocks;
	/* For abort requests: the task whose I/O is to be aborted. */
	struct bdevperf_task		*task_to_abort;
	/* Kind of request bdevperf_submit_task() issues for this task. */
	enum spdk_bdev_io_type		io_type;
	/* Linkage on bdevperf_job.task_list. */
	TAILQ_ENTRY(bdevperf_task)	link;
	/* Wait entry used to retry a submission that failed with -ENOMEM. */
	struct spdk_bdev_io_wait_entry	bdev_io_wait;
};
39 
/*
 * Global run configuration and state. The code that fills most of these in
 * (option/config parsing) is not part of this section of the file.
 */
static const char *g_workload_type = NULL;
static int g_io_size = 0;
/* initialize to invalid value so we can detect if user overrides it. */
static int g_rw_percentage = -1;
static bool g_verify = false;
static bool g_reset = false;
static bool g_continue_on_failure = false;
static bool g_abort = false;
/* Set when an I/O failure stops a job; every later completion then drains its job. */
static bool g_error_to_exit = false;
static int g_queue_depth = 0;
/* Test duration; used as the period of each job's run timer. */
static uint64_t g_time_in_usec;
static int g_show_performance_real_time = 0;
static uint64_t g_show_performance_period_in_usec = 1000000;
static uint64_t g_show_performance_period_num = 0;
static uint64_t g_show_performance_ema_period = 0;
/* Overall result; handed to spdk_app_stop() when the test is done. */
static int g_run_rc = 0;
static bool g_shutdown = false;
/* Tick count subtracted from spdk_get_ticks() to obtain elapsed run time. */
static uint64_t g_start_tsc;
static uint64_t g_shutdown_tsc;
/* Selects the zero-copy bdev APIs for reads and writes. */
static bool g_zcopy = false;
/* Thread that receives job-end and dump-done messages. */
static struct spdk_thread *g_main_thread;
static int g_time_in_sec = 0;
static bool g_mix_specified = false;
static const char *g_job_bdev_name;
static bool g_wait_for_tests = false;
/* When non-NULL (and not shutting down), rpc_perform_tests_cb() is called at test end. */
static struct spdk_jsonrpc_request *g_request = NULL;
static bool g_multithread_mode = false;
/* Passed to spdk_bdev_set_timeout() for every job. */
static int g_timeout_in_sec;
static struct spdk_conf *g_bdevperf_conf = NULL;
static const char *g_bdevperf_conf_file = NULL;
static double g_zipf_theta;

static struct spdk_cpuset g_all_cpuset;
/* Periodic statistics poller; unregistered at test end when real-time display is on. */
static struct spdk_poller *g_perf_timer = NULL;

static void bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task);
static void rpc_perform_tests_cb(void);
77 
/* Per-job state: one workload running against one bdev on one SPDK thread. */
struct bdevperf_job {
	/* Heap-allocated name printed in the result table; released with free(). */
	char				*name;
	struct spdk_bdev		*bdev;
	struct spdk_bdev_desc		*bdev_desc;
	struct spdk_io_channel		*ch;
	/* Linkage on g_bdevperf.jobs. */
	TAILQ_ENTRY(bdevperf_job)	link;
	/* Thread the job's I/O is submitted and completed on. */
	struct spdk_thread		*thread;

	const char			*workload_type;
	/* I/O size; the MiB/s figure is computed as IOPS * io_size / 2^20. */
	int				io_size;
	/* Share of reads in percent: 100 issues only reads, 0 only writes. */
	int				rw_percentage;
	bool				is_random;
	/* Write a generated pattern, read it back and compare. */
	bool				verify;
	/* Periodically reset the bdev while running the write/read-back flow. */
	bool				reset;
	bool				continue_on_failure;
	bool				unmap;
	bool				write_zeroes;
	bool				flush;
	/* Submit an abort for I/Os that hit the bdev timeout. */
	bool				abort;
	/* Number of I/Os submitted when the job starts. */
	int				queue_depth;
	/* State for rand_r(). */
	unsigned int			seed;

	uint64_t			io_completed;
	uint64_t			io_failed;
	uint64_t			io_timeout;
	/* io_completed at the previous EMA sample. */
	uint64_t			prev_io_completed;
	double				ema_io_per_second;
	/* I/Os currently in flight; the job ends when this reaches 0 while draining. */
	int				current_queue_depth;
	/* Size of the job's range, counted in I/O-sized slots. */
	uint64_t			size_in_ios;
	/* First slot of the job's range; added to slot indices to get absolute offsets. */
	uint64_t			ios_base;
	/* Next slot for sequential workloads (wraps at size_in_ios). */
	uint64_t			offset_in_ios;
	uint64_t			io_size_blocks;
	/* Size of each task's data buffer. */
	uint64_t			buf_size;
	uint32_t			dif_check_flags;
	/* Set once the job stops submitting new I/O and only waits for completions. */
	bool				is_draining;
	struct spdk_poller		*run_timer;
	struct spdk_poller		*reset_timer;
	/* In verify mode: one bit per slot that currently has an I/O in flight. */
	struct spdk_bit_array		*outstanding;
	/* When non-NULL, offsets are drawn from this Zipf generator. */
	struct spdk_zipf		*zipf;
	/* Free list of idle tasks. */
	TAILQ_HEAD(, bdevperf_task)	task_list;
	/* Elapsed time from g_start_tsc to the moment the job finished draining. */
	uint64_t			run_time_in_usec;
};
120 
/* Top-level container for all jobs of the current test run. */
struct spdk_bdevperf {
	/* Every job of the run; emptied when results are printed. */
	TAILQ_HEAD(, bdevperf_job)	jobs;
	/* Jobs that have not yet reported completion; the test is done at 0. */
	uint32_t			running_jobs;
};

/* The single bdevperf instance. */
static struct spdk_bdevperf g_bdevperf = {
	.jobs = TAILQ_HEAD_INITIALIZER(g_bdevperf.jobs),
	.running_jobs = 0,
};
130 
/* Workload kinds selectable for a job_config entry (stored in job_config.rw).
 * Presumably these map to the "rw" values of the job config file -- the
 * parser is not visible here, TODO confirm. */
enum job_config_rw {
	JOB_CONFIG_RW_READ = 0,
	JOB_CONFIG_RW_WRITE,
	JOB_CONFIG_RW_RANDREAD,
	JOB_CONFIG_RW_RANDWRITE,
	JOB_CONFIG_RW_RW,
	JOB_CONFIG_RW_RANDRW,
	JOB_CONFIG_RW_VERIFY,
	JOB_CONFIG_RW_RESET,
	JOB_CONFIG_RW_UNMAP,
	JOB_CONFIG_RW_FLUSH,
	JOB_CONFIG_RW_WRITE_ZEROES,
};
144 
/* Stores the values read from one section of the job config file. */
struct job_config {
	const char			*name;
	const char			*filename;
	struct spdk_cpuset		cpumask;
	/* NOTE(review): bs/iodepth/rwmixread look like fio-style option names
	 * (block size, queue depth, read percentage) -- confirm against the parser. */
	int				bs;
	int				iodepth;
	int				rwmixread;
	int64_t				offset;
	uint64_t			length;
	enum job_config_rw		rw;
	/* Linkage on job_config_list. */
	TAILQ_ENTRY(job_config)	link;
};

/* All parsed job_config entries; emptied by free_job_config(). */
TAILQ_HEAD(, job_config) job_config_list
	= TAILQ_HEAD_INITIALIZER(job_config_list);
161 
/* True while a periodic statistics dump is walking the job list; cleared by
 * _performance_dump_done(). */
static bool g_performance_dump_active = false;

/* Accumulator passed along while per-job results are printed. */
struct bdevperf_aggregate_stats {
	/* Job whose row is printed next during a periodic dump. */
	struct bdevperf_job		*current_job;
	/* Elapsed time the rates are computed over. */
	uint64_t			io_time_in_usec;
	/* 0 selects the cumulative average; otherwise the EMA period. */
	uint64_t			ema_period;
	/* Sums of the per-job figures, printed on the "Total" line. */
	double				total_io_per_second;
	double				total_mb_per_second;
	double				total_failed_per_second;
	double				total_timeout_per_second;
};

/* Totals used for the final report printed by bdevperf_test_done(). */
static struct bdevperf_aggregate_stats g_stats = {};
175 
/*
 * Cumulative Moving Average (CMA): average of all data up to the current point.
 * Exponential Moving Average (EMA): weighted mean of the previous n data points,
 * with more weight given to recent data.
 * Simple Moving Average (SMA): unweighted mean of the previous n data points.
 *
 * Bdevperf supports CMA and EMA.
 */
183 static double
184 get_cma_io_per_second(struct bdevperf_job *job, uint64_t io_time_in_usec)
185 {
186 	return (double)job->io_completed * 1000000 / io_time_in_usec;
187 }
188 
189 static double
190 get_ema_io_per_second(struct bdevperf_job *job, uint64_t ema_period)
191 {
192 	double io_completed, io_per_second;
193 
194 	io_completed = job->io_completed;
195 	io_per_second = (double)(io_completed - job->prev_io_completed) * 1000000
196 			/ g_show_performance_period_in_usec;
197 	job->prev_io_completed = io_completed;
198 
199 	job->ema_io_per_second += (io_per_second - job->ema_io_per_second) * 2
200 				  / (ema_period + 1);
201 	return job->ema_io_per_second;
202 }
203 
204 static void
205 performance_dump_job(struct bdevperf_aggregate_stats *stats, struct bdevperf_job *job)
206 {
207 	double io_per_second, mb_per_second, failed_per_second, timeout_per_second;
208 	uint64_t time_in_usec;
209 
210 	printf("\r Job: %s (Core Mask 0x%s)\n", spdk_thread_get_name(job->thread),
211 	       spdk_cpuset_fmt(spdk_thread_get_cpumask(job->thread)));
212 
213 	if (job->io_failed > 0 && !job->reset && !job->continue_on_failure) {
214 		printf("\r Job: %s ended in about %.2f seconds with error\n",
215 		       spdk_thread_get_name(job->thread), (double)job->run_time_in_usec / 1000000);
216 	}
217 	if (job->verify) {
218 		printf("\t Verification LBA range: start 0x%" PRIx64 " length 0x%" PRIx64 "\n",
219 		       job->ios_base, job->size_in_ios);
220 	}
221 
222 	if (g_performance_dump_active == true) {
223 		/* Use job's actual run time as Job has ended */
224 		if (job->io_failed > 0 && !job->continue_on_failure) {
225 			time_in_usec = job->run_time_in_usec;
226 		} else {
227 			time_in_usec = stats->io_time_in_usec;
228 		}
229 	} else {
230 		time_in_usec = job->run_time_in_usec;
231 	}
232 
233 	if (stats->ema_period == 0) {
234 		io_per_second = get_cma_io_per_second(job, time_in_usec);
235 	} else {
236 		io_per_second = get_ema_io_per_second(job, stats->ema_period);
237 	}
238 	mb_per_second = io_per_second * job->io_size / (1024 * 1024);
239 
240 	failed_per_second = (double)job->io_failed * 1000000 / time_in_usec;
241 	timeout_per_second = (double)job->io_timeout * 1000000 / time_in_usec;
242 
243 	printf("\t %-20s: %10.2f %10.2f %10.2f",
244 	       job->name, (float)time_in_usec / 1000000, io_per_second, mb_per_second);
245 	printf(" %10.2f %8.2f\n",
246 	       failed_per_second, timeout_per_second);
247 
248 	stats->total_io_per_second += io_per_second;
249 	stats->total_mb_per_second += mb_per_second;
250 	stats->total_failed_per_second += failed_per_second;
251 	stats->total_timeout_per_second += timeout_per_second;
252 }
253 
/*
 * Fill a buffer of num_blocks blocks with a deterministic test pattern.
 *
 * buf        - data buffer, at least num_blocks * block_size bytes
 * buf_len    - size of buf in bytes; nothing is written if it is too small
 * block_size - distance in bytes between consecutive blocks in buf
 * md_buf     - separate metadata buffer, or NULL when each block's metadata
 *              occupies the last md_size bytes of the block inside buf
 * md_size    - metadata bytes per block
 * num_blocks - number of blocks to generate
 *
 * The data area of block N is filled with 32-bit words whose value is
 * N + <byte offset of the word within the block>; the metadata of block N is
 * filled with the byte value N.
 *
 * Every block's data starts at N * block_size. (Previously the data cursor
 * advanced only by the data size, so with interleaved metadata block N+1's
 * pattern landed on top of block N's metadata.)
 */
static void
generate_data(void *buf, int buf_len, int block_size, void *md_buf, int md_size,
	      int num_blocks)
{
	char *data_base = buf;
	char *md_base;
	int data_block_size, md_stride;
	int blk, inner_offset;

	if (buf_len < num_blocks * block_size) {
		return;
	}

	if (md_buf == NULL) {
		/* Interleaved layout: metadata follows the data inside each block. */
		data_block_size = block_size - md_size;
		md_base = data_base + data_block_size;
		md_stride = block_size;
	} else {
		data_block_size = block_size;
		md_base = md_buf;
		md_stride = md_size;
	}

	for (blk = 0; blk < num_blocks; blk++) {
		char *block_data = data_base + (size_t)blk * block_size;

		for (inner_offset = 0; inner_offset < data_block_size;
		     inner_offset += (int)sizeof(uint32_t)) {
			uint32_t word = (uint32_t)(blk + inner_offset);
			size_t len = sizeof(word);

			/* Never write past the data area of the block. */
			if ((size_t)(data_block_size - inner_offset) < len) {
				len = (size_t)(data_block_size - inner_offset);
			}
			memcpy(block_data + inner_offset, &word, len);
		}

		if (md_size > 0) {
			memset(md_base + (size_t)blk * md_stride, blk, md_size);
		}
	}
}
285 
/*
 * Copy num_blocks blocks of data (and, when separate metadata buffers are
 * given, their metadata) from the rd_* buffers into the wr_* buffers.
 *
 * Returns false without copying when either data buffer is smaller than
 * num_blocks * block_size bytes, true otherwise. The two metadata pointers
 * must be both NULL or both non-NULL.
 */
static bool
copy_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	  void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks)
{
	const int data_bytes = num_blocks * block_size;

	if (wr_buf_len < data_bytes) {
		return false;
	}
	if (rd_buf_len < data_bytes) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	memcpy(wr_buf, rd_buf, data_bytes);

	if (wr_md_buf == NULL) {
		return true;
	}

	memcpy(wr_md_buf, rd_md_buf, md_size * num_blocks);
	return true;
}
304 
/*
 * Compare num_blocks blocks of written data against the data read back.
 *
 * When the metadata pointers are NULL, metadata is taken to be interleaved:
 * each block consists of (block_size - md_size) data bytes followed by
 * md_size metadata bytes. Otherwise the whole block is data and metadata
 * lives in the separate buffers, md_size bytes per block.
 *
 * Data is always compared; metadata is compared only when md_check is true.
 * Returns false if either data buffer is shorter than
 * num_blocks * block_size bytes or on the first mismatch, true otherwise.
 */
static bool
verify_data(void *wr_buf, int wr_buf_len, void *rd_buf, int rd_buf_len, int block_size,
	    void *wr_md_buf, void *rd_md_buf, int md_size, int num_blocks, bool md_check)
{
	const char *wr_data = wr_buf;
	const char *rd_data = rd_buf;
	const char *wr_md, *rd_md;
	int data_len, md_stride, blk;

	if (wr_buf_len < num_blocks * block_size || rd_buf_len < num_blocks * block_size) {
		return false;
	}

	assert((wr_md_buf != NULL) == (rd_md_buf != NULL));

	if (wr_md_buf != NULL) {
		data_len = block_size;
		wr_md = wr_md_buf;
		rd_md = rd_md_buf;
		md_stride = md_size;
	} else {
		data_len = block_size - md_size;
		wr_md = wr_data + data_len;
		rd_md = rd_data + data_len;
		md_stride = block_size;
	}

	for (blk = 0; blk < num_blocks; blk++) {
		if (memcmp(wr_data, rd_data, data_len) != 0) {
			return false;
		}
		wr_data += block_size;
		rd_data += block_size;

		if (!md_check) {
			continue;
		}

		if (memcmp(wr_md, rd_md, md_size) != 0) {
			return false;
		}
		wr_md += md_stride;
		rd_md += md_stride;
	}

	return true;
}
349 
350 static void
351 free_job_config(void)
352 {
353 	struct job_config *config, *tmp;
354 
355 	spdk_conf_free(g_bdevperf_conf);
356 	g_bdevperf_conf = NULL;
357 
358 	TAILQ_FOREACH_SAFE(config, &job_config_list, link, tmp) {
359 		TAILQ_REMOVE(&job_config_list, config, link);
360 		free(config);
361 	}
362 }
363 
364 static void
365 bdevperf_job_free(struct bdevperf_job *job)
366 {
367 	spdk_bit_array_free(&job->outstanding);
368 	spdk_zipf_free(&job->zipf);
369 	free(job->name);
370 	free(job);
371 }
372 
/* Message handler: asks the SPDK thread it runs on to exit. ctx is unused. */
static void
job_thread_exit(void *ctx)
{
	struct spdk_thread *self = spdk_get_thread();

	spdk_thread_exit(self);
}
378 
379 static void
380 bdevperf_test_done(void *ctx)
381 {
382 	struct bdevperf_job *job, *jtmp;
383 	struct bdevperf_task *task, *ttmp;
384 	int rc;
385 	uint64_t time_in_usec;
386 
387 	if (g_time_in_usec) {
388 		g_stats.io_time_in_usec = g_time_in_usec;
389 
390 		if (!g_run_rc && g_performance_dump_active) {
391 			spdk_thread_send_msg(spdk_get_thread(), bdevperf_test_done, NULL);
392 			return;
393 		}
394 	}
395 
396 	if (g_show_performance_real_time) {
397 		spdk_poller_unregister(&g_perf_timer);
398 	}
399 
400 	if (g_shutdown) {
401 		g_shutdown_tsc = spdk_get_ticks() - g_start_tsc;
402 		time_in_usec = g_shutdown_tsc * 1000000 / spdk_get_ticks_hz();
403 		g_time_in_usec = (g_time_in_usec > time_in_usec) ? time_in_usec : g_time_in_usec;
404 		printf("Received shutdown signal, test time was about %.6f seconds\n",
405 		       (double)g_time_in_usec / 1000000);
406 	}
407 
408 	printf("\n\r %-*s: %10s %10s %10s %10s %8s\n",
409 	       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s");
410 
411 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, jtmp) {
412 		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
413 
414 		performance_dump_job(&g_stats, job);
415 
416 		spdk_thread_send_msg(job->thread, job_thread_exit, NULL);
417 
418 		TAILQ_FOREACH_SAFE(task, &job->task_list, link, ttmp) {
419 			TAILQ_REMOVE(&job->task_list, task, link);
420 			spdk_free(task->buf);
421 			spdk_free(task->md_buf);
422 			free(task);
423 		}
424 
425 		bdevperf_job_free(job);
426 	}
427 
428 	printf("\r ==================================================================================\n");
429 	printf("\r %-28s: %10s %10.2f %10.2f",
430 	       "Total", "", g_stats.total_io_per_second, g_stats.total_mb_per_second);
431 	printf(" %10.2f %8.2f\n",
432 	       g_stats.total_failed_per_second, g_stats.total_timeout_per_second);
433 	fflush(stdout);
434 
435 	rc = g_run_rc;
436 	if (g_request && !g_shutdown) {
437 		rpc_perform_tests_cb();
438 		if (rc != 0) {
439 			spdk_app_stop(rc);
440 		}
441 	} else {
442 		spdk_app_stop(rc);
443 	}
444 }
445 
446 static void
447 bdevperf_job_end(void *ctx)
448 {
449 	assert(g_main_thread == spdk_get_thread());
450 
451 	if (--g_bdevperf.running_jobs == 0) {
452 		bdevperf_test_done(NULL);
453 	}
454 }
455 
456 static void
457 bdevperf_end_task(struct bdevperf_task *task)
458 {
459 	struct bdevperf_job     *job = task->job;
460 	uint64_t		end_tsc = 0;
461 
462 	TAILQ_INSERT_TAIL(&job->task_list, task, link);
463 	if (job->is_draining) {
464 		if (job->current_queue_depth == 0) {
465 			end_tsc = spdk_get_ticks() - g_start_tsc;
466 			job->run_time_in_usec = end_tsc * 1000000 / spdk_get_ticks_hz();
467 			spdk_put_io_channel(job->ch);
468 			spdk_bdev_close(job->bdev_desc);
469 			spdk_thread_send_msg(g_main_thread, bdevperf_job_end, NULL);
470 		}
471 	}
472 }
473 
474 static void
475 bdevperf_queue_io_wait_with_cb(struct bdevperf_task *task, spdk_bdev_io_wait_cb cb_fn)
476 {
477 	struct bdevperf_job	*job = task->job;
478 
479 	task->bdev_io_wait.bdev = job->bdev;
480 	task->bdev_io_wait.cb_fn = cb_fn;
481 	task->bdev_io_wait.cb_arg = task;
482 	spdk_bdev_queue_io_wait(job->bdev, job->ch, &task->bdev_io_wait);
483 }
484 
485 static int
486 bdevperf_job_drain(void *ctx)
487 {
488 	struct bdevperf_job *job = ctx;
489 
490 	spdk_poller_unregister(&job->run_timer);
491 	if (job->reset) {
492 		spdk_poller_unregister(&job->reset_timer);
493 	}
494 
495 	job->is_draining = true;
496 
497 	return -1;
498 }
499 
500 static void
501 bdevperf_abort_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
502 {
503 	struct bdevperf_task	*task = cb_arg;
504 	struct bdevperf_job	*job = task->job;
505 
506 	job->current_queue_depth--;
507 
508 	if (success) {
509 		job->io_completed++;
510 	} else {
511 		job->io_failed++;
512 		if (!job->continue_on_failure) {
513 			bdevperf_job_drain(job);
514 			g_run_rc = -1;
515 		}
516 	}
517 
518 	spdk_bdev_free_io(bdev_io);
519 	bdevperf_end_task(task);
520 }
521 
522 static int
523 bdevperf_verify_dif(struct bdevperf_task *task, struct iovec *iovs, int iovcnt)
524 {
525 	struct bdevperf_job	*job = task->job;
526 	struct spdk_bdev	*bdev = job->bdev;
527 	struct spdk_dif_ctx	dif_ctx;
528 	struct spdk_dif_error	err_blk = {};
529 	int			rc;
530 
531 	rc = spdk_dif_ctx_init(&dif_ctx,
532 			       spdk_bdev_get_block_size(bdev),
533 			       spdk_bdev_get_md_size(bdev),
534 			       spdk_bdev_is_md_interleaved(bdev),
535 			       spdk_bdev_is_dif_head_of_md(bdev),
536 			       spdk_bdev_get_dif_type(bdev),
537 			       job->dif_check_flags,
538 			       task->offset_blocks, 0, 0, 0, 0);
539 	if (rc != 0) {
540 		fprintf(stderr, "Initialization of DIF context failed\n");
541 		return rc;
542 	}
543 
544 	if (spdk_bdev_is_md_interleaved(bdev)) {
545 		rc = spdk_dif_verify(iovs, iovcnt, job->io_size_blocks, &dif_ctx, &err_blk);
546 	} else {
547 		struct iovec md_iov = {
548 			.iov_base	= task->md_buf,
549 			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
550 		};
551 
552 		rc = spdk_dix_verify(iovs, iovcnt, &md_iov, job->io_size_blocks, &dif_ctx, &err_blk);
553 	}
554 
555 	if (rc != 0) {
556 		fprintf(stderr, "DIF/DIX error detected. type=%d, offset=%" PRIu32 "\n",
557 			err_blk.err_type, err_blk.err_offset);
558 	}
559 
560 	return rc;
561 }
562 
563 static void
564 bdevperf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
565 {
566 	struct bdevperf_job	*job;
567 	struct bdevperf_task	*task = cb_arg;
568 	struct iovec		*iovs;
569 	int			iovcnt;
570 	bool			md_check;
571 	uint64_t		offset_in_ios;
572 	int			rc;
573 
574 	job = task->job;
575 	md_check = spdk_bdev_get_dif_type(job->bdev) == SPDK_DIF_DISABLE;
576 
577 	if (g_error_to_exit == true) {
578 		bdevperf_job_drain(job);
579 	} else if (!success) {
580 		if (!job->reset && !job->continue_on_failure) {
581 			bdevperf_job_drain(job);
582 			g_run_rc = -1;
583 			g_error_to_exit = true;
584 			printf("task offset: %" PRIu64 " on job bdev=%s fails\n",
585 			       task->offset_blocks, job->name);
586 		}
587 	} else if (job->verify || job->reset) {
588 		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
589 		assert(iovcnt == 1);
590 		assert(iovs != NULL);
591 		if (!verify_data(task->buf, job->buf_size, iovs[0].iov_base, iovs[0].iov_len,
592 				 spdk_bdev_get_block_size(job->bdev),
593 				 task->md_buf, spdk_bdev_io_get_md_buf(bdev_io),
594 				 spdk_bdev_get_md_size(job->bdev),
595 				 job->io_size_blocks, md_check)) {
596 			printf("Buffer mismatch! Target: %s Disk Offset: %" PRIu64 "\n", job->name, task->offset_blocks);
597 			printf("   First dword expected 0x%x got 0x%x\n", *(int *)task->buf, *(int *)iovs[0].iov_base);
598 			bdevperf_job_drain(job);
599 			g_run_rc = -1;
600 		}
601 	} else if (job->dif_check_flags != 0) {
602 		if (task->io_type == SPDK_BDEV_IO_TYPE_READ && spdk_bdev_get_md_size(job->bdev) != 0) {
603 			spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
604 			assert(iovcnt == 1);
605 			assert(iovs != NULL);
606 			rc = bdevperf_verify_dif(task, iovs, iovcnt);
607 			if (rc != 0) {
608 				printf("DIF error detected. task offset: %" PRIu64 " on job bdev=%s\n",
609 				       task->offset_blocks, job->name);
610 
611 				success = false;
612 				if (!job->reset && !job->continue_on_failure) {
613 					bdevperf_job_drain(job);
614 					g_run_rc = -1;
615 					g_error_to_exit = true;
616 				}
617 			}
618 		}
619 	}
620 
621 	job->current_queue_depth--;
622 
623 	if (success) {
624 		job->io_completed++;
625 	} else {
626 		job->io_failed++;
627 	}
628 
629 	if (job->verify) {
630 		assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
631 		offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;
632 
633 		assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
634 		spdk_bit_array_clear(job->outstanding, offset_in_ios);
635 	}
636 
637 	spdk_bdev_free_io(bdev_io);
638 
639 	/*
640 	 * is_draining indicates when time has expired for the test run
641 	 * and we are just waiting for the previously submitted I/O
642 	 * to complete.  In this case, do not submit a new I/O to replace
643 	 * the one just completed.
644 	 */
645 	if (!job->is_draining) {
646 		bdevperf_submit_single(job, task);
647 	} else {
648 		bdevperf_end_task(task);
649 	}
650 }
651 
652 static void
653 bdevperf_verify_submit_read(void *cb_arg)
654 {
655 	struct bdevperf_job	*job;
656 	struct bdevperf_task	*task = cb_arg;
657 	int			rc;
658 
659 	job = task->job;
660 
661 	/* Read the data back in */
662 	rc = spdk_bdev_read_blocks_with_md(job->bdev_desc, job->ch, NULL, NULL,
663 					   task->offset_blocks, job->io_size_blocks,
664 					   bdevperf_complete, task);
665 
666 	if (rc == -ENOMEM) {
667 		bdevperf_queue_io_wait_with_cb(task, bdevperf_verify_submit_read);
668 	} else if (rc != 0) {
669 		printf("Failed to submit read: %d\n", rc);
670 		bdevperf_job_drain(job);
671 		g_run_rc = rc;
672 	}
673 }
674 
/* Completion callback for the write of a write/read-back pair: a failed write
 * is handled by the common completion path, a successful one is followed by
 * the read-back. */
static void
bdevperf_verify_write_complete(struct spdk_bdev_io *bdev_io, bool success,
			       void *cb_arg)
{
	if (!success) {
		bdevperf_complete(bdev_io, success, cb_arg);
		return;
	}

	spdk_bdev_free_io(bdev_io);
	bdevperf_verify_submit_read(cb_arg);
}
686 
687 static void
688 bdevperf_zcopy_populate_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
689 {
690 	if (!success) {
691 		bdevperf_complete(bdev_io, success, cb_arg);
692 		return;
693 	}
694 
695 	spdk_bdev_zcopy_end(bdev_io, false, bdevperf_complete, cb_arg);
696 }
697 
698 static int
699 bdevperf_generate_dif(struct bdevperf_task *task)
700 {
701 	struct bdevperf_job	*job = task->job;
702 	struct spdk_bdev	*bdev = job->bdev;
703 	struct spdk_dif_ctx	dif_ctx;
704 	int			rc;
705 
706 	rc = spdk_dif_ctx_init(&dif_ctx,
707 			       spdk_bdev_get_block_size(bdev),
708 			       spdk_bdev_get_md_size(bdev),
709 			       spdk_bdev_is_md_interleaved(bdev),
710 			       spdk_bdev_is_dif_head_of_md(bdev),
711 			       spdk_bdev_get_dif_type(bdev),
712 			       job->dif_check_flags,
713 			       task->offset_blocks, 0, 0, 0, 0);
714 	if (rc != 0) {
715 		fprintf(stderr, "Initialization of DIF context failed\n");
716 		return rc;
717 	}
718 
719 	if (spdk_bdev_is_md_interleaved(bdev)) {
720 		rc = spdk_dif_generate(&task->iov, 1, job->io_size_blocks, &dif_ctx);
721 	} else {
722 		struct iovec md_iov = {
723 			.iov_base	= task->md_buf,
724 			.iov_len	= spdk_bdev_get_md_size(bdev) * job->io_size_blocks,
725 		};
726 
727 		rc = spdk_dix_generate(&task->iov, 1, &md_iov, job->io_size_blocks, &dif_ctx);
728 	}
729 
730 	if (rc != 0) {
731 		fprintf(stderr, "Generation of DIF/DIX failed\n");
732 	}
733 
734 	return rc;
735 }
736 
737 static void
738 bdevperf_submit_task(void *arg)
739 {
740 	struct bdevperf_task	*task = arg;
741 	struct bdevperf_job	*job = task->job;
742 	struct spdk_bdev_desc	*desc;
743 	struct spdk_io_channel	*ch;
744 	spdk_bdev_io_completion_cb cb_fn;
745 	uint64_t		offset_in_ios;
746 	int			rc = 0;
747 
748 	desc = job->bdev_desc;
749 	ch = job->ch;
750 
751 	switch (task->io_type) {
752 	case SPDK_BDEV_IO_TYPE_WRITE:
753 		if (spdk_bdev_get_md_size(job->bdev) != 0 && job->dif_check_flags != 0) {
754 			rc = bdevperf_generate_dif(task);
755 		}
756 		if (rc == 0) {
757 			cb_fn = (job->verify || job->reset) ? bdevperf_verify_write_complete : bdevperf_complete;
758 
759 			if (g_zcopy) {
760 				spdk_bdev_zcopy_end(task->bdev_io, true, cb_fn, task);
761 				return;
762 			} else {
763 				rc = spdk_bdev_writev_blocks_with_md(desc, ch, &task->iov, 1,
764 								     task->md_buf,
765 								     task->offset_blocks,
766 								     job->io_size_blocks,
767 								     cb_fn, task);
768 			}
769 		}
770 		break;
771 	case SPDK_BDEV_IO_TYPE_FLUSH:
772 		rc = spdk_bdev_flush_blocks(desc, ch, task->offset_blocks,
773 					    job->io_size_blocks, bdevperf_complete, task);
774 		break;
775 	case SPDK_BDEV_IO_TYPE_UNMAP:
776 		rc = spdk_bdev_unmap_blocks(desc, ch, task->offset_blocks,
777 					    job->io_size_blocks, bdevperf_complete, task);
778 		break;
779 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
780 		rc = spdk_bdev_write_zeroes_blocks(desc, ch, task->offset_blocks,
781 						   job->io_size_blocks, bdevperf_complete, task);
782 		break;
783 	case SPDK_BDEV_IO_TYPE_READ:
784 		if (g_zcopy) {
785 			rc = spdk_bdev_zcopy_start(desc, ch, NULL, 0, task->offset_blocks, job->io_size_blocks,
786 						   true, bdevperf_zcopy_populate_complete, task);
787 		} else {
788 			rc = spdk_bdev_read_blocks_with_md(desc, ch, task->buf, task->md_buf,
789 							   task->offset_blocks,
790 							   job->io_size_blocks,
791 							   bdevperf_complete, task);
792 		}
793 		break;
794 	case SPDK_BDEV_IO_TYPE_ABORT:
795 		rc = spdk_bdev_abort(desc, ch, task->task_to_abort, bdevperf_abort_complete, task);
796 		break;
797 	default:
798 		assert(false);
799 		rc = -EINVAL;
800 		break;
801 	}
802 
803 	if (rc == -ENOMEM) {
804 		bdevperf_queue_io_wait_with_cb(task, bdevperf_submit_task);
805 		return;
806 	} else if (rc != 0) {
807 		printf("Failed to submit bdev_io: %d\n", rc);
808 		if (job->verify) {
809 			assert(task->offset_blocks / job->io_size_blocks >= job->ios_base);
810 			offset_in_ios = task->offset_blocks / job->io_size_blocks - job->ios_base;
811 
812 			assert(spdk_bit_array_get(job->outstanding, offset_in_ios) == true);
813 			spdk_bit_array_clear(job->outstanding, offset_in_ios);
814 		}
815 		bdevperf_job_drain(job);
816 		g_run_rc = rc;
817 		return;
818 	}
819 
820 	job->current_queue_depth++;
821 }
822 
823 static void
824 bdevperf_zcopy_get_buf_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
825 {
826 	struct bdevperf_task	*task = cb_arg;
827 	struct bdevperf_job	*job = task->job;
828 	struct iovec		*iovs;
829 	int			iovcnt;
830 
831 	if (!success) {
832 		bdevperf_job_drain(job);
833 		g_run_rc = -1;
834 		return;
835 	}
836 
837 	task->bdev_io = bdev_io;
838 	task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
839 
840 	if (job->verify || job->reset) {
841 		/* When job->verify or job->reset is enabled, task->buf is used for
842 		 *  verification of read after write.  For write I/O, when zcopy APIs
843 		 *  are used, task->buf cannot be used, and data must be written to
844 		 *  the data buffer allocated underneath bdev layer instead.
845 		 *  Hence we copy task->buf to the allocated data buffer here.
846 		 */
847 		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
848 		assert(iovcnt == 1);
849 		assert(iovs != NULL);
850 
851 		copy_data(iovs[0].iov_base, iovs[0].iov_len, task->buf, job->buf_size,
852 			  spdk_bdev_get_block_size(job->bdev),
853 			  spdk_bdev_io_get_md_buf(bdev_io), task->md_buf,
854 			  spdk_bdev_get_md_size(job->bdev), job->io_size_blocks);
855 	}
856 
857 	bdevperf_submit_task(task);
858 }
859 
860 static void
861 bdevperf_prep_zcopy_write_task(void *arg)
862 {
863 	struct bdevperf_task	*task = arg;
864 	struct bdevperf_job	*job = task->job;
865 	int			rc;
866 
867 	rc = spdk_bdev_zcopy_start(job->bdev_desc, job->ch, NULL, 0,
868 				   task->offset_blocks, job->io_size_blocks,
869 				   false, bdevperf_zcopy_get_buf_complete, task);
870 	if (rc != 0) {
871 		assert(rc == -ENOMEM);
872 		bdevperf_queue_io_wait_with_cb(task, bdevperf_prep_zcopy_write_task);
873 		return;
874 	}
875 
876 	job->current_queue_depth++;
877 }
878 
879 static struct bdevperf_task *
880 bdevperf_job_get_task(struct bdevperf_job *job)
881 {
882 	struct bdevperf_task *task;
883 
884 	task = TAILQ_FIRST(&job->task_list);
885 	if (!task) {
886 		printf("Task allocation failed\n");
887 		abort();
888 	}
889 
890 	TAILQ_REMOVE(&job->task_list, task, link);
891 	return task;
892 }
893 
894 static void
895 bdevperf_submit_single(struct bdevperf_job *job, struct bdevperf_task *task)
896 {
897 	uint64_t offset_in_ios;
898 
899 	if (job->zipf) {
900 		offset_in_ios = spdk_zipf_generate(job->zipf);
901 	} else if (job->is_random) {
902 		offset_in_ios = rand_r(&job->seed) % job->size_in_ios;
903 	} else {
904 		offset_in_ios = job->offset_in_ios++;
905 		if (job->offset_in_ios == job->size_in_ios) {
906 			job->offset_in_ios = 0;
907 		}
908 
909 		/* Increment of offset_in_ios if there's already an outstanding IO
910 		 * to that location. We only need this with job->verify as random
911 		 * offsets are not supported with job->verify at this time.
912 		 */
913 		if (job->verify) {
914 			assert(spdk_bit_array_find_first_clear(job->outstanding, 0) != UINT32_MAX);
915 
916 			while (spdk_bit_array_get(job->outstanding, offset_in_ios)) {
917 				offset_in_ios = job->offset_in_ios++;
918 				if (job->offset_in_ios == job->size_in_ios) {
919 					job->offset_in_ios = 0;
920 				}
921 			}
922 			spdk_bit_array_set(job->outstanding, offset_in_ios);
923 		}
924 	}
925 
926 	/* For multi-thread to same job, offset_in_ios is relative
927 	 * to the LBA range assigned for that job. job->offset_blocks
928 	 * is absolute (entire bdev LBA range).
929 	 */
930 	task->offset_blocks = (offset_in_ios + job->ios_base) * job->io_size_blocks;
931 
932 	if (job->verify || job->reset) {
933 		generate_data(task->buf, job->buf_size,
934 			      spdk_bdev_get_block_size(job->bdev),
935 			      task->md_buf, spdk_bdev_get_md_size(job->bdev),
936 			      job->io_size_blocks);
937 		if (g_zcopy) {
938 			bdevperf_prep_zcopy_write_task(task);
939 			return;
940 		} else {
941 			task->iov.iov_base = task->buf;
942 			task->iov.iov_len = job->buf_size;
943 			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
944 		}
945 	} else if (job->flush) {
946 		task->io_type = SPDK_BDEV_IO_TYPE_FLUSH;
947 	} else if (job->unmap) {
948 		task->io_type = SPDK_BDEV_IO_TYPE_UNMAP;
949 	} else if (job->write_zeroes) {
950 		task->io_type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
951 	} else if ((job->rw_percentage == 100) ||
952 		   (job->rw_percentage != 0 && ((rand_r(&job->seed) % 100) < job->rw_percentage))) {
953 		task->io_type = SPDK_BDEV_IO_TYPE_READ;
954 	} else {
955 		if (g_zcopy) {
956 			bdevperf_prep_zcopy_write_task(task);
957 			return;
958 		} else {
959 			task->iov.iov_base = task->buf;
960 			task->iov.iov_len = job->buf_size;
961 			task->io_type = SPDK_BDEV_IO_TYPE_WRITE;
962 		}
963 	}
964 
965 	bdevperf_submit_task(task);
966 }
967 
968 static int reset_job(void *arg);
969 
970 static void
971 reset_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
972 {
973 	struct bdevperf_task	*task = cb_arg;
974 	struct bdevperf_job	*job = task->job;
975 
976 	if (!success) {
977 		printf("Reset blockdev=%s failed\n", spdk_bdev_get_name(job->bdev));
978 		bdevperf_job_drain(job);
979 		g_run_rc = -1;
980 	}
981 
982 	TAILQ_INSERT_TAIL(&job->task_list, task, link);
983 	spdk_bdev_free_io(bdev_io);
984 
985 	job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
986 						10 * 1000000);
987 }
988 
989 static int
990 reset_job(void *arg)
991 {
992 	struct bdevperf_job *job = arg;
993 	struct bdevperf_task *task;
994 	int rc;
995 
996 	spdk_poller_unregister(&job->reset_timer);
997 
998 	/* Do reset. */
999 	task = bdevperf_job_get_task(job);
1000 	rc = spdk_bdev_reset(job->bdev_desc, job->ch,
1001 			     reset_cb, task);
1002 	if (rc) {
1003 		printf("Reset failed: %d\n", rc);
1004 		bdevperf_job_drain(job);
1005 		g_run_rc = -1;
1006 	}
1007 
1008 	return -1;
1009 }
1010 
1011 static void
1012 bdevperf_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
1013 {
1014 	struct bdevperf_job *job = cb_arg;
1015 	struct bdevperf_task *task;
1016 
1017 	job->io_timeout++;
1018 
1019 	if (job->is_draining || !job->abort ||
1020 	    !spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
1021 		return;
1022 	}
1023 
1024 	task = bdevperf_job_get_task(job);
1025 	if (task == NULL) {
1026 		return;
1027 	}
1028 
1029 	task->task_to_abort = spdk_bdev_io_get_cb_arg(bdev_io);
1030 	task->io_type = SPDK_BDEV_IO_TYPE_ABORT;
1031 
1032 	bdevperf_submit_task(task);
1033 }
1034 
1035 static void
1036 bdevperf_job_run(void *ctx)
1037 {
1038 	struct bdevperf_job *job = ctx;
1039 	struct bdevperf_task *task;
1040 	int i;
1041 
1042 	/* Submit initial I/O for this job. Each time one
1043 	 * completes, another will be submitted. */
1044 
1045 	/* Start a timer to stop this I/O chain when the run is over */
1046 	job->run_timer = SPDK_POLLER_REGISTER(bdevperf_job_drain, job, g_time_in_usec);
1047 	if (job->reset) {
1048 		job->reset_timer = SPDK_POLLER_REGISTER(reset_job, job,
1049 							10 * 1000000);
1050 	}
1051 
1052 	spdk_bdev_set_timeout(job->bdev_desc, g_timeout_in_sec, bdevperf_timeout_cb, job);
1053 
1054 	for (i = 0; i < job->queue_depth; i++) {
1055 		task = bdevperf_job_get_task(job);
1056 		bdevperf_submit_single(job, task);
1057 	}
1058 }
1059 
1060 static void
1061 _performance_dump_done(void *ctx)
1062 {
1063 	struct bdevperf_aggregate_stats *stats = ctx;
1064 
1065 	printf("\r ==================================================================================\n");
1066 	printf("\r %-28s: %10s %10.2f %10.2f",
1067 	       "Total", "", stats->total_io_per_second, stats->total_mb_per_second);
1068 	printf(" %10.2f %8.2f\n",
1069 	       stats->total_failed_per_second, stats->total_timeout_per_second);
1070 	fflush(stdout);
1071 
1072 	g_performance_dump_active = false;
1073 
1074 	free(stats);
1075 }
1076 
1077 static void
1078 _performance_dump(void *ctx)
1079 {
1080 	struct bdevperf_aggregate_stats *stats = ctx;
1081 
1082 	performance_dump_job(stats, stats->current_job);
1083 
1084 	/* This assumes the jobs list is static after start up time.
1085 	 * That's true right now, but if that ever changed this would need a lock. */
1086 	stats->current_job = TAILQ_NEXT(stats->current_job, link);
1087 	if (stats->current_job == NULL) {
1088 		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
1089 	} else {
1090 		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
1091 	}
1092 }
1093 
1094 static int
1095 performance_statistics_thread(void *arg)
1096 {
1097 	struct bdevperf_aggregate_stats *stats;
1098 
1099 	if (g_performance_dump_active) {
1100 		return -1;
1101 	}
1102 
1103 	g_performance_dump_active = true;
1104 
1105 	stats = calloc(1, sizeof(*stats));
1106 	if (stats == NULL) {
1107 		return -1;
1108 	}
1109 
1110 	g_show_performance_period_num++;
1111 
1112 	stats->io_time_in_usec = g_show_performance_period_num * g_show_performance_period_in_usec;
1113 	stats->ema_period = g_show_performance_ema_period;
1114 
1115 	/* Iterate all of the jobs to gather stats
1116 	 * These jobs will not get removed here until a final performance dump is run,
1117 	 * so this should be safe without locking.
1118 	 */
1119 	stats->current_job = TAILQ_FIRST(&g_bdevperf.jobs);
1120 	if (stats->current_job == NULL) {
1121 		spdk_thread_send_msg(g_main_thread, _performance_dump_done, stats);
1122 	} else {
1123 		spdk_thread_send_msg(stats->current_job->thread, _performance_dump, stats);
1124 	}
1125 
1126 	return -1;
1127 }
1128 
1129 static void
1130 bdevperf_test(void)
1131 {
1132 	struct bdevperf_job *job;
1133 
1134 	printf("Running I/O for %" PRIu64 " seconds...\n", g_time_in_usec / 1000000);
1135 	fflush(stdout);
1136 
1137 	/* Start a timer to dump performance numbers */
1138 	g_start_tsc = spdk_get_ticks();
1139 	if (g_show_performance_real_time && !g_perf_timer) {
1140 		printf("\r %-*s: %10s %10s %10s %10s %8s\n",
1141 		       28, "Device Information", "runtime(s)", "IOPS", "MiB/s", "Fail/s", "TO/s");
1142 
1143 		g_perf_timer = SPDK_POLLER_REGISTER(performance_statistics_thread, NULL,
1144 						    g_show_performance_period_in_usec);
1145 	}
1146 
1147 	/* Iterate jobs to start all I/O */
1148 	TAILQ_FOREACH(job, &g_bdevperf.jobs, link) {
1149 		g_bdevperf.running_jobs++;
1150 		spdk_thread_send_msg(job->thread, bdevperf_job_run, job);
1151 	}
1152 }
1153 
1154 static void
1155 bdevperf_bdev_removed(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
1156 {
1157 	struct bdevperf_job *job = event_ctx;
1158 
1159 	if (SPDK_BDEV_EVENT_REMOVE == type) {
1160 		bdevperf_job_drain(job);
1161 	}
1162 }
1163 
1164 static uint32_t g_construct_job_count = 0;
1165 
1166 static void
1167 _bdevperf_construct_job_done(void *ctx)
1168 {
1169 	if (--g_construct_job_count == 0) {
1170 
1171 		if (g_run_rc != 0) {
1172 			/* Something failed. */
1173 			bdevperf_test_done(NULL);
1174 			return;
1175 		}
1176 
1177 		/* Ready to run the test */
1178 		bdevperf_test();
1179 	} else if (g_run_rc != 0) {
1180 		/* Reset error as some jobs constructed right */
1181 		g_run_rc = 0;
1182 		if (g_continue_on_failure == false) {
1183 			g_error_to_exit = true;
1184 		}
1185 	}
1186 }
1187 
1188 /* Checkformat will not allow to use inlined type,
1189    this is a workaround */
1190 typedef struct spdk_thread *spdk_thread_t;
1191 
1192 static spdk_thread_t
1193 construct_job_thread(struct spdk_cpuset *cpumask, const char *tag)
1194 {
1195 	struct spdk_cpuset tmp;
1196 
1197 	/* This function runs on the main thread. */
1198 	assert(g_main_thread == spdk_get_thread());
1199 
1200 	/* Handle default mask */
1201 	if (spdk_cpuset_count(cpumask) == 0) {
1202 		cpumask = &g_all_cpuset;
1203 	}
1204 
1205 	/* Warn user that mask might need to be changed */
1206 	spdk_cpuset_copy(&tmp, cpumask);
1207 	spdk_cpuset_or(&tmp, &g_all_cpuset);
1208 	if (!spdk_cpuset_equal(&tmp, &g_all_cpuset)) {
1209 		fprintf(stderr, "cpumask for '%s' is too big\n", tag);
1210 	}
1211 
1212 	return spdk_thread_create(tag, cpumask);
1213 }
1214 
1215 static uint32_t
1216 _get_next_core(void)
1217 {
1218 	static uint32_t current_core = SPDK_ENV_LCORE_ID_ANY;
1219 
1220 	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
1221 		current_core = spdk_env_get_first_core();
1222 		return current_core;
1223 	}
1224 
1225 	current_core = spdk_env_get_next_core(current_core);
1226 	if (current_core == SPDK_ENV_LCORE_ID_ANY) {
1227 		current_core = spdk_env_get_first_core();
1228 	}
1229 
1230 	return current_core;
1231 }
1232 
1233 static void
1234 _bdevperf_construct_job(void *ctx)
1235 {
1236 	struct bdevperf_job *job = ctx;
1237 	int rc;
1238 
1239 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(job->bdev), true, bdevperf_bdev_removed, job,
1240 				&job->bdev_desc);
1241 	if (rc != 0) {
1242 		SPDK_ERRLOG("Could not open leaf bdev %s, error=%d\n", spdk_bdev_get_name(job->bdev), rc);
1243 		g_run_rc = -EINVAL;
1244 		goto end;
1245 	}
1246 
1247 	if (g_zcopy) {
1248 		if (!spdk_bdev_io_type_supported(job->bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
1249 			printf("Test requires ZCOPY but bdev module does not support ZCOPY\n");
1250 			g_run_rc = -ENOTSUP;
1251 			goto end;
1252 		}
1253 	}
1254 
1255 	job->ch = spdk_bdev_get_io_channel(job->bdev_desc);
1256 	if (!job->ch) {
1257 		SPDK_ERRLOG("Could not get io_channel for device %s, error=%d\n", spdk_bdev_get_name(job->bdev),
1258 			    rc);
1259 		spdk_bdev_close(job->bdev_desc);
1260 		TAILQ_REMOVE(&g_bdevperf.jobs, job, link);
1261 		g_run_rc = -ENOMEM;
1262 		goto end;
1263 	}
1264 
1265 end:
1266 	spdk_thread_send_msg(g_main_thread, _bdevperf_construct_job_done, NULL);
1267 }
1268 
1269 static void
1270 job_init_rw(struct bdevperf_job *job, enum job_config_rw rw)
1271 {
1272 	switch (rw) {
1273 	case JOB_CONFIG_RW_READ:
1274 		job->rw_percentage = 100;
1275 		break;
1276 	case JOB_CONFIG_RW_WRITE:
1277 		job->rw_percentage = 0;
1278 		break;
1279 	case JOB_CONFIG_RW_RANDREAD:
1280 		job->is_random = true;
1281 		job->rw_percentage = 100;
1282 		job->seed = rand();
1283 		break;
1284 	case JOB_CONFIG_RW_RANDWRITE:
1285 		job->is_random = true;
1286 		job->rw_percentage = 0;
1287 		job->seed = rand();
1288 		break;
1289 	case JOB_CONFIG_RW_RW:
1290 		job->is_random = false;
1291 		break;
1292 	case JOB_CONFIG_RW_RANDRW:
1293 		job->is_random = true;
1294 		job->seed = rand();
1295 		break;
1296 	case JOB_CONFIG_RW_VERIFY:
1297 		job->verify = true;
1298 		job->rw_percentage = 50;
1299 		break;
1300 	case JOB_CONFIG_RW_RESET:
1301 		job->reset = true;
1302 		job->verify = true;
1303 		job->rw_percentage = 50;
1304 		break;
1305 	case JOB_CONFIG_RW_UNMAP:
1306 		job->unmap = true;
1307 		break;
1308 	case JOB_CONFIG_RW_FLUSH:
1309 		job->flush = true;
1310 		break;
1311 	case JOB_CONFIG_RW_WRITE_ZEROES:
1312 		job->write_zeroes = true;
1313 		break;
1314 	}
1315 }
1316 
1317 static int
1318 bdevperf_construct_job(struct spdk_bdev *bdev, struct job_config *config,
1319 		       struct spdk_thread *thread)
1320 {
1321 	struct bdevperf_job *job;
1322 	struct bdevperf_task *task;
1323 	int block_size, data_block_size;
1324 	int rc;
1325 	int task_num, n;
1326 
1327 	block_size = spdk_bdev_get_block_size(bdev);
1328 	data_block_size = spdk_bdev_get_data_block_size(bdev);
1329 
1330 	job = calloc(1, sizeof(struct bdevperf_job));
1331 	if (!job) {
1332 		fprintf(stderr, "Unable to allocate memory for new job.\n");
1333 		return -ENOMEM;
1334 	}
1335 
1336 	job->name = strdup(spdk_bdev_get_name(bdev));
1337 	if (!job->name) {
1338 		fprintf(stderr, "Unable to allocate memory for job name.\n");
1339 		bdevperf_job_free(job);
1340 		return -ENOMEM;
1341 	}
1342 
1343 	job->workload_type = g_workload_type;
1344 	job->io_size = config->bs;
1345 	job->rw_percentage = config->rwmixread;
1346 	job->continue_on_failure = g_continue_on_failure;
1347 	job->queue_depth = config->iodepth;
1348 	job->bdev = bdev;
1349 	job->io_size_blocks = job->io_size / data_block_size;
1350 	job->buf_size = job->io_size_blocks * block_size;
1351 	job->abort = g_abort;
1352 	job_init_rw(job, config->rw);
1353 
1354 	if ((job->io_size % data_block_size) != 0) {
1355 		SPDK_ERRLOG("IO size (%d) is not multiples of data block size of bdev %s (%"PRIu32")\n",
1356 			    job->io_size, spdk_bdev_get_name(bdev), data_block_size);
1357 		bdevperf_job_free(job);
1358 		return -ENOTSUP;
1359 	}
1360 
1361 	if (job->unmap && !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1362 		printf("Skipping %s because it does not support unmap\n", spdk_bdev_get_name(bdev));
1363 		bdevperf_job_free(job);
1364 		return -ENOTSUP;
1365 	}
1366 
1367 	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) {
1368 		job->dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK;
1369 	}
1370 	if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
1371 		job->dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK;
1372 	}
1373 
1374 	job->offset_in_ios = 0;
1375 
1376 	if (config->length != 0) {
1377 		/* Use subset of disk */
1378 		job->size_in_ios = config->length / job->io_size_blocks;
1379 		job->ios_base = config->offset / job->io_size_blocks;
1380 	} else {
1381 		/* Use whole disk */
1382 		job->size_in_ios = spdk_bdev_get_num_blocks(bdev) / job->io_size_blocks;
1383 		job->ios_base = 0;
1384 	}
1385 
1386 	if (job->is_random && g_zipf_theta > 0) {
1387 		job->zipf = spdk_zipf_create(job->size_in_ios, g_zipf_theta, 0);
1388 	}
1389 
1390 	if (job->verify) {
1391 		job->outstanding = spdk_bit_array_create(job->size_in_ios);
1392 		if (job->outstanding == NULL) {
1393 			SPDK_ERRLOG("Could not create outstanding array bitmap for bdev %s\n",
1394 				    spdk_bdev_get_name(bdev));
1395 			bdevperf_job_free(job);
1396 			return -ENOMEM;
1397 		}
1398 	}
1399 
1400 	TAILQ_INIT(&job->task_list);
1401 
1402 	task_num = job->queue_depth;
1403 	if (job->reset) {
1404 		task_num += 1;
1405 	}
1406 	if (job->abort) {
1407 		task_num += job->queue_depth;
1408 	}
1409 
1410 	TAILQ_INSERT_TAIL(&g_bdevperf.jobs, job, link);
1411 
1412 	for (n = 0; n < task_num; n++) {
1413 		task = calloc(1, sizeof(struct bdevperf_task));
1414 		if (!task) {
1415 			fprintf(stderr, "Failed to allocate task from memory\n");
1416 			return -ENOMEM;
1417 		}
1418 
1419 		task->buf = spdk_zmalloc(job->buf_size, spdk_bdev_get_buf_align(job->bdev), NULL,
1420 					 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1421 		if (!task->buf) {
1422 			fprintf(stderr, "Cannot allocate buf for task=%p\n", task);
1423 			free(task);
1424 			return -ENOMEM;
1425 		}
1426 
1427 		if (spdk_bdev_is_md_separate(job->bdev)) {
1428 			task->md_buf = spdk_zmalloc(job->io_size_blocks *
1429 						    spdk_bdev_get_md_size(job->bdev), 0, NULL,
1430 						    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1431 			if (!task->md_buf) {
1432 				fprintf(stderr, "Cannot allocate md buf for task=%p\n", task);
1433 				spdk_free(task->buf);
1434 				free(task);
1435 				return -ENOMEM;
1436 			}
1437 		}
1438 
1439 		task->job = job;
1440 		TAILQ_INSERT_TAIL(&job->task_list, task, link);
1441 	}
1442 
1443 	job->thread = thread;
1444 
1445 	g_construct_job_count++;
1446 
1447 	rc = spdk_thread_send_msg(thread, _bdevperf_construct_job, job);
1448 	assert(rc == 0);
1449 
1450 	return rc;
1451 }
1452 
1453 static int
1454 parse_rw(const char *str, enum job_config_rw ret)
1455 {
1456 	if (str == NULL) {
1457 		return ret;
1458 	}
1459 
1460 	if (!strcmp(str, "read")) {
1461 		ret = JOB_CONFIG_RW_READ;
1462 	} else if (!strcmp(str, "randread")) {
1463 		ret = JOB_CONFIG_RW_RANDREAD;
1464 	} else if (!strcmp(str, "write")) {
1465 		ret = JOB_CONFIG_RW_WRITE;
1466 	} else if (!strcmp(str, "randwrite")) {
1467 		ret = JOB_CONFIG_RW_RANDWRITE;
1468 	} else if (!strcmp(str, "verify")) {
1469 		ret = JOB_CONFIG_RW_VERIFY;
1470 	} else if (!strcmp(str, "reset")) {
1471 		ret = JOB_CONFIG_RW_RESET;
1472 	} else if (!strcmp(str, "unmap")) {
1473 		ret = JOB_CONFIG_RW_UNMAP;
1474 	} else if (!strcmp(str, "write_zeroes")) {
1475 		ret = JOB_CONFIG_RW_WRITE_ZEROES;
1476 	} else if (!strcmp(str, "flush")) {
1477 		ret = JOB_CONFIG_RW_FLUSH;
1478 	} else if (!strcmp(str, "rw")) {
1479 		ret = JOB_CONFIG_RW_RW;
1480 	} else if (!strcmp(str, "randrw")) {
1481 		ret = JOB_CONFIG_RW_RANDRW;
1482 	} else {
1483 		fprintf(stderr, "rw must be one of\n"
1484 			"(read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush)\n");
1485 		ret = BDEVPERF_CONFIG_ERROR;
1486 	}
1487 
1488 	return ret;
1489 }
1490 
1491 static const char *
1492 config_filename_next(const char *filename, char *out)
1493 {
1494 	int i, k;
1495 
1496 	if (filename == NULL) {
1497 		out[0] = '\0';
1498 		return NULL;
1499 	}
1500 
1501 	if (filename[0] == ':') {
1502 		filename++;
1503 	}
1504 
1505 	for (i = 0, k = 0;
1506 	     filename[i] != '\0' &&
1507 	     filename[i] != ':' &&
1508 	     i < BDEVPERF_CONFIG_MAX_FILENAME;
1509 	     i++) {
1510 		if (filename[i] == ' ' || filename[i] == '\t') {
1511 			continue;
1512 		}
1513 
1514 		out[k++] = filename[i];
1515 	}
1516 	out[k] = 0;
1517 
1518 	return filename + i;
1519 }
1520 
1521 static void
1522 bdevperf_construct_jobs(void)
1523 {
1524 	char filename[BDEVPERF_CONFIG_MAX_FILENAME];
1525 	struct spdk_thread *thread;
1526 	struct job_config *config;
1527 	struct spdk_bdev *bdev;
1528 	const char *filenames;
1529 	int rc;
1530 
1531 	TAILQ_FOREACH(config, &job_config_list, link) {
1532 		filenames = config->filename;
1533 
1534 		thread = construct_job_thread(&config->cpumask, config->name);
1535 		assert(thread);
1536 
1537 		while (filenames) {
1538 			filenames = config_filename_next(filenames, filename);
1539 			if (strlen(filename) == 0) {
1540 				break;
1541 			}
1542 
1543 			bdev = spdk_bdev_get_by_name(filename);
1544 			if (!bdev) {
1545 				fprintf(stderr, "Unable to find bdev '%s'\n", filename);
1546 				g_run_rc = -EINVAL;
1547 				return;
1548 			}
1549 
1550 			rc = bdevperf_construct_job(bdev, config, thread);
1551 			if (rc < 0) {
1552 				g_run_rc = rc;
1553 				return;
1554 			}
1555 		}
1556 	}
1557 }
1558 
1559 static int
1560 make_cli_job_config(const char *filename, int64_t offset, uint64_t range)
1561 {
1562 	struct job_config *config = calloc(1, sizeof(*config));
1563 
1564 	if (config == NULL) {
1565 		fprintf(stderr, "Unable to allocate memory for job config\n");
1566 		return -ENOMEM;
1567 	}
1568 
1569 	config->name = filename;
1570 	config->filename = filename;
1571 	spdk_cpuset_zero(&config->cpumask);
1572 	spdk_cpuset_set_cpu(&config->cpumask, _get_next_core(), true);
1573 	config->bs = g_io_size;
1574 	config->iodepth = g_queue_depth;
1575 	config->rwmixread = g_rw_percentage;
1576 	config->offset = offset;
1577 	config->length = range;
1578 	config->rw = parse_rw(g_workload_type, BDEVPERF_CONFIG_ERROR);
1579 	if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
1580 		free(config);
1581 		return -EINVAL;
1582 	}
1583 
1584 	TAILQ_INSERT_TAIL(&job_config_list, config, link);
1585 	return 0;
1586 }
1587 
1588 static void
1589 bdevperf_construct_multithread_job_configs(void)
1590 {
1591 	struct spdk_bdev *bdev;
1592 	uint32_t i;
1593 	uint32_t num_cores;
1594 	uint64_t blocks_per_job;
1595 	int64_t offset;
1596 
1597 	num_cores = 0;
1598 	SPDK_ENV_FOREACH_CORE(i) {
1599 		num_cores++;
1600 	}
1601 
1602 	if (num_cores == 0) {
1603 		g_run_rc = -EINVAL;
1604 		return;
1605 	}
1606 
1607 	if (g_job_bdev_name != NULL) {
1608 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
1609 		if (!bdev) {
1610 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
1611 			return;
1612 		}
1613 
1614 		blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores;
1615 		offset = 0;
1616 
1617 		SPDK_ENV_FOREACH_CORE(i) {
1618 			g_run_rc = make_cli_job_config(g_job_bdev_name, offset, blocks_per_job);
1619 			if (g_run_rc) {
1620 				return;
1621 			}
1622 
1623 			offset += blocks_per_job;
1624 		}
1625 	} else {
1626 		bdev = spdk_bdev_first_leaf();
1627 		while (bdev != NULL) {
1628 			blocks_per_job = spdk_bdev_get_num_blocks(bdev) / num_cores;
1629 			offset = 0;
1630 
1631 			SPDK_ENV_FOREACH_CORE(i) {
1632 				g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev),
1633 							       offset, blocks_per_job);
1634 				if (g_run_rc) {
1635 					return;
1636 				}
1637 
1638 				offset += blocks_per_job;
1639 			}
1640 
1641 			bdev = spdk_bdev_next_leaf(bdev);
1642 		}
1643 	}
1644 }
1645 
1646 static void
1647 bdevperf_construct_job_configs(void)
1648 {
1649 	struct spdk_bdev *bdev;
1650 
1651 	/* There are three different modes for allocating jobs. Standard mode
1652 	 * (the default) creates one spdk_thread per bdev and runs the I/O job there.
1653 	 *
1654 	 * The -C flag places bdevperf into "multithread" mode, meaning it creates
1655 	 * one spdk_thread per bdev PER CORE, and runs a copy of the job on each.
1656 	 * This runs multiple threads per bdev, effectively.
1657 	 *
1658 	 * The -j flag implies "FIO" mode which tries to mimic semantic of FIO jobs.
1659 	 * In "FIO" mode, threads are spawned per-job instead of per-bdev.
1660 	 * Each FIO job can be individually parameterized by filename, cpu mask, etc,
1661 	 * which is different from other modes in that they only support global options.
1662 	 */
1663 
1664 	if (g_bdevperf_conf) {
1665 		goto end;
1666 	} else if (g_multithread_mode) {
1667 		bdevperf_construct_multithread_job_configs();
1668 		goto end;
1669 	}
1670 
1671 	if (g_job_bdev_name != NULL) {
1672 		bdev = spdk_bdev_get_by_name(g_job_bdev_name);
1673 		if (bdev) {
1674 			/* Construct the job */
1675 			g_run_rc = make_cli_job_config(g_job_bdev_name, 0, 0);
1676 		} else {
1677 			fprintf(stderr, "Unable to find bdev '%s'\n", g_job_bdev_name);
1678 		}
1679 	} else {
1680 		bdev = spdk_bdev_first_leaf();
1681 
1682 		while (bdev != NULL) {
1683 			/* Construct the job */
1684 			g_run_rc = make_cli_job_config(spdk_bdev_get_name(bdev), 0, 0);
1685 			if (g_run_rc) {
1686 				break;
1687 			}
1688 
1689 			bdev = spdk_bdev_next_leaf(bdev);
1690 		}
1691 	}
1692 
1693 end:
1694 	/* Increment initial construct_jobs count so that it will never reach 0 in the middle
1695 	 * of iteration.
1696 	 */
1697 	g_construct_job_count = 1;
1698 
1699 	if (g_run_rc == 0) {
1700 		bdevperf_construct_jobs();
1701 	}
1702 
1703 	_bdevperf_construct_job_done(NULL);
1704 }
1705 
1706 static int
1707 parse_uint_option(struct spdk_conf_section *s, const char *name, int def)
1708 {
1709 	const char *job_name;
1710 	int tmp;
1711 
1712 	tmp = spdk_conf_section_get_intval(s, name);
1713 	if (tmp == -1) {
1714 		/* Field was not found. Check default value
1715 		 * In [global] section it is ok to have undefined values
1716 		 * but for other sections it is not ok */
1717 		if (def == BDEVPERF_CONFIG_UNDEFINED) {
1718 			job_name = spdk_conf_section_get_name(s);
1719 			if (strcmp(job_name, "global") == 0) {
1720 				return def;
1721 			}
1722 
1723 			fprintf(stderr,
1724 				"Job '%s' has no '%s' assigned\n",
1725 				job_name, name);
1726 			return BDEVPERF_CONFIG_ERROR;
1727 		}
1728 		return def;
1729 	}
1730 
1731 	/* NOTE: get_intval returns nonnegative on success */
1732 	if (tmp < 0) {
1733 		fprintf(stderr, "Job '%s' has bad '%s' value.\n",
1734 			spdk_conf_section_get_name(s), name);
1735 		return BDEVPERF_CONFIG_ERROR;
1736 	}
1737 
1738 	return tmp;
1739 }
1740 
1741 /* CLI arguments override parameters for global sections */
1742 static void
1743 config_set_cli_args(struct job_config *config)
1744 {
1745 	if (g_job_bdev_name) {
1746 		config->filename = g_job_bdev_name;
1747 	}
1748 	if (g_io_size > 0) {
1749 		config->bs = g_io_size;
1750 	}
1751 	if (g_queue_depth > 0) {
1752 		config->iodepth = g_queue_depth;
1753 	}
1754 	if (g_rw_percentage > 0) {
1755 		config->rwmixread = g_rw_percentage;
1756 	}
1757 	if (g_workload_type) {
1758 		config->rw = parse_rw(g_workload_type, config->rw);
1759 	}
1760 }
1761 
1762 static int
1763 read_job_config(void)
1764 {
1765 	struct job_config global_default_config;
1766 	struct job_config global_config;
1767 	struct spdk_conf_section *s;
1768 	struct job_config *config;
1769 	const char *cpumask;
1770 	const char *rw;
1771 	bool is_global;
1772 	int n = 0;
1773 	int val;
1774 
1775 	if (g_bdevperf_conf_file == NULL) {
1776 		return 0;
1777 	}
1778 
1779 	g_bdevperf_conf = spdk_conf_allocate();
1780 	if (g_bdevperf_conf == NULL) {
1781 		fprintf(stderr, "Could not allocate job config structure\n");
1782 		return 1;
1783 	}
1784 
1785 	spdk_conf_disable_sections_merge(g_bdevperf_conf);
1786 	if (spdk_conf_read(g_bdevperf_conf, g_bdevperf_conf_file)) {
1787 		fprintf(stderr, "Invalid job config");
1788 		return 1;
1789 	}
1790 
1791 	/* Initialize global defaults */
1792 	global_default_config.filename = NULL;
1793 	/* Zero mask is the same as g_all_cpuset
1794 	 * The g_all_cpuset is not initialized yet,
1795 	 * so use zero mask as the default instead */
1796 	spdk_cpuset_zero(&global_default_config.cpumask);
1797 	global_default_config.bs = BDEVPERF_CONFIG_UNDEFINED;
1798 	global_default_config.iodepth = BDEVPERF_CONFIG_UNDEFINED;
1799 	/* bdevperf has no default for -M option but in FIO the default is 50 */
1800 	global_default_config.rwmixread = 50;
1801 	global_default_config.offset = 0;
1802 	/* length 0 means 100% */
1803 	global_default_config.length = 0;
1804 	global_default_config.rw = BDEVPERF_CONFIG_UNDEFINED;
1805 	config_set_cli_args(&global_default_config);
1806 
1807 	if ((int)global_default_config.rw == BDEVPERF_CONFIG_ERROR) {
1808 		return 1;
1809 	}
1810 
1811 	/* There is only a single instance of global job_config
1812 	 * We just reset its value when we encounter new [global] section */
1813 	global_config = global_default_config;
1814 
1815 	for (s = spdk_conf_first_section(g_bdevperf_conf);
1816 	     s != NULL;
1817 	     s = spdk_conf_next_section(s)) {
1818 		config = calloc(1, sizeof(*config));
1819 		if (config == NULL) {
1820 			fprintf(stderr, "Unable to allocate memory for job config\n");
1821 			return 1;
1822 		}
1823 
1824 		config->name = spdk_conf_section_get_name(s);
1825 		is_global = strcmp(config->name, "global") == 0;
1826 
1827 		if (is_global) {
1828 			global_config = global_default_config;
1829 		}
1830 
1831 		config->filename = spdk_conf_section_get_val(s, "filename");
1832 		if (config->filename == NULL) {
1833 			config->filename = global_config.filename;
1834 		}
1835 		if (!is_global) {
1836 			if (config->filename == NULL) {
1837 				fprintf(stderr, "Job '%s' expects 'filename' parameter\n", config->name);
1838 				goto error;
1839 			} else if (strnlen(config->filename, BDEVPERF_CONFIG_MAX_FILENAME)
1840 				   >= BDEVPERF_CONFIG_MAX_FILENAME) {
1841 				fprintf(stderr,
1842 					"filename for '%s' job is too long. Max length is %d\n",
1843 					config->name, BDEVPERF_CONFIG_MAX_FILENAME);
1844 				goto error;
1845 			}
1846 		}
1847 
1848 		cpumask = spdk_conf_section_get_val(s, "cpumask");
1849 		if (cpumask == NULL) {
1850 			config->cpumask = global_config.cpumask;
1851 		} else if (spdk_cpuset_parse(&config->cpumask, cpumask)) {
1852 			fprintf(stderr, "Job '%s' has bad 'cpumask' value\n", config->name);
1853 			goto error;
1854 		}
1855 
1856 		config->bs = parse_uint_option(s, "bs", global_config.bs);
1857 		if (config->bs == BDEVPERF_CONFIG_ERROR) {
1858 			goto error;
1859 		} else if (config->bs == 0) {
1860 			fprintf(stderr, "'bs' of job '%s' must be greater than 0\n", config->name);
1861 			goto error;
1862 		}
1863 
1864 		config->iodepth = parse_uint_option(s, "iodepth", global_config.iodepth);
1865 		if (config->iodepth == BDEVPERF_CONFIG_ERROR) {
1866 			goto error;
1867 		} else if (config->iodepth == 0) {
1868 			fprintf(stderr,
1869 				"'iodepth' of job '%s' must be greater than 0\n",
1870 				config->name);
1871 			goto error;
1872 		}
1873 
1874 		config->rwmixread = parse_uint_option(s, "rwmixread", global_config.rwmixread);
1875 		if (config->rwmixread == BDEVPERF_CONFIG_ERROR) {
1876 			goto error;
1877 		} else if (config->rwmixread > 100) {
1878 			fprintf(stderr,
1879 				"'rwmixread' value of '%s' job is not in 0-100 range\n",
1880 				config->name);
1881 			goto error;
1882 		}
1883 
1884 		config->offset = parse_uint_option(s, "offset", global_config.offset);
1885 		if (config->offset == BDEVPERF_CONFIG_ERROR) {
1886 			goto error;
1887 		}
1888 
1889 		val = parse_uint_option(s, "length", global_config.length);
1890 		if (val == BDEVPERF_CONFIG_ERROR) {
1891 			goto error;
1892 		}
1893 		config->length = val;
1894 
1895 		rw = spdk_conf_section_get_val(s, "rw");
1896 		config->rw = parse_rw(rw, global_config.rw);
1897 		if ((int)config->rw == BDEVPERF_CONFIG_ERROR) {
1898 			fprintf(stderr, "Job '%s' has bad 'rw' value\n", config->name);
1899 			goto error;
1900 		} else if (!is_global && (int)config->rw == BDEVPERF_CONFIG_UNDEFINED) {
1901 			fprintf(stderr, "Job '%s' has no 'rw' assigned\n", config->name);
1902 			goto error;
1903 		}
1904 
1905 		if (is_global) {
1906 			config_set_cli_args(config);
1907 			global_config = *config;
1908 			free(config);
1909 		} else {
1910 			TAILQ_INSERT_TAIL(&job_config_list, config, link);
1911 			n++;
1912 		}
1913 	}
1914 
1915 	printf("Using job config with %d jobs\n", n);
1916 	return 0;
1917 error:
1918 	free(config);
1919 	return 1;
1920 }
1921 
1922 static void
1923 bdevperf_run(void *arg1)
1924 {
1925 	uint32_t i;
1926 
1927 	g_main_thread = spdk_get_thread();
1928 
1929 	spdk_cpuset_zero(&g_all_cpuset);
1930 	SPDK_ENV_FOREACH_CORE(i) {
1931 		spdk_cpuset_set_cpu(&g_all_cpuset, i, true);
1932 	}
1933 
1934 	if (g_wait_for_tests) {
1935 		/* Do not perform any tests until RPC is received */
1936 		return;
1937 	}
1938 
1939 	bdevperf_construct_job_configs();
1940 }
1941 
1942 static void
1943 rpc_perform_tests_cb(void)
1944 {
1945 	struct spdk_json_write_ctx *w;
1946 	struct spdk_jsonrpc_request *request = g_request;
1947 
1948 	g_request = NULL;
1949 
1950 	if (g_run_rc == 0) {
1951 		w = spdk_jsonrpc_begin_result(request);
1952 		spdk_json_write_uint32(w, g_run_rc);
1953 		spdk_jsonrpc_end_result(request, w);
1954 	} else {
1955 		spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
1956 						     "bdevperf failed with error %s", spdk_strerror(-g_run_rc));
1957 	}
1958 
1959 	/* Reset g_run_rc to 0 for the next test run. */
1960 	g_run_rc = 0;
1961 
1962 	/* Reset g_stats to 0 for the next test run. */
1963 	memset(&g_stats, 0, sizeof(g_stats));
1964 }
1965 
1966 static void
1967 rpc_perform_tests(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
1968 {
1969 	if (params != NULL) {
1970 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
1971 						 "perform_tests method requires no parameters");
1972 		return;
1973 	}
1974 	if (g_request != NULL) {
1975 		fprintf(stderr, "Another test is already in progress.\n");
1976 		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
1977 						 spdk_strerror(-EINPROGRESS));
1978 		return;
1979 	}
1980 	g_request = request;
1981 
1982 	/* Only construct job configs at the first test run.  */
1983 	if (TAILQ_EMPTY(&job_config_list)) {
1984 		bdevperf_construct_job_configs();
1985 	} else {
1986 		bdevperf_construct_jobs();
1987 	}
1988 }
1989 SPDK_RPC_REGISTER("perform_tests", rpc_perform_tests, SPDK_RPC_RUNTIME)
1990 
/*
 * void-returning adapter around bdevperf_job_drain() so that it can be used
 * as a thread message function.
 *
 * ctx is the job to drain.
 */
static void
_bdevperf_job_drain(void *ctx)
{
	struct bdevperf_job *job = ctx;

	bdevperf_job_drain(job);
}
1996 
1997 static void
1998 spdk_bdevperf_shutdown_cb(void)
1999 {
2000 	g_shutdown = true;
2001 	struct bdevperf_job *job, *tmp;
2002 
2003 	if (g_bdevperf.running_jobs == 0) {
2004 		bdevperf_test_done(NULL);
2005 		return;
2006 	}
2007 
2008 	/* Iterate jobs to stop all I/O */
2009 	TAILQ_FOREACH_SAFE(job, &g_bdevperf.jobs, link, tmp) {
2010 		spdk_thread_send_msg(job->thread, _bdevperf_job_drain, job);
2011 	}
2012 }
2013 
2014 static int
2015 bdevperf_parse_arg(int ch, char *arg)
2016 {
2017 	long long tmp;
2018 
2019 	if (ch == 'w') {
2020 		g_workload_type = optarg;
2021 	} else if (ch == 'T') {
2022 		g_job_bdev_name = optarg;
2023 	} else if (ch == 'z') {
2024 		g_wait_for_tests = true;
2025 	} else if (ch == 'Z') {
2026 		g_zcopy = true;
2027 	} else if (ch == 'X') {
2028 		g_abort = true;
2029 	} else if (ch == 'C') {
2030 		g_multithread_mode = true;
2031 	} else if (ch == 'f') {
2032 		g_continue_on_failure = true;
2033 	} else if (ch == 'j') {
2034 		g_bdevperf_conf_file = optarg;
2035 	} else if (ch == 'F') {
2036 		char *endptr;
2037 
2038 		errno = 0;
2039 		g_zipf_theta = strtod(optarg, &endptr);
2040 		if (errno || optarg == endptr || g_zipf_theta < 0) {
2041 			fprintf(stderr, "Illegal zipf theta value %s\n", optarg);
2042 			return -EINVAL;
2043 		}
2044 	} else {
2045 		tmp = spdk_strtoll(optarg, 10);
2046 		if (tmp < 0) {
2047 			fprintf(stderr, "Parse failed for the option %c.\n", ch);
2048 			return tmp;
2049 		} else if (tmp >= INT_MAX) {
2050 			fprintf(stderr, "Parsed option was too large %c.\n", ch);
2051 			return -ERANGE;
2052 		}
2053 
2054 		switch (ch) {
2055 		case 'q':
2056 			g_queue_depth = tmp;
2057 			break;
2058 		case 'o':
2059 			g_io_size = tmp;
2060 			break;
2061 		case 't':
2062 			g_time_in_sec = tmp;
2063 			break;
2064 		case 'k':
2065 			g_timeout_in_sec = tmp;
2066 			break;
2067 		case 'M':
2068 			g_rw_percentage = tmp;
2069 			g_mix_specified = true;
2070 			break;
2071 		case 'P':
2072 			g_show_performance_ema_period = tmp;
2073 			break;
2074 		case 'S':
2075 			g_show_performance_real_time = 1;
2076 			g_show_performance_period_in_usec = tmp * 1000000;
2077 			break;
2078 		default:
2079 			return -EINVAL;
2080 		}
2081 	}
2082 	return 0;
2083 }
2084 
/*
 * Print the help text for the bdevperf-specific command-line options.
 */
static void
bdevperf_usage(void)
{
	printf(" -q <depth>                io depth\n");
	printf(" -o <size>                 io size in bytes\n");
	/* The list includes write_zeroes, which parse_rw() accepts. */
	printf(" -w <type>                 io pattern type, must be one of (read, write, randread, randwrite, rw, randrw, verify, reset, unmap, flush, write_zeroes)\n");
	printf(" -t <time>                 time in seconds\n");
	printf(" -k <timeout>              timeout in seconds to detect starved I/O (default is 0 and disabled)\n");
	printf(" -M <percent>              rwmixread (100 for reads, 0 for writes)\n");
	printf(" -P <num>                  number of moving average period\n");
	printf("\t\t(If set to n, show weighted mean of the previous n IO/s in real time)\n");
	printf("\t\t(Formula: M = 2 / (n + 1), EMA[i+1] = IO/s * M + (1 - M) * EMA[i])\n");
	printf("\t\t(only valid with -S)\n");
	printf(" -S <period>               show performance result in real time every <period> seconds\n");
	printf(" -T <bdev>                 bdev to run against. Default: all available bdevs.\n");
	printf(" -f                        continue processing I/O even after failures\n");
	printf(" -F <zipf theta>           use zipf distribution for random I/O\n");
	printf(" -Z                        enable using zcopy bdev API for read or write I/O\n");
	printf(" -z                        start bdevperf, but wait for RPC to start tests\n");
	printf(" -X                        abort timed out I/O\n");
	printf(" -C                        enable every core to send I/Os to each bdev\n");
	printf(" -j <filename>             use job config file\n");
}
2108 
2109 static int
2110 verify_test_params(struct spdk_app_opts *opts)
2111 {
2112 	/* When RPC is used for starting tests and
2113 	 * no rpc_addr was configured for the app,
2114 	 * use the default address. */
2115 	if (g_wait_for_tests && opts->rpc_addr == NULL) {
2116 		opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR;
2117 	}
2118 
2119 	if (!g_bdevperf_conf_file && g_queue_depth <= 0) {
2120 		goto out;
2121 	}
2122 	if (!g_bdevperf_conf_file && g_io_size <= 0) {
2123 		goto out;
2124 	}
2125 	if (!g_bdevperf_conf_file && !g_workload_type) {
2126 		goto out;
2127 	}
2128 	if (g_time_in_sec <= 0) {
2129 		goto out;
2130 	}
2131 	g_time_in_usec = g_time_in_sec * 1000000LL;
2132 
2133 	if (g_timeout_in_sec < 0) {
2134 		goto out;
2135 	}
2136 
2137 	if (g_abort && !g_timeout_in_sec) {
2138 		printf("Timeout must be set for abort option, Ignoring g_abort\n");
2139 	}
2140 
2141 	if (g_show_performance_ema_period > 0 &&
2142 	    g_show_performance_real_time == 0) {
2143 		fprintf(stderr, "-P option must be specified with -S option\n");
2144 		return 1;
2145 	}
2146 
2147 	if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
2148 		printf("I/O size of %d is greater than zero copy threshold (%d).\n",
2149 		       g_io_size, SPDK_BDEV_LARGE_BUF_MAX_SIZE);
2150 		printf("Zero copy mechanism will not be used.\n");
2151 		g_zcopy = false;
2152 	}
2153 
2154 	if (g_bdevperf_conf_file) {
2155 		/* workload_type verification happens during config file parsing */
2156 		return 0;
2157 	}
2158 
2159 	if (!strcmp(g_workload_type, "verify") ||
2160 	    !strcmp(g_workload_type, "reset")) {
2161 		g_rw_percentage = 50;
2162 		if (g_io_size > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
2163 			fprintf(stderr, "Unable to exceed max I/O size of %d for verify. (%d provided).\n",
2164 				SPDK_BDEV_LARGE_BUF_MAX_SIZE, g_io_size);
2165 			return 1;
2166 		}
2167 		g_verify = true;
2168 		if (!strcmp(g_workload_type, "reset")) {
2169 			g_reset = true;
2170 		}
2171 	}
2172 
2173 	if (!strcmp(g_workload_type, "read") ||
2174 	    !strcmp(g_workload_type, "randread") ||
2175 	    !strcmp(g_workload_type, "write") ||
2176 	    !strcmp(g_workload_type, "randwrite") ||
2177 	    !strcmp(g_workload_type, "verify") ||
2178 	    !strcmp(g_workload_type, "reset") ||
2179 	    !strcmp(g_workload_type, "unmap") ||
2180 	    !strcmp(g_workload_type, "write_zeroes") ||
2181 	    !strcmp(g_workload_type, "flush")) {
2182 		if (g_mix_specified) {
2183 			fprintf(stderr, "Ignoring -M option... Please use -M option"
2184 				" only when using rw or randrw.\n");
2185 		}
2186 	}
2187 
2188 	if (!strcmp(g_workload_type, "rw") ||
2189 	    !strcmp(g_workload_type, "randrw")) {
2190 		if (g_rw_percentage < 0 || g_rw_percentage > 100) {
2191 			fprintf(stderr,
2192 				"-M must be specified to value from 0 to 100 "
2193 				"for rw or randrw.\n");
2194 			return 1;
2195 		}
2196 	}
2197 
2198 	return 0;
2199 out:
2200 	spdk_app_usage();
2201 	bdevperf_usage();
2202 	return 1;
2203 }
2204 
2205 int
2206 main(int argc, char **argv)
2207 {
2208 	struct spdk_app_opts opts = {};
2209 	int rc;
2210 
2211 	/* Use the runtime PID to set the random seed */
2212 	srand(getpid());
2213 
2214 	spdk_app_opts_init(&opts, sizeof(opts));
2215 	opts.name = "bdevperf";
2216 	opts.rpc_addr = NULL;
2217 	opts.shutdown_cb = spdk_bdevperf_shutdown_cb;
2218 
2219 	if ((rc = spdk_app_parse_args(argc, argv, &opts, "Zzfq:o:t:w:k:CF:M:P:S:T:Xj:", NULL,
2220 				      bdevperf_parse_arg, bdevperf_usage)) !=
2221 	    SPDK_APP_PARSE_ARGS_SUCCESS) {
2222 		return rc;
2223 	}
2224 
2225 	if (read_job_config()) {
2226 		free_job_config();
2227 		return 1;
2228 	}
2229 
2230 	if (verify_test_params(&opts) != 0) {
2231 		free_job_config();
2232 		exit(1);
2233 	}
2234 
2235 	rc = spdk_app_start(&opts, bdevperf_run, NULL);
2236 
2237 	spdk_app_fini();
2238 	free_job_config();
2239 	return rc;
2240 }
2241