xref: /spdk/lib/bdev/bdev.c (revision c899854d0371a7cdb3e2fd8c07ccf3d1f0b8089a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/bdev.h"
37 #include "spdk/conf.h"
38 
39 #include "spdk/env.h"
40 #include "spdk/event.h"
41 #include "spdk/thread.h"
42 #include "spdk/likely.h"
43 #include "spdk/queue.h"
44 #include "spdk/nvme_spec.h"
45 #include "spdk/scsi_spec.h"
46 #include "spdk/util.h"
47 
48 #include "spdk/bdev_module.h"
49 #include "spdk_internal/log.h"
50 #include "spdk/string.h"
51 
52 #ifdef SPDK_CONFIG_VTUNE
53 #include "ittnotify.h"
54 #include "ittnotify_types.h"
55 int __itt_init_ittlib(const char *, __itt_group_id);
56 #endif
57 
58 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
59 #define SPDK_BDEV_IO_CACHE_SIZE			256
60 #define BUF_SMALL_POOL_SIZE			8192
61 #define BUF_LARGE_POOL_SIZE			1024
62 #define NOMEM_THRESHOLD_COUNT			8
63 #define ZERO_BUFFER_SIZE			0x100000
64 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
65 #define SPDK_BDEV_SEC_TO_USEC			1000000ULL
66 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
67 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
68 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000
69 #define SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC	10
70 
71 enum spdk_bdev_qos_type {
72 	SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT = 0,
73 	SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT,
74 	SPDK_BDEV_QOS_NUM_TYPES /* Keep last */
75 };
76 
77 static const char *qos_type_str[SPDK_BDEV_QOS_NUM_TYPES] = {"Limit_IOPS", "Limit_BWPS"};
78 
79 struct spdk_bdev_mgr {
80 	struct spdk_mempool *bdev_io_pool;
81 
82 	struct spdk_mempool *buf_small_pool;
83 	struct spdk_mempool *buf_large_pool;
84 
85 	void *zero_buffer;
86 
87 	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;
88 
89 	TAILQ_HEAD(, spdk_bdev) bdevs;
90 
91 	bool init_complete;
92 	bool module_init_complete;
93 
94 #ifdef SPDK_CONFIG_VTUNE
95 	__itt_domain	*domain;
96 #endif
97 };
98 
99 static struct spdk_bdev_mgr g_bdev_mgr = {
100 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
101 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
102 	.init_complete = false,
103 	.module_init_complete = false,
104 };
105 
106 static struct spdk_bdev_opts	g_bdev_opts = {
107 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
108 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
109 };
110 
111 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
112 static void			*g_init_cb_arg = NULL;
113 
114 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
115 static void			*g_fini_cb_arg = NULL;
116 static struct spdk_thread	*g_fini_thread = NULL;
117 
118 struct spdk_bdev_qos {
119 	/** Rate limit, in I/O per second */
120 	uint64_t iops_rate_limit;
121 
122 	/** Rate limit, in byte per second */
123 	uint64_t byte_rate_limit;
124 
125 	/** The channel that all I/O are funneled through */
126 	struct spdk_bdev_channel *ch;
127 
128 	/** The thread on which the poller is running. */
129 	struct spdk_thread *thread;
130 
131 	/** Queue of I/O waiting to be issued. */
132 	bdev_io_tailq_t queued;
133 
134 	/** Maximum allowed IOs to be issued in one timeslice (e.g., 1ms) and
135 	 *  only valid for the master channel which manages the outstanding IOs. */
136 	uint64_t max_ios_per_timeslice;
137 
138 	/** Maximum allowed bytes to be issued in one timeslice (e.g., 1ms) and
139 	 *  only valid for the master channel which manages the outstanding IOs. */
140 	uint64_t max_byte_per_timeslice;
141 
142 	/** Submitted IO in one timeslice (e.g., 1ms) */
143 	uint64_t io_submitted_this_timeslice;
144 
145 	/** Bytes submitted in one timeslice (e.g., 1ms) */
146 	uint64_t byte_submitted_this_timeslice;
147 
148 	/** Poller that processes queued I/O commands each time slice. */
149 	struct spdk_poller *poller;
150 };
151 
152 struct spdk_bdev_mgmt_channel {
153 	bdev_io_stailq_t need_buf_small;
154 	bdev_io_stailq_t need_buf_large;
155 
156 	/*
157 	 * Each thread keeps a cache of bdev_io - this allows
158 	 *  bdev threads which are *not* DPDK threads to still
159 	 *  benefit from a per-thread bdev_io cache.  Without
160 	 *  this, non-DPDK threads fetching from the mempool
161 	 *  incur a cmpxchg on get and put.
162 	 */
163 	bdev_io_stailq_t per_thread_cache;
164 	uint32_t	per_thread_cache_count;
165 	uint32_t	bdev_io_cache_size;
166 
167 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
168 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
169 };
170 
171 /*
172  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
173  * will queue their IO awaiting retry here. This makes it possible to retry sending
174  * IO to one bdev after IO from another bdev completes.
175  */
176 struct spdk_bdev_shared_resource {
177 	/* The bdev management channel */
178 	struct spdk_bdev_mgmt_channel *mgmt_ch;
179 
180 	/*
181 	 * Count of I/O submitted to bdev module and waiting for completion.
182 	 * Incremented before submit_request() is called on an spdk_bdev_io.
183 	 */
184 	uint64_t		io_outstanding;
185 
186 	/*
187 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
188 	 *  on this channel.
189 	 */
190 	bdev_io_tailq_t		nomem_io;
191 
192 	/*
193 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
194 	 */
195 	uint64_t		nomem_threshold;
196 
197 	/* I/O channel allocated by a bdev module */
198 	struct spdk_io_channel	*shared_ch;
199 
200 	/* Refcount of bdev channels using this resource */
201 	uint32_t		ref;
202 
203 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
204 };
205 
206 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
207 #define BDEV_CH_QOS_ENABLED		(1 << 1)
208 
209 struct spdk_bdev_channel {
210 	struct spdk_bdev	*bdev;
211 
212 	/* The channel for the underlying device */
213 	struct spdk_io_channel	*channel;
214 
215 	/* Per io_device per thread data */
216 	struct spdk_bdev_shared_resource *shared_resource;
217 
218 	struct spdk_bdev_io_stat stat;
219 
220 	/*
221 	 * Count of I/O submitted through this channel and waiting for completion.
222 	 * Incremented before submit_request() is called on an spdk_bdev_io.
223 	 */
224 	uint64_t		io_outstanding;
225 
226 	bdev_io_tailq_t		queued_resets;
227 
228 	uint32_t		flags;
229 
230 #ifdef SPDK_CONFIG_VTUNE
231 	uint64_t		start_tsc;
232 	uint64_t		interval_tsc;
233 	__itt_string_handle	*handle;
234 	struct spdk_bdev_io_stat prev_stat;
235 #endif
236 
237 };
238 
239 struct spdk_bdev_desc {
240 	struct spdk_bdev		*bdev;
241 	spdk_bdev_remove_cb_t		remove_cb;
242 	void				*remove_ctx;
243 	bool				remove_scheduled;
244 	bool				write;
245 	TAILQ_ENTRY(spdk_bdev_desc)	link;
246 };
247 
248 struct spdk_bdev_iostat_ctx {
249 	struct spdk_bdev_io_stat *stat;
250 	spdk_bdev_get_device_stat_cb cb;
251 	void *cb_arg;
252 };
253 
254 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
255 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
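
/*
 * The io_device for a bdev is the bdev pointer offset by one byte.  This is
 * presumably done so that the io_device address can never collide with an
 * io_device that a bdev module may have registered using the spdk_bdev
 * structure (or a pointer derived from it) as its own io_device.
 */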
256 
257 static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
258 
259 void
260 spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
261 {
262 	*opts = g_bdev_opts;
263 }
264 
265 int
266 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
267 {
268 	uint32_t min_pool_size;
269 
270 	/*
271 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
272 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
273 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
274 	 */
275 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
276 	if (opts->bdev_io_pool_size < min_pool_size) {
277 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
278 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
279 			    spdk_thread_get_count());
280 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
281 		return -1;
282 	}
283 
284 	g_bdev_opts = *opts;
285 	return 0;
286 }
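
/*
 * Illustrative usage sketch (values are arbitrary, not part of this file):
 * adjust the global bdev options before spdk_bdev_initialize() is called,
 * since the bdev_io pool is sized from g_bdev_opts at initialization time.
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts);
 *	opts.bdev_io_pool_size = 128 * 1024;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// pool size is too small for the cache size and thread count
 *	}
 */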
287 
288 struct spdk_bdev *
289 spdk_bdev_first(void)
290 {
291 	struct spdk_bdev *bdev;
292 
293 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
294 	if (bdev) {
295 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
296 	}
297 
298 	return bdev;
299 }
300 
301 struct spdk_bdev *
302 spdk_bdev_next(struct spdk_bdev *prev)
303 {
304 	struct spdk_bdev *bdev;
305 
306 	bdev = TAILQ_NEXT(prev, internal.link);
307 	if (bdev) {
308 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
309 	}
310 
311 	return bdev;
312 }
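
/*
 * Illustrative sketch: walking every registered bdev with the iterators
 * above (printf() stands in for whatever the caller wants to do).
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) {
 *		printf("%s\n", spdk_bdev_get_name(bdev));
 *	}
 */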
313 
314 static struct spdk_bdev *
315 _bdev_next_leaf(struct spdk_bdev *bdev)
316 {
317 	while (bdev != NULL) {
318 		if (bdev->internal.claim_module == NULL) {
319 			return bdev;
320 		} else {
321 			bdev = TAILQ_NEXT(bdev, internal.link);
322 		}
323 	}
324 
325 	return bdev;
326 }
327 
328 struct spdk_bdev *
329 spdk_bdev_first_leaf(void)
330 {
331 	struct spdk_bdev *bdev;
332 
333 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
334 
335 	if (bdev) {
336 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
337 	}
338 
339 	return bdev;
340 }
341 
342 struct spdk_bdev *
343 spdk_bdev_next_leaf(struct spdk_bdev *prev)
344 {
345 	struct spdk_bdev *bdev;
346 
347 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
348 
349 	if (bdev) {
350 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
351 	}
352 
353 	return bdev;
354 }
355 
356 struct spdk_bdev *
357 spdk_bdev_get_by_name(const char *bdev_name)
358 {
359 	struct spdk_bdev_alias *tmp;
360 	struct spdk_bdev *bdev = spdk_bdev_first();
361 
362 	while (bdev != NULL) {
363 		if (strcmp(bdev_name, bdev->name) == 0) {
364 			return bdev;
365 		}
366 
367 		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
368 			if (strcmp(bdev_name, tmp->alias) == 0) {
369 				return bdev;
370 			}
371 		}
372 
373 		bdev = spdk_bdev_next(bdev);
374 	}
375 
376 	return NULL;
377 }
378 
379 void
380 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
381 {
382 	struct iovec *iovs;
383 
384 	iovs = bdev_io->u.bdev.iovs;
385 
386 	assert(iovs != NULL);
387 	assert(bdev_io->u.bdev.iovcnt >= 1);
388 
389 	iovs[0].iov_base = buf;
390 	iovs[0].iov_len = len;
391 }
392 
393 static void
394 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
395 {
396 	struct spdk_mempool *pool;
397 	struct spdk_bdev_io *tmp;
398 	void *buf, *aligned_buf;
399 	bdev_io_stailq_t *stailq;
400 	struct spdk_bdev_mgmt_channel *ch;
401 
402 	assert(bdev_io->u.bdev.iovcnt == 1);
403 
404 	buf = bdev_io->internal.buf;
405 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
406 
407 	bdev_io->internal.buf = NULL;
408 
409 	if (bdev_io->internal.buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
410 		pool = g_bdev_mgr.buf_small_pool;
411 		stailq = &ch->need_buf_small;
412 	} else {
413 		pool = g_bdev_mgr.buf_large_pool;
414 		stailq = &ch->need_buf_large;
415 	}
416 
417 	if (STAILQ_EMPTY(stailq)) {
418 		spdk_mempool_put(pool, buf);
419 	} else {
420 		tmp = STAILQ_FIRST(stailq);
421 
422 		aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL);
423 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, tmp->internal.buf_len);
424 
425 		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
426 		tmp->internal.buf = buf;
427 		tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp);
428 	}
429 }
430 
431 void
432 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
433 {
434 	struct spdk_mempool *pool;
435 	bdev_io_stailq_t *stailq;
436 	void *buf, *aligned_buf;
437 	struct spdk_bdev_mgmt_channel *mgmt_ch;
438 
439 	assert(cb != NULL);
440 	assert(bdev_io->u.bdev.iovs != NULL);
441 
442 	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
443 		/* Buffer already present */
444 		cb(bdev_io->internal.ch->channel, bdev_io);
445 		return;
446 	}
447 
448 	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
449 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
450 
451 	bdev_io->internal.buf_len = len;
452 	bdev_io->internal.get_buf_cb = cb;
453 	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
454 		pool = g_bdev_mgr.buf_small_pool;
455 		stailq = &mgmt_ch->need_buf_small;
456 	} else {
457 		pool = g_bdev_mgr.buf_large_pool;
458 		stailq = &mgmt_ch->need_buf_large;
459 	}
460 
461 	buf = spdk_mempool_get(pool);
462 
463 	if (!buf) {
464 		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
465 	} else {
466 		aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL);
467 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
468 
469 		bdev_io->internal.buf = buf;
470 		bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io);
471 	}
472 }
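
/*
 * Typical usage sketch (my_read_with_buf is a hypothetical bdev module
 * callback, not part of this file): request a data buffer for a read whose
 * iovec has no buffer attached yet.  The callback receives the channel and
 * the bdev_io once iovs[0] points at an aligned buffer.
 *
 *	static void
 *	my_read_with_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		// bdev_io->u.bdev.iovs[0] now points at an aligned buffer
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, my_read_with_buf,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */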
473 
474 static int
475 spdk_bdev_module_get_max_ctx_size(void)
476 {
477 	struct spdk_bdev_module *bdev_module;
478 	int max_bdev_module_size = 0;
479 
480 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
481 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
482 			max_bdev_module_size = bdev_module->get_ctx_size();
483 		}
484 	}
485 
486 	return max_bdev_module_size;
487 }
488 
489 void
490 spdk_bdev_config_text(FILE *fp)
491 {
492 	struct spdk_bdev_module *bdev_module;
493 
494 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
495 		if (bdev_module->config_text) {
496 			bdev_module->config_text(fp);
497 		}
498 	}
499 }
500 
501 void
502 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
503 {
504 	struct spdk_bdev_module *bdev_module;
505 	struct spdk_bdev *bdev;
506 
507 	assert(w != NULL);
508 
509 	spdk_json_write_array_begin(w);
510 
511 	spdk_json_write_object_begin(w);
512 	spdk_json_write_named_string(w, "method", "set_bdev_options");
513 	spdk_json_write_name(w, "params");
514 	spdk_json_write_object_begin(w);
515 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
516 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
517 	spdk_json_write_object_end(w);
518 	spdk_json_write_object_end(w);
519 
520 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
521 		if (bdev_module->config_json) {
522 			bdev_module->config_json(w);
523 		}
524 	}
525 
526 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
527 		spdk_bdev_config_json(bdev, w);
528 	}
529 
530 	spdk_json_write_array_end(w);
531 }
532 
533 static int
534 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
535 {
536 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
537 	struct spdk_bdev_io *bdev_io;
538 	uint32_t i;
539 
540 	STAILQ_INIT(&ch->need_buf_small);
541 	STAILQ_INIT(&ch->need_buf_large);
542 
543 	STAILQ_INIT(&ch->per_thread_cache);
544 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
545 
546 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
547 	ch->per_thread_cache_count = 0;
548 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
549 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
550 		assert(bdev_io != NULL);
551 		ch->per_thread_cache_count++;
552 		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
553 	}
554 
555 	TAILQ_INIT(&ch->shared_resources);
556 	TAILQ_INIT(&ch->io_wait_queue);
557 
558 	return 0;
559 }
560 
561 static void
562 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
563 {
564 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
565 	struct spdk_bdev_io *bdev_io;
566 
567 	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
568 		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
569 	}
570 
571 	if (!TAILQ_EMPTY(&ch->shared_resources)) {
572 		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
573 	}
574 
575 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
576 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
577 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
578 		ch->per_thread_cache_count--;
579 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
580 	}
581 
582 	assert(ch->per_thread_cache_count == 0);
583 }
584 
585 static void
586 spdk_bdev_init_complete(int rc)
587 {
588 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
589 	void *cb_arg = g_init_cb_arg;
590 	struct spdk_bdev_module *m;
591 
592 	g_bdev_mgr.init_complete = true;
593 	g_init_cb_fn = NULL;
594 	g_init_cb_arg = NULL;
595 
596 	/*
597 	 * For modules that need to know when subsystem init is complete,
598 	 * inform them now.
599 	 */
600 	if (rc == 0) {
601 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
602 			if (m->init_complete) {
603 				m->init_complete();
604 			}
605 		}
606 	}
607 
608 	cb_fn(cb_arg, rc);
609 }
610 
611 static void
612 spdk_bdev_module_action_complete(void)
613 {
614 	struct spdk_bdev_module *m;
615 
616 	/*
617 	 * Don't finish bdev subsystem initialization if
618 	 * module pre-initialization is still in progress, or
619 	 * the subsystem has already been initialized.
620 	 */
621 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
622 		return;
623 	}
624 
625 	/*
626 	 * Check all bdev modules for inits/examinations in progress. If any
627 	 * exist, return immediately since we cannot finish bdev subsystem
628 	 * initialization until all are completed.
629 	 */
630 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
631 		if (m->internal.action_in_progress > 0) {
632 			return;
633 		}
634 	}
635 
636 	/*
637 	 * Modules already finished initialization - now that all
638 	 * the bdev modules have finished their asynchronous I/O
639 	 * processing, the entire bdev layer can be marked as complete.
640 	 */
641 	spdk_bdev_init_complete(0);
642 }
643 
644 static void
645 spdk_bdev_module_action_done(struct spdk_bdev_module *module)
646 {
647 	assert(module->internal.action_in_progress > 0);
648 	module->internal.action_in_progress--;
649 	spdk_bdev_module_action_complete();
650 }
651 
652 void
653 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
654 {
655 	spdk_bdev_module_action_done(module);
656 }
657 
658 void
659 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
660 {
661 	spdk_bdev_module_action_done(module);
662 }
663 
664 static int
665 spdk_bdev_modules_init(void)
666 {
667 	struct spdk_bdev_module *module;
668 	int rc = 0;
669 
670 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
671 		rc = module->module_init();
672 		if (rc != 0) {
673 			break;
674 		}
675 	}
676 
677 	g_bdev_mgr.module_init_complete = true;
678 	return rc;
679 }
680 
681 
682 static void
683 spdk_bdev_init_failed_complete(void *cb_arg)
684 {
685 	spdk_bdev_init_complete(-1);
686 }
687 
688 static void
689 spdk_bdev_init_failed(void *cb_arg)
690 {
691 	spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL);
692 }
693 
694 void
695 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
696 {
697 	struct spdk_conf_section *sp;
698 	struct spdk_bdev_opts bdev_opts;
699 	int32_t bdev_io_pool_size, bdev_io_cache_size;
700 	int cache_size;
701 	int rc = 0;
702 	char mempool_name[32];
703 
704 	assert(cb_fn != NULL);
705 
706 	sp = spdk_conf_find_section(NULL, "Bdev");
707 	if (sp != NULL) {
708 		spdk_bdev_get_opts(&bdev_opts);
709 
710 		bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
711 		if (bdev_io_pool_size >= 0) {
712 			bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
713 		}
714 
715 		bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
716 		if (bdev_io_cache_size >= 0) {
717 			bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
718 		}
719 
720 		if (spdk_bdev_set_opts(&bdev_opts)) {
721 			spdk_bdev_init_complete(-1);
722 			return;
723 		}
724 
725 		assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
726 	}
727 
728 	g_init_cb_fn = cb_fn;
729 	g_init_cb_arg = cb_arg;
730 
731 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
732 
733 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
734 				  g_bdev_opts.bdev_io_pool_size,
735 				  sizeof(struct spdk_bdev_io) +
736 				  spdk_bdev_module_get_max_ctx_size(),
737 				  0,
738 				  SPDK_ENV_SOCKET_ID_ANY);
739 
740 	if (g_bdev_mgr.bdev_io_pool == NULL) {
741 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
742 		spdk_bdev_init_complete(-1);
743 		return;
744 	}
745 
746 	/**
747 	 * Ensure no more than half of the total buffers end up in local caches, by
748 	 *   using spdk_thread_get_count() to determine how many local caches we need
749 	 *   to account for.
750 	 */
751 	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count());
752 	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());
753 
754 	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
755 				    BUF_SMALL_POOL_SIZE,
756 				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
757 				    cache_size,
758 				    SPDK_ENV_SOCKET_ID_ANY);
759 	if (!g_bdev_mgr.buf_small_pool) {
760 		SPDK_ERRLOG("create rbuf small pool failed\n");
761 		spdk_bdev_init_complete(-1);
762 		return;
763 	}
764 
765 	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count());
766 	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());
767 
768 	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
769 				    BUF_LARGE_POOL_SIZE,
770 				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
771 				    cache_size,
772 				    SPDK_ENV_SOCKET_ID_ANY);
773 	if (!g_bdev_mgr.buf_large_pool) {
774 		SPDK_ERRLOG("create rbuf large pool failed\n");
775 		spdk_bdev_init_complete(-1);
776 		return;
777 	}
778 
779 	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
780 				 NULL);
781 	if (!g_bdev_mgr.zero_buffer) {
782 		SPDK_ERRLOG("create bdev zero buffer failed\n");
783 		spdk_bdev_init_complete(-1);
784 		return;
785 	}
786 
787 #ifdef SPDK_CONFIG_VTUNE
788 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
789 #endif
790 
791 	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
792 				spdk_bdev_mgmt_channel_destroy,
793 				sizeof(struct spdk_bdev_mgmt_channel));
794 
795 	rc = spdk_bdev_modules_init();
796 	if (rc != 0) {
797 		SPDK_ERRLOG("bdev modules init failed\n");
798 		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL);
799 		return;
800 	}
801 
802 	spdk_bdev_module_action_complete();
803 }
804 
805 static void
806 spdk_bdev_mgr_unregister_cb(void *io_device)
807 {
808 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
809 
810 	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
811 		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
812 			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
813 			    g_bdev_opts.bdev_io_pool_size);
814 	}
815 
816 	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
817 		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
818 			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
819 			    BUF_SMALL_POOL_SIZE);
820 		assert(false);
821 	}
822 
823 	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
824 		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
825 			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
826 			    BUF_LARGE_POOL_SIZE);
827 		assert(false);
828 	}
829 
830 	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
831 	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
832 	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
833 	spdk_dma_free(g_bdev_mgr.zero_buffer);
834 
835 	cb_fn(g_fini_cb_arg);
836 	g_fini_cb_fn = NULL;
837 	g_fini_cb_arg = NULL;
838 }
839 
840 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
841 
842 static void
843 spdk_bdev_module_finish_iter(void *arg)
844 {
845 	struct spdk_bdev_module *bdev_module;
846 
847 	/* Start iterating from the last touched module */
848 	if (!g_resume_bdev_module) {
849 		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
850 	} else {
851 		bdev_module = TAILQ_NEXT(g_resume_bdev_module, internal.tailq);
852 	}
853 
854 	while (bdev_module) {
855 		if (bdev_module->async_fini) {
856 			/* Save our place so we can resume later. We must
857 			 * save the variable here, before calling module_fini()
858 			 * below, because in some cases the module may immediately
859 			 * call spdk_bdev_module_finish_done() and re-enter
860 			 * this function to continue iterating. */
861 			g_resume_bdev_module = bdev_module;
862 		}
863 
864 		if (bdev_module->module_fini) {
865 			bdev_module->module_fini();
866 		}
867 
868 		if (bdev_module->async_fini) {
869 			return;
870 		}
871 
872 		bdev_module = TAILQ_NEXT(bdev_module, internal.tailq);
873 	}
874 
875 	g_resume_bdev_module = NULL;
876 	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
877 }
878 
879 void
880 spdk_bdev_module_finish_done(void)
881 {
882 	if (spdk_get_thread() != g_fini_thread) {
883 		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
884 	} else {
885 		spdk_bdev_module_finish_iter(NULL);
886 	}
887 }
888 
889 static void
890 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
891 {
892 	struct spdk_bdev *bdev = cb_arg;
893 
894 	if (bdeverrno && bdev) {
895 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
896 			     bdev->name);
897 
898 		/*
899 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
900 		 *  bdev. Try to continue by manually removing this bdev from the list and moving
901 		 *  on to the next bdev in the list.
902 		 */
903 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
904 	}
905 
906 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
907 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
908 		/*
909 		 * Bdev module finish needs to be deferred, as we might be in the middle of some context
910 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
911 		 * after returning.
912 		 */
913 		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
914 		return;
915 	}
916 
917 	/*
918 	 * Unregister the first bdev in the list.
919 	 *
920 	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
921 	 *  calling the remove_cb of the descriptors first.
922 	 *
923 	 * Once this bdev and all of its open descriptors have been cleaned up, this function
924 	 *  will be called again via the unregister completion callback to continue the cleanup
925 	 *  process with the next bdev.
926 	 */
927 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
928 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
929 	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
930 }
931 
932 void
933 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
934 {
935 	struct spdk_bdev_module *m;
936 
937 	assert(cb_fn != NULL);
938 
939 	g_fini_thread = spdk_get_thread();
940 
941 	g_fini_cb_fn = cb_fn;
942 	g_fini_cb_arg = cb_arg;
943 
944 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
945 		if (m->fini_start) {
946 			m->fini_start();
947 		}
948 	}
949 
950 	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
951 }
952 
953 static struct spdk_bdev_io *
954 spdk_bdev_get_io(struct spdk_bdev_channel *channel)
955 {
956 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
957 	struct spdk_bdev_io *bdev_io;
958 
959 	if (ch->per_thread_cache_count > 0) {
960 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
961 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
962 		ch->per_thread_cache_count--;
963 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
964 		/*
965 		 * Don't try to look for bdev_ios in the global pool if there are
966 		 * waiters on bdev_ios - we don't want this caller to jump the line.
967 		 */
968 		bdev_io = NULL;
969 	} else {
970 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
971 	}
972 
973 	return bdev_io;
974 }
975 
976 void
977 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
978 {
979 	struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
980 
981 	assert(bdev_io != NULL);
982 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
983 
984 	if (bdev_io->internal.buf != NULL) {
985 		spdk_bdev_io_put_buf(bdev_io);
986 	}
987 
988 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
989 		ch->per_thread_cache_count++;
990 		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
991 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
992 			struct spdk_bdev_io_wait_entry *entry;
993 
994 			entry = TAILQ_FIRST(&ch->io_wait_queue);
995 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
996 			entry->cb_fn(entry->cb_arg);
997 		}
998 	} else {
999 		/* We should never have a full cache with entries on the io wait queue. */
1000 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
1001 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1002 	}
1003 }
1004 
1005 static uint64_t
1006 _spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
1007 {
1008 	struct spdk_bdev	*bdev = bdev_io->bdev;
1009 
1010 	switch (bdev_io->type) {
1011 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1012 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1013 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1014 		return bdev_io->u.nvme_passthru.nbytes;
1015 	case SPDK_BDEV_IO_TYPE_READ:
1016 	case SPDK_BDEV_IO_TYPE_WRITE:
1017 	case SPDK_BDEV_IO_TYPE_UNMAP:
1018 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1019 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
1020 	default:
1021 		return 0;
1022 	}
1023 }
1024 
1025 static void
1026 _spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
1027 {
1028 	struct spdk_bdev_io		*bdev_io = NULL;
1029 	struct spdk_bdev		*bdev = ch->bdev;
1030 	struct spdk_bdev_qos		*qos = bdev->internal.qos;
1031 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
1032 
1033 	while (!TAILQ_EMPTY(&qos->queued)) {
1034 		if (qos->max_ios_per_timeslice > 0 &&
1035 		    qos->io_submitted_this_timeslice >= qos->max_ios_per_timeslice) {
1036 			break;
1037 		}
1038 
1039 		if (qos->max_byte_per_timeslice > 0 &&
1040 		    qos->byte_submitted_this_timeslice >= qos->max_byte_per_timeslice) {
1041 			break;
1042 		}
1043 
1044 		bdev_io = TAILQ_FIRST(&qos->queued);
1045 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
1046 		qos->io_submitted_this_timeslice++;
1047 		qos->byte_submitted_this_timeslice += _spdk_bdev_get_io_size_in_byte(bdev_io);
1048 		ch->io_outstanding++;
1049 		shared_resource->io_outstanding++;
1050 		bdev->fn_table->submit_request(ch->channel, bdev_io);
1051 	}
1052 }
1053 
1054 static void
1055 _spdk_bdev_io_submit(void *ctx)
1056 {
1057 	struct spdk_bdev_io *bdev_io = ctx;
1058 	struct spdk_bdev *bdev = bdev_io->bdev;
1059 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1060 	struct spdk_io_channel *ch = bdev_ch->channel;
1061 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1062 
1063 	bdev_io->internal.submit_tsc = spdk_get_ticks();
1064 	bdev_ch->io_outstanding++;
1065 	shared_resource->io_outstanding++;
1066 	bdev_io->internal.in_submit_request = true;
1067 	if (spdk_likely(bdev_ch->flags == 0)) {
1068 		if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
1069 			bdev->fn_table->submit_request(ch, bdev_io);
1070 		} else {
1071 			bdev_ch->io_outstanding--;
1072 			shared_resource->io_outstanding--;
1073 			TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
1074 		}
1075 	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
1076 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1077 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
1078 		bdev_ch->io_outstanding--;
1079 		shared_resource->io_outstanding--;
1080 		TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
1081 		_spdk_bdev_qos_io_submit(bdev_ch);
1082 	} else {
1083 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
1084 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1085 	}
1086 	bdev_io->internal.in_submit_request = false;
1087 }
1088 
1089 static void
1090 spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
1091 {
1092 	struct spdk_bdev *bdev = bdev_io->bdev;
1093 	struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
1094 
1095 	assert(thread != NULL);
1096 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
1097 
1098 	if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) {
1099 		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
1100 			_spdk_bdev_io_submit(bdev_io);
1101 		} else {
1102 			bdev_io->internal.io_submit_ch = bdev_io->internal.ch;
1103 			bdev_io->internal.ch = bdev->internal.qos->ch;
1104 			spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io);
1105 		}
1106 	} else {
1107 		_spdk_bdev_io_submit(bdev_io);
1108 	}
1109 }
1110 
1111 static void
1112 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
1113 {
1114 	struct spdk_bdev *bdev = bdev_io->bdev;
1115 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1116 	struct spdk_io_channel *ch = bdev_ch->channel;
1117 
1118 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
1119 
1120 	bdev_io->internal.in_submit_request = true;
1121 	bdev->fn_table->submit_request(ch, bdev_io);
1122 	bdev_io->internal.in_submit_request = false;
1123 }
1124 
1125 static void
1126 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
1127 		  struct spdk_bdev *bdev, void *cb_arg,
1128 		  spdk_bdev_io_completion_cb cb)
1129 {
1130 	bdev_io->bdev = bdev;
1131 	bdev_io->internal.caller_ctx = cb_arg;
1132 	bdev_io->internal.cb = cb;
1133 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1134 	bdev_io->internal.in_submit_request = false;
1135 	bdev_io->internal.buf = NULL;
1136 	bdev_io->internal.io_submit_ch = NULL;
1137 }
1138 
1139 static bool
1140 _spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
1141 {
1142 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
1143 }
1144 
1145 bool
1146 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
1147 {
1148 	bool supported;
1149 
1150 	supported = _spdk_bdev_io_type_supported(bdev, io_type);
1151 
1152 	if (!supported) {
1153 		switch (io_type) {
1154 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1155 			/* The bdev layer will emulate write zeroes as long as write is supported. */
1156 			supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
1157 			break;
1158 		default:
1159 			break;
1160 		}
1161 	}
1162 
1163 	return supported;
1164 }
1165 
1166 int
1167 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1168 {
1169 	if (bdev->fn_table->dump_info_json) {
1170 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
1171 	}
1172 
1173 	return 0;
1174 }
1175 
1176 void
1177 spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1178 {
1179 	assert(bdev != NULL);
1180 	assert(w != NULL);
1181 
1182 	if (bdev->fn_table->write_config_json) {
1183 		bdev->fn_table->write_config_json(bdev, w);
1184 	} else {
1185 		spdk_json_write_object_begin(w);
1186 		spdk_json_write_named_string(w, "name", bdev->name);
1187 		spdk_json_write_object_end(w);
1188 	}
1189 }
1190 
1191 static void
1192 spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
1193 {
1194 	uint64_t max_ios_per_timeslice = 0, max_byte_per_timeslice = 0;
1195 
1196 	if (qos->iops_rate_limit > 0) {
1197 		max_ios_per_timeslice = qos->iops_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
1198 					SPDK_BDEV_SEC_TO_USEC;
1199 		qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice,
1200 						      SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
1201 	}
1202 
1203 	if (qos->byte_rate_limit > 0) {
1204 		max_byte_per_timeslice = qos->byte_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
1205 					 SPDK_BDEV_SEC_TO_USEC;
1206 		qos->max_byte_per_timeslice = spdk_max(max_byte_per_timeslice,
1207 						       SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE);
1208 	}
1209 }
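
/*
 * Worked example: with iops_rate_limit = 10000 and the 1000 usec timeslice
 * defined above, max_ios_per_timeslice = 10000 * 1000 / 1000000 = 10 I/O per
 * timeslice.  Similarly, a byte_rate_limit of 10 MiB/s gives roughly
 * 10485760 * 1000 / 1000000 = 10485 bytes per timeslice, well above the
 * SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE floor of 512.
 */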
1210 
1211 static int
1212 spdk_bdev_channel_poll_qos(void *arg)
1213 {
1214 	struct spdk_bdev_qos *qos = arg;
1215 
1216 	/* Reset for next round of rate limiting */
1217 	qos->io_submitted_this_timeslice = 0;
1218 
1219 	/* More bytes sent in the last timeslice, allow less in this timeslice */
1220 	if (qos->byte_submitted_this_timeslice > qos->max_byte_per_timeslice) {
1221 		qos->byte_submitted_this_timeslice -= qos->max_byte_per_timeslice;
1222 	} else {
1223 		qos->byte_submitted_this_timeslice = 0;
1224 	}
1225 
1226 	_spdk_bdev_qos_io_submit(qos->ch);
1227 
1228 	return -1;
1229 }
1230 
1231 static void
1232 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
1233 {
1234 	struct spdk_bdev_shared_resource *shared_resource;
1235 
1236 	if (!ch) {
1237 		return;
1238 	}
1239 
1240 	if (ch->channel) {
1241 		spdk_put_io_channel(ch->channel);
1242 	}
1243 
1244 	assert(ch->io_outstanding == 0);
1245 
1246 	shared_resource = ch->shared_resource;
1247 	if (shared_resource) {
1248 		assert(ch->io_outstanding == 0);
1249 		assert(shared_resource->ref > 0);
1250 		shared_resource->ref--;
1251 		if (shared_resource->ref == 0) {
1252 			assert(shared_resource->io_outstanding == 0);
1253 			TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
1254 			spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
1255 			free(shared_resource);
1256 		}
1257 	}
1258 }
1259 
1260 /* Caller must hold bdev->internal.mutex. */
1261 static void
1262 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
1263 {
1264 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1265 
1266 	/* Rate limiting on this bdev enabled */
1267 	if (qos) {
1268 		if (qos->ch == NULL) {
1269 			struct spdk_io_channel *io_ch;
1270 
1271 			SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
1272 				      bdev->name, spdk_get_thread());
1273 
1274 			/* No qos channel has been selected, so set one up */
1275 
1276 			/* Take another reference to ch */
1277 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
1278 			qos->ch = ch;
1279 
1280 			qos->thread = spdk_io_channel_get_thread(io_ch);
1281 
1282 			TAILQ_INIT(&qos->queued);
1283 			spdk_bdev_qos_update_max_quota_per_timeslice(qos);
1284 			qos->io_submitted_this_timeslice = 0;
1285 			qos->byte_submitted_this_timeslice = 0;
1286 
1287 			qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos,
1288 							   qos,
1289 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
1290 		}
1291 
1292 		ch->flags |= BDEV_CH_QOS_ENABLED;
1293 	}
1294 }
1295 
1296 static int
1297 spdk_bdev_channel_create(void *io_device, void *ctx_buf)
1298 {
1299 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
1300 	struct spdk_bdev_channel	*ch = ctx_buf;
1301 	struct spdk_io_channel		*mgmt_io_ch;
1302 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
1303 	struct spdk_bdev_shared_resource *shared_resource;
1304 
1305 	ch->bdev = bdev;
1306 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
1307 	if (!ch->channel) {
1308 		return -1;
1309 	}
1310 
1311 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
1312 	if (!mgmt_io_ch) {
1313 		return -1;
1314 	}
1315 
1316 	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
1317 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
1318 		if (shared_resource->shared_ch == ch->channel) {
1319 			spdk_put_io_channel(mgmt_io_ch);
1320 			shared_resource->ref++;
1321 			break;
1322 		}
1323 	}
1324 
1325 	if (shared_resource == NULL) {
1326 		shared_resource = calloc(1, sizeof(*shared_resource));
1327 		if (shared_resource == NULL) {
1328 			spdk_put_io_channel(mgmt_io_ch);
1329 			return -1;
1330 		}
1331 
1332 		shared_resource->mgmt_ch = mgmt_ch;
1333 		shared_resource->io_outstanding = 0;
1334 		TAILQ_INIT(&shared_resource->nomem_io);
1335 		shared_resource->nomem_threshold = 0;
1336 		shared_resource->shared_ch = ch->channel;
1337 		shared_resource->ref = 1;
1338 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
1339 	}
1340 
1341 	memset(&ch->stat, 0, sizeof(ch->stat));
1342 	ch->stat.ticks_rate = spdk_get_ticks_hz();
1343 	ch->io_outstanding = 0;
1344 	TAILQ_INIT(&ch->queued_resets);
1345 	ch->flags = 0;
1346 	ch->shared_resource = shared_resource;
1347 
1348 #ifdef SPDK_CONFIG_VTUNE
1349 	{
1350 		char *name;
1351 		__itt_init_ittlib(NULL, 0);
1352 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
1353 		if (!name) {
1354 			_spdk_bdev_channel_destroy_resource(ch);
1355 			return -1;
1356 		}
1357 		ch->handle = __itt_string_handle_create(name);
1358 		free(name);
1359 		ch->start_tsc = spdk_get_ticks();
1360 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
1361 		memset(&ch->prev_stat, 0, sizeof(ch->prev_stat));
1362 	}
1363 #endif
1364 
1365 	pthread_mutex_lock(&bdev->internal.mutex);
1366 	_spdk_bdev_enable_qos(bdev, ch);
1367 	pthread_mutex_unlock(&bdev->internal.mutex);
1368 
1369 	return 0;
1370 }
1371 
1372 /*
1373  * Abort I/O that are waiting on a data buffer.  These types of I/O are
1374  *  linked using the spdk_bdev_io internal.buf_link STAILQ_ENTRY.
1375  */
1376 static void
1377 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
1378 {
1379 	bdev_io_stailq_t tmp;
1380 	struct spdk_bdev_io *bdev_io;
1381 
1382 	STAILQ_INIT(&tmp);
1383 
1384 	while (!STAILQ_EMPTY(queue)) {
1385 		bdev_io = STAILQ_FIRST(queue);
1386 		STAILQ_REMOVE_HEAD(queue, internal.buf_link);
1387 		if (bdev_io->internal.ch == ch) {
1388 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1389 		} else {
1390 			STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link);
1391 		}
1392 	}
1393 
1394 	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
1395 }
1396 
1397 /*
1398  * Abort I/O that are queued waiting for submission.  These types of I/O are
1399  *  linked using the spdk_bdev_io internal.link TAILQ_ENTRY.
1400  */
1401 static void
1402 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
1403 {
1404 	struct spdk_bdev_io *bdev_io, *tmp;
1405 
1406 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
1407 		if (bdev_io->internal.ch == ch) {
1408 			TAILQ_REMOVE(queue, bdev_io, internal.link);
1409 			/*
1410 			 * spdk_bdev_io_complete() assumes that the completed I/O had
1411 			 *  been submitted to the bdev module.  Since in this case it
1412 			 *  hadn't, bump io_outstanding to account for the decrement
1413 			 *  that spdk_bdev_io_complete() will do.
1414 			 */
1415 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
1416 				ch->io_outstanding++;
1417 				ch->shared_resource->io_outstanding++;
1418 			}
1419 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1420 		}
1421 	}
1422 }
1423 
1424 static void
1425 spdk_bdev_qos_channel_destroy(void *cb_arg)
1426 {
1427 	struct spdk_bdev_qos *qos = cb_arg;
1428 
1429 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
1430 	spdk_poller_unregister(&qos->poller);
1431 
1432 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos);
1433 
1434 	free(qos);
1435 }
1436 
1437 static int
1438 spdk_bdev_qos_destroy(struct spdk_bdev *bdev)
1439 {
1440 	/*
1441 	 * Cleanly shutting down the QoS poller is tricky, because
1442 	 * during the asynchronous operation the user could open
1443 	 * a new descriptor and create a new channel, spawning
1444 	 * a new QoS poller.
1445 	 *
1446 	 * The strategy is to create a new QoS structure here and swap it
1447 	 * in. The shutdown path then continues to refer to the old one
1448 	 * until it completes and then releases it.
1449 	 */
1450 	struct spdk_bdev_qos *new_qos, *old_qos;
1451 
1452 	old_qos = bdev->internal.qos;
1453 
1454 	new_qos = calloc(1, sizeof(*new_qos));
1455 	if (!new_qos) {
1456 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
1457 		return -ENOMEM;
1458 	}
1459 
1460 	/* Copy the old QoS data into the newly allocated structure */
1461 	memcpy(new_qos, old_qos, sizeof(*new_qos));
1462 
1463 	/* Zero out the key parts of the QoS structure */
1464 	new_qos->ch = NULL;
1465 	new_qos->thread = NULL;
1466 	new_qos->max_ios_per_timeslice = 0;
1467 	new_qos->max_byte_per_timeslice = 0;
1468 	new_qos->io_submitted_this_timeslice = 0;
1469 	new_qos->byte_submitted_this_timeslice = 0;
1470 	new_qos->poller = NULL;
1471 	TAILQ_INIT(&new_qos->queued);
1472 
1473 	bdev->internal.qos = new_qos;
1474 
1475 	if (old_qos->thread == NULL) {
1476 		free(old_qos);
1477 	} else {
1478 		spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy,
1479 				     old_qos);
1480 	}
1481 
1482 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
1483 	 * been destroyed yet. The destruction path will end up waiting for the final
1484 	 * channel to be put before it releases resources. */
1485 
1486 	return 0;
1487 }
1488 
1489 static void
1490 _spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
1491 {
1492 	total->bytes_read += add->bytes_read;
1493 	total->num_read_ops += add->num_read_ops;
1494 	total->bytes_written += add->bytes_written;
1495 	total->num_write_ops += add->num_write_ops;
1496 	total->read_latency_ticks += add->read_latency_ticks;
1497 	total->write_latency_ticks += add->write_latency_ticks;
1498 }
1499 
1500 static void
1501 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
1502 {
1503 	struct spdk_bdev_channel	*ch = ctx_buf;
1504 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
1505 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
1506 
1507 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
1508 		      spdk_get_thread());
1509 
1510 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
1511 	pthread_mutex_lock(&ch->bdev->internal.mutex);
1512 	_spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat);
1513 	pthread_mutex_unlock(&ch->bdev->internal.mutex);
1514 
1515 	mgmt_ch = shared_resource->mgmt_ch;
1516 
1517 	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
1518 	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch);
1519 	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
1520 	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);
1521 
1522 	_spdk_bdev_channel_destroy_resource(ch);
1523 }
1524 
1525 int
1526 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
1527 {
1528 	struct spdk_bdev_alias *tmp;
1529 
1530 	if (alias == NULL) {
1531 		SPDK_ERRLOG("Empty alias passed\n");
1532 		return -EINVAL;
1533 	}
1534 
1535 	if (spdk_bdev_get_by_name(alias)) {
1536 		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
1537 		return -EEXIST;
1538 	}
1539 
1540 	tmp = calloc(1, sizeof(*tmp));
1541 	if (tmp == NULL) {
1542 		SPDK_ERRLOG("Unable to allocate alias\n");
1543 		return -ENOMEM;
1544 	}
1545 
1546 	tmp->alias = strdup(alias);
1547 	if (tmp->alias == NULL) {
1548 		free(tmp);
1549 		SPDK_ERRLOG("Unable to allocate alias\n");
1550 		return -ENOMEM;
1551 	}
1552 
1553 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
1554 
1555 	return 0;
1556 }
1557 
1558 int
1559 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
1560 {
1561 	struct spdk_bdev_alias *tmp;
1562 
1563 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
1564 		if (strcmp(alias, tmp->alias) == 0) {
1565 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
1566 			free(tmp->alias);
1567 			free(tmp);
1568 			return 0;
1569 		}
1570 	}
1571 
1572 	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);
1573 
1574 	return -ENOENT;
1575 }
1576 
1577 struct spdk_io_channel *
1578 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
1579 {
1580 	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
1581 }
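
/*
 * Typical application flow sketch (buf and read_done are placeholders
 * supplied by the caller; error handling omitted; spdk_bdev_open() is
 * declared in spdk/bdev.h):
 *
 *	struct spdk_bdev_desc *desc;
 *	struct spdk_io_channel *io_ch;
 *
 *	spdk_bdev_open(bdev, true, NULL, NULL, &desc);
 *	io_ch = spdk_bdev_get_io_channel(desc);
 *	spdk_bdev_read_blocks(desc, io_ch, buf, 0, 1, read_done, NULL);
 */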
1582 
1583 const char *
1584 spdk_bdev_get_name(const struct spdk_bdev *bdev)
1585 {
1586 	return bdev->name;
1587 }
1588 
1589 const char *
1590 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
1591 {
1592 	return bdev->product_name;
1593 }
1594 
1595 const struct spdk_bdev_aliases_list *
1596 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
1597 {
1598 	return &bdev->aliases;
1599 }
1600 
1601 uint32_t
1602 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
1603 {
1604 	return bdev->blocklen;
1605 }
1606 
1607 uint64_t
1608 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
1609 {
1610 	return bdev->blockcnt;
1611 }
1612 
1613 uint64_t
1614 spdk_bdev_get_qos_ios_per_sec(struct spdk_bdev *bdev)
1615 {
1616 	uint64_t iops_rate_limit = 0;
1617 
1618 	pthread_mutex_lock(&bdev->internal.mutex);
1619 	if (bdev->internal.qos) {
1620 		iops_rate_limit = bdev->internal.qos->iops_rate_limit;
1621 	}
1622 	pthread_mutex_unlock(&bdev->internal.mutex);
1623 
1624 	return iops_rate_limit;
1625 }
1626 
1627 size_t
1628 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
1629 {
1630 	/* TODO: push this logic down to the bdev modules */
1631 	if (bdev->need_aligned_buffer) {
1632 		return bdev->blocklen;
1633 	}
1634 
1635 	return 1;
1636 }
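
/*
 * Caller-side sketch: allocate an I/O buffer that satisfies the bdev's
 * alignment requirement (len is whatever the caller needs to transfer).
 *
 *	size_t align = spdk_bdev_get_buf_align(bdev);
 *	void *buf = spdk_dma_zmalloc(len, align, NULL);
 */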
1637 
1638 uint32_t
1639 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
1640 {
1641 	return bdev->optimal_io_boundary;
1642 }
1643 
1644 bool
1645 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
1646 {
1647 	return bdev->write_cache;
1648 }
1649 
1650 const struct spdk_uuid *
1651 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
1652 {
1653 	return &bdev->uuid;
1654 }
1655 
1656 uint64_t
1657 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
1658 {
1659 	return bdev->internal.measured_queue_depth;
1660 }
1661 
1662 uint64_t
1663 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
1664 {
1665 	return bdev->internal.period;
1666 }
1667 
1668 uint64_t
1669 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
1670 {
1671 	return bdev->internal.weighted_io_time;
1672 }
1673 
1674 uint64_t
1675 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
1676 {
1677 	return bdev->internal.io_time;
1678 }
1679 
1680 static void
1681 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status)
1682 {
1683 	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
1684 
1685 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
1686 
1687 	if (bdev->internal.measured_queue_depth) {
1688 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
1689 	}
1690 }
1691 
1692 static void
1693 _calculate_measured_qd(struct spdk_io_channel_iter *i)
1694 {
1695 	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
1696 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
1697 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch);
1698 
1699 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
1700 	spdk_for_each_channel_continue(i, 0);
1701 }
1702 
1703 static int
1704 spdk_bdev_calculate_measured_queue_depth(void *ctx)
1705 {
1706 	struct spdk_bdev *bdev = ctx;
1707 	bdev->internal.temporary_queue_depth = 0;
1708 	spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev,
1709 			      _calculate_measured_qd_cpl);
1710 	return 0;
1711 }
1712 
1713 void
1714 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
1715 {
1716 	bdev->internal.period = period;
1717 
1718 	if (bdev->internal.qd_poller != NULL) {
1719 		spdk_poller_unregister(&bdev->internal.qd_poller);
1720 		bdev->internal.measured_queue_depth = UINT64_MAX;
1721 	}
1722 
1723 	if (period != 0) {
1724 		bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev,
1725 					   period);
1726 	}
1727 }
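
/*
 * Usage sketch: sample the queue depth once per millisecond (the period is
 * in microseconds, as it is passed straight to spdk_poller_register()), then
 * read the most recent measurement at any later point.
 *
 *	spdk_bdev_set_qd_sampling_period(bdev, 1000);
 *	...
 *	uint64_t qd = spdk_bdev_get_qd(bdev);
 */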
1728 
1729 int
1730 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
1731 {
1732 	int ret;
1733 
1734 	pthread_mutex_lock(&bdev->internal.mutex);
1735 
1736 	/* bdev has open descriptors */
1737 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
1738 	    bdev->blockcnt > size) {
1739 		ret = -EBUSY;
1740 	} else {
1741 		bdev->blockcnt = size;
1742 		ret = 0;
1743 	}
1744 
1745 	pthread_mutex_unlock(&bdev->internal.mutex);
1746 
1747 	return ret;
1748 }
1749 
1750 /*
1751  * Convert I/O offset and length from bytes to blocks.
1752  *
1753  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
1754  */
1755 static uint64_t
1756 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
1757 			  uint64_t num_bytes, uint64_t *num_blocks)
1758 {
1759 	uint32_t block_size = bdev->blocklen;
1760 
1761 	*offset_blocks = offset_bytes / block_size;
1762 	*num_blocks = num_bytes / block_size;
1763 
1764 	return (offset_bytes % block_size) | (num_bytes % block_size);
1765 }
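
/*
 * Worked example: with blocklen = 512, offset_bytes = 4096 and num_bytes =
 * 1024 give offset_blocks = 8, num_blocks = 2, and a return value of 0.
 * If either byte value is not a multiple of 512, the OR of the remainders is
 * non-zero and the caller treats the request as invalid.
 */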
1766 
1767 static bool
1768 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
1769 {
1770 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
1771 	 * has been an overflow and hence the offset has wrapped around */
1772 	if (offset_blocks + num_blocks < offset_blocks) {
1773 		return false;
1774 	}
1775 
1776 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
1777 	if (offset_blocks + num_blocks > bdev->blockcnt) {
1778 		return false;
1779 	}
1780 
1781 	return true;
1782 }
1783 
1784 int
1785 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1786 	       void *buf, uint64_t offset, uint64_t nbytes,
1787 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
1788 {
1789 	uint64_t offset_blocks, num_blocks;
1790 
1791 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1792 		return -EINVAL;
1793 	}
1794 
1795 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
1796 }
1797 
1798 int
1799 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1800 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
1801 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
1802 {
1803 	struct spdk_bdev *bdev = desc->bdev;
1804 	struct spdk_bdev_io *bdev_io;
1805 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1806 
1807 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1808 		return -EINVAL;
1809 	}
1810 
1811 	bdev_io = spdk_bdev_get_io(channel);
1812 	if (!bdev_io) {
1813 		return -ENOMEM;
1814 	}
1815 
1816 	bdev_io->internal.ch = channel;
1817 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
1818 	bdev_io->u.bdev.iovs = &bdev_io->iov;
1819 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1820 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
1821 	bdev_io->u.bdev.iovcnt = 1;
1822 	bdev_io->u.bdev.num_blocks = num_blocks;
1823 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1824 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1825 
1826 	spdk_bdev_io_submit(bdev_io);
1827 	return 0;
1828 }
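
/*
 * Caller-side sketch for -ENOMEM handling (assumes the spdk_bdev_queue_io_wait()
 * helper and struct spdk_bdev_io_wait_entry declared in spdk/bdev.h, and a
 * caller context that embeds the wait entry so it stays valid until the
 * callback fires):
 *
 *	rc = spdk_bdev_read_blocks(desc, io_ch, buf, offset, num, read_done, ctx);
 *	if (rc == -ENOMEM) {
 *		ctx->wait_entry.bdev = bdev;
 *		ctx->wait_entry.cb_fn = retry_read;	// re-issues the read
 *		ctx->wait_entry.cb_arg = ctx;
 *		spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->wait_entry);
 *	}
 */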
1829 
1830 int
1831 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1832 		struct iovec *iov, int iovcnt,
1833 		uint64_t offset, uint64_t nbytes,
1834 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1835 {
1836 	uint64_t offset_blocks, num_blocks;
1837 
1838 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1839 		return -EINVAL;
1840 	}
1841 
1842 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
1843 }
1844 
1845 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1846 			   struct iovec *iov, int iovcnt,
1847 			   uint64_t offset_blocks, uint64_t num_blocks,
1848 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
1849 {
1850 	struct spdk_bdev *bdev = desc->bdev;
1851 	struct spdk_bdev_io *bdev_io;
1852 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1853 
1854 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1855 		return -EINVAL;
1856 	}
1857 
1858 	bdev_io = spdk_bdev_get_io(channel);
1859 	if (!bdev_io) {
1860 		return -ENOMEM;
1861 	}
1862 
1863 	bdev_io->internal.ch = channel;
1864 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
1865 	bdev_io->u.bdev.iovs = iov;
1866 	bdev_io->u.bdev.iovcnt = iovcnt;
1867 	bdev_io->u.bdev.num_blocks = num_blocks;
1868 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1869 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1870 
1871 	spdk_bdev_io_submit(bdev_io);
1872 	return 0;
1873 }
1874 
1875 int
1876 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1877 		void *buf, uint64_t offset, uint64_t nbytes,
1878 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1879 {
1880 	uint64_t offset_blocks, num_blocks;
1881 
1882 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1883 		return -EINVAL;
1884 	}
1885 
1886 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
1887 }
1888 
1889 int
1890 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1891 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
1892 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1893 {
1894 	struct spdk_bdev *bdev = desc->bdev;
1895 	struct spdk_bdev_io *bdev_io;
1896 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1897 
1898 	if (!desc->write) {
1899 		return -EBADF;
1900 	}
1901 
1902 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1903 		return -EINVAL;
1904 	}
1905 
1906 	bdev_io = spdk_bdev_get_io(channel);
1907 	if (!bdev_io) {
1908 		return -ENOMEM;
1909 	}
1910 
1911 	bdev_io->internal.ch = channel;
1912 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
1913 	bdev_io->u.bdev.iovs = &bdev_io->iov;
1914 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1915 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
1916 	bdev_io->u.bdev.iovcnt = 1;
1917 	bdev_io->u.bdev.num_blocks = num_blocks;
1918 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1919 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1920 
1921 	spdk_bdev_io_submit(bdev_io);
1922 	return 0;
1923 }
1924 
1925 int
1926 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1927 		 struct iovec *iov, int iovcnt,
1928 		 uint64_t offset, uint64_t len,
1929 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
1930 {
1931 	uint64_t offset_blocks, num_blocks;
1932 
1933 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
1934 		return -EINVAL;
1935 	}
1936 
1937 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
1938 }
1939 
1940 int
1941 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1942 			struct iovec *iov, int iovcnt,
1943 			uint64_t offset_blocks, uint64_t num_blocks,
1944 			spdk_bdev_io_completion_cb cb, void *cb_arg)
1945 {
1946 	struct spdk_bdev *bdev = desc->bdev;
1947 	struct spdk_bdev_io *bdev_io;
1948 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1949 
1950 	if (!desc->write) {
1951 		return -EBADF;
1952 	}
1953 
1954 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1955 		return -EINVAL;
1956 	}
1957 
1958 	bdev_io = spdk_bdev_get_io(channel);
1959 	if (!bdev_io) {
1960 		return -ENOMEM;
1961 	}
1962 
1963 	bdev_io->internal.ch = channel;
1964 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
1965 	bdev_io->u.bdev.iovs = iov;
1966 	bdev_io->u.bdev.iovcnt = iovcnt;
1967 	bdev_io->u.bdev.num_blocks = num_blocks;
1968 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1969 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1970 
1971 	spdk_bdev_io_submit(bdev_io);
1972 	return 0;
1973 }
1974 
1975 int
1976 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1977 		       uint64_t offset, uint64_t len,
1978 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1979 {
1980 	uint64_t offset_blocks, num_blocks;
1981 
1982 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
1983 		return -EINVAL;
1984 	}
1985 
1986 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
1987 }
1988 
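/*
 * Write-zeroes requests take one of two paths: if the backing module reports native
 * support for SPDK_BDEV_IO_TYPE_WRITE_ZEROES, the request is passed through unchanged;
 * otherwise it is emulated with regular writes from the shared zero buffer, split into
 * ZERO_BUFFER_SIZE chunks that spdk_bdev_write_zeroes_split() keeps resubmitting until
 * the full range has been covered.
 */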
1989 int
1990 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1991 			      uint64_t offset_blocks, uint64_t num_blocks,
1992 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
1993 {
1994 	struct spdk_bdev *bdev = desc->bdev;
1995 	struct spdk_bdev_io *bdev_io;
1996 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1997 	uint64_t len;
1998 	bool split_request = false;
1999 
2000 	if (!desc->write) {
2001 		return -EBADF;
2002 	}
2003 
2004 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
2005 		return -EINVAL;
2006 	}
2007 
2008 	bdev_io = spdk_bdev_get_io(channel);
2009 
2010 	if (!bdev_io) {
2011 		return -ENOMEM;
2012 	}
2013 
2014 	bdev_io->internal.ch = channel;
2015 	bdev_io->u.bdev.offset_blocks = offset_blocks;
2016 
2017 	if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
2018 		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
2019 		bdev_io->u.bdev.num_blocks = num_blocks;
2020 		bdev_io->u.bdev.iovs = NULL;
2021 		bdev_io->u.bdev.iovcnt = 0;
2022 
2023 	} else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
2024 		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);
2025 
2026 		len = spdk_bdev_get_block_size(bdev) * num_blocks;
2027 
2028 		if (len > ZERO_BUFFER_SIZE) {
2029 			split_request = true;
2030 			len = ZERO_BUFFER_SIZE;
2031 		}
2032 
2033 		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
2034 		bdev_io->u.bdev.iovs = &bdev_io->iov;
2035 		bdev_io->u.bdev.iovs[0].iov_base = g_bdev_mgr.zero_buffer;
2036 		bdev_io->u.bdev.iovs[0].iov_len = len;
2037 		bdev_io->u.bdev.iovcnt = 1;
2038 		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
2039 		bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
2040 		bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
2041 	} else {
2042 		spdk_bdev_free_io(bdev_io);
2043 		return -ENOTSUP;
2044 	}
2045 
2046 	if (split_request) {
2047 		bdev_io->u.bdev.stored_user_cb = cb;
2048 		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
2049 	} else {
2050 		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2051 	}
2052 	spdk_bdev_io_submit(bdev_io);
2053 	return 0;
2054 }
2055 
2056 int
2057 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2058 		uint64_t offset, uint64_t nbytes,
2059 		spdk_bdev_io_completion_cb cb, void *cb_arg)
2060 {
2061 	uint64_t offset_blocks, num_blocks;
2062 
2063 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
2064 		return -EINVAL;
2065 	}
2066 
2067 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
2068 }
2069 
2070 int
2071 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2072 		       uint64_t offset_blocks, uint64_t num_blocks,
2073 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
2074 {
2075 	struct spdk_bdev *bdev = desc->bdev;
2076 	struct spdk_bdev_io *bdev_io;
2077 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2078 
2079 	if (!desc->write) {
2080 		return -EBADF;
2081 	}
2082 
2083 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
2084 		return -EINVAL;
2085 	}
2086 
2087 	if (num_blocks == 0) {
2088 		SPDK_ERRLOG("Can't unmap 0 blocks\n");
2089 		return -EINVAL;
2090 	}
2091 
2092 	bdev_io = spdk_bdev_get_io(channel);
2093 	if (!bdev_io) {
2094 		return -ENOMEM;
2095 	}
2096 
2097 	bdev_io->internal.ch = channel;
2098 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
2099 
2100 	bdev_io->u.bdev.iovs = &bdev_io->iov;
2101 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
2102 	bdev_io->u.bdev.iovs[0].iov_len = 0;
2103 	bdev_io->u.bdev.iovcnt = 1;
2104 
2105 	bdev_io->u.bdev.offset_blocks = offset_blocks;
2106 	bdev_io->u.bdev.num_blocks = num_blocks;
2107 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2108 
2109 	spdk_bdev_io_submit(bdev_io);
2110 	return 0;
2111 }
2112 
2113 int
2114 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2115 		uint64_t offset, uint64_t length,
2116 		spdk_bdev_io_completion_cb cb, void *cb_arg)
2117 {
2118 	uint64_t offset_blocks, num_blocks;
2119 
2120 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
2121 		return -EINVAL;
2122 	}
2123 
2124 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
2125 }
2126 
2127 int
2128 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2129 		       uint64_t offset_blocks, uint64_t num_blocks,
2130 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
2131 {
2132 	struct spdk_bdev *bdev = desc->bdev;
2133 	struct spdk_bdev_io *bdev_io;
2134 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2135 
2136 	if (!desc->write) {
2137 		return -EBADF;
2138 	}
2139 
2140 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
2141 		return -EINVAL;
2142 	}
2143 
2144 	bdev_io = spdk_bdev_get_io(channel);
2145 	if (!bdev_io) {
2146 		return -ENOMEM;
2147 	}
2148 
2149 	bdev_io->internal.ch = channel;
2150 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
2151 	bdev_io->u.bdev.iovs = NULL;
2152 	bdev_io->u.bdev.iovcnt = 0;
2153 	bdev_io->u.bdev.offset_blocks = offset_blocks;
2154 	bdev_io->u.bdev.num_blocks = num_blocks;
2155 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2156 
2157 	spdk_bdev_io_submit(bdev_io);
2158 	return 0;
2159 }
2160 
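/*
 * Reset handling: a reset is queued on the submitting channel and, once it becomes the
 * bdev's single in-progress reset, every channel is frozen via spdk_for_each_channel()
 * (BDEV_CH_RESET_IN_PROGRESS is set and queued, nomem, and buffer-wait I/O is aborted)
 * before the reset itself is submitted to the module.  Channels are unfrozen again when
 * the reset completes in spdk_bdev_io_complete().
 */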
2161 static void
2162 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
2163 {
2164 	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
2165 	struct spdk_bdev_io *bdev_io;
2166 
2167 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
2168 	TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
2169 	spdk_bdev_io_submit_reset(bdev_io);
2170 }
2171 
2172 static void
2173 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
2174 {
2175 	struct spdk_io_channel		*ch;
2176 	struct spdk_bdev_channel	*channel;
2177 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
2178 	struct spdk_bdev_shared_resource *shared_resource;
2179 	bdev_io_tailq_t			tmp_queued;
2180 
2181 	TAILQ_INIT(&tmp_queued);
2182 
2183 	ch = spdk_io_channel_iter_get_channel(i);
2184 	channel = spdk_io_channel_get_ctx(ch);
2185 	shared_resource = channel->shared_resource;
2186 	mgmt_channel = shared_resource->mgmt_ch;
2187 
2188 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
2189 
2190 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
2191 		/* The QoS object is always valid and readable while
2192 		 * the channel flag is set, so the lock here should not
2193 		 * be necessary. We're not in the fast path though, so
2194 		 * just take it anyway. */
2195 		pthread_mutex_lock(&channel->bdev->internal.mutex);
2196 		if (channel->bdev->internal.qos->ch == channel) {
2197 			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
2198 		}
2199 		pthread_mutex_unlock(&channel->bdev->internal.mutex);
2200 	}
2201 
2202 	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel);
2203 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
2204 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
2205 	_spdk_bdev_abort_queued_io(&tmp_queued, channel);
2206 
2207 	spdk_for_each_channel_continue(i, 0);
2208 }
2209 
2210 static void
2211 _spdk_bdev_start_reset(void *ctx)
2212 {
2213 	struct spdk_bdev_channel *ch = ctx;
2214 
2215 	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
2216 			      ch, _spdk_bdev_reset_dev);
2217 }
2218 
2219 static void
2220 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
2221 {
2222 	struct spdk_bdev *bdev = ch->bdev;
2223 
2224 	assert(!TAILQ_EMPTY(&ch->queued_resets));
2225 
2226 	pthread_mutex_lock(&bdev->internal.mutex);
2227 	if (bdev->internal.reset_in_progress == NULL) {
2228 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
2229 		/*
2230 		 * Take a channel reference for the target bdev for the life of this
2231 		 *  reset.  This guards against the channel getting destroyed while
2232 		 *  spdk_for_each_channel() calls related to this reset IO are in
2233 		 *  progress.  We will release the reference when this reset is
2234 		 *  completed.
2235 		 */
2236 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
2237 		_spdk_bdev_start_reset(ch);
2238 	}
2239 	pthread_mutex_unlock(&bdev->internal.mutex);
2240 }
2241 
2242 int
2243 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2244 		spdk_bdev_io_completion_cb cb, void *cb_arg)
2245 {
2246 	struct spdk_bdev *bdev = desc->bdev;
2247 	struct spdk_bdev_io *bdev_io;
2248 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2249 
2250 	bdev_io = spdk_bdev_get_io(channel);
2251 	if (!bdev_io) {
2252 		return -ENOMEM;
2253 	}
2254 
2255 	bdev_io->internal.ch = channel;
2256 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
2257 	bdev_io->u.reset.ch_ref = NULL;
2258 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2259 
2260 	pthread_mutex_lock(&bdev->internal.mutex);
2261 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
2262 	pthread_mutex_unlock(&bdev->internal.mutex);
2263 
2264 	_spdk_bdev_channel_start_reset(channel);
2265 
2266 	return 0;
2267 }
2268 
2269 void
2270 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
2271 		      struct spdk_bdev_io_stat *stat)
2272 {
2273 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2274 
2275 	*stat = channel->stat;
2276 }
2277 
2278 static void
2279 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status)
2280 {
2281 	void *io_device = spdk_io_channel_iter_get_io_device(i);
2282 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
2283 
2284 	bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat,
2285 			    bdev_iostat_ctx->cb_arg, 0);
2286 	free(bdev_iostat_ctx);
2287 }
2288 
2289 static void
2290 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i)
2291 {
2292 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
2293 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
2294 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2295 
2296 	_spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat);
2297 	spdk_for_each_channel_continue(i, 0);
2298 }
2299 
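/*
 * Usage sketch (illustrative): device-level statistics are aggregated asynchronously
 * across all channels, so the caller must keep 'stat' valid until the callback runs.
 * 'my_stat' and 'stat_done' are hypothetical caller-side names.
 *
 *   static void
 *   stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, void *cb_arg, int rc)
 *   {
 *           // 'stat' now holds the sum of per-channel and historical counters.
 *   }
 *
 *   spdk_bdev_get_device_stat(bdev, &my_stat, stat_done, NULL);
 */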
2300 void
2301 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
2302 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
2303 {
2304 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
2305 
2306 	assert(bdev != NULL);
2307 	assert(stat != NULL);
2308 	assert(cb != NULL);
2309 
2310 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
2311 	if (bdev_iostat_ctx == NULL) {
2312 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
2313 		cb(bdev, stat, cb_arg, -ENOMEM);
2314 		return;
2315 	}
2316 
2317 	bdev_iostat_ctx->stat = stat;
2318 	bdev_iostat_ctx->cb = cb;
2319 	bdev_iostat_ctx->cb_arg = cb_arg;
2320 
2321 	/* Start with the statistics from previously deleted channels. */
2322 	pthread_mutex_lock(&bdev->internal.mutex);
2323 	_spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat);
2324 	pthread_mutex_unlock(&bdev->internal.mutex);
2325 
2326 	/* Then iterate and add the statistics from each existing channel. */
2327 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
2328 			      _spdk_bdev_get_each_channel_stat,
2329 			      bdev_iostat_ctx,
2330 			      _spdk_bdev_get_device_stat_done);
2331 }
2332 
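/*
 * NVMe passthrough: the three variants below forward a caller-built struct spdk_nvme_cmd
 * to the backing module verbatim (admin queue, I/O queue, and I/O queue with a separate
 * metadata buffer, respectively).  All of them require a descriptor opened for writing,
 * since the command is not inspected to determine its direction.
 */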
2333 int
2334 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2335 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
2336 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
2337 {
2338 	struct spdk_bdev *bdev = desc->bdev;
2339 	struct spdk_bdev_io *bdev_io;
2340 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2341 
2342 	if (!desc->write) {
2343 		return -EBADF;
2344 	}
2345 
2346 	bdev_io = spdk_bdev_get_io(channel);
2347 	if (!bdev_io) {
2348 		return -ENOMEM;
2349 	}
2350 
2351 	bdev_io->internal.ch = channel;
2352 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
2353 	bdev_io->u.nvme_passthru.cmd = *cmd;
2354 	bdev_io->u.nvme_passthru.buf = buf;
2355 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2356 	bdev_io->u.nvme_passthru.md_buf = NULL;
2357 	bdev_io->u.nvme_passthru.md_len = 0;
2358 
2359 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2360 
2361 	spdk_bdev_io_submit(bdev_io);
2362 	return 0;
2363 }
2364 
2365 int
2366 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2367 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
2368 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
2369 {
2370 	struct spdk_bdev *bdev = desc->bdev;
2371 	struct spdk_bdev_io *bdev_io;
2372 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2373 
2374 	if (!desc->write) {
2375 		/*
2376 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
2377 		 *  to easily determine if the command is a read or write, but for now just
2378 		 *  do not allow io_passthru with a read-only descriptor.
2379 		 */
2380 		return -EBADF;
2381 	}
2382 
2383 	bdev_io = spdk_bdev_get_io(channel);
2384 	if (!bdev_io) {
2385 		return -ENOMEM;
2386 	}
2387 
2388 	bdev_io->internal.ch = channel;
2389 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
2390 	bdev_io->u.nvme_passthru.cmd = *cmd;
2391 	bdev_io->u.nvme_passthru.buf = buf;
2392 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2393 	bdev_io->u.nvme_passthru.md_buf = NULL;
2394 	bdev_io->u.nvme_passthru.md_len = 0;
2395 
2396 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2397 
2398 	spdk_bdev_io_submit(bdev_io);
2399 	return 0;
2400 }
2401 
2402 int
2403 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2404 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
2405 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
2406 {
2407 	struct spdk_bdev *bdev = desc->bdev;
2408 	struct spdk_bdev_io *bdev_io;
2409 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2410 
2411 	if (!desc->write) {
2412 		/*
2413 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
2414 		 *  to easily determine if the command is a read or write, but for now just
2415 		 *  do not allow io_passthru with a read-only descriptor.
2416 		 */
2417 		return -EBADF;
2418 	}
2419 
2420 	bdev_io = spdk_bdev_get_io(channel);
2421 	if (!bdev_io) {
2422 		return -ENOMEM;
2423 	}
2424 
2425 	bdev_io->internal.ch = channel;
2426 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
2427 	bdev_io->u.nvme_passthru.cmd = *cmd;
2428 	bdev_io->u.nvme_passthru.buf = buf;
2429 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2430 	bdev_io->u.nvme_passthru.md_buf = md_buf;
2431 	bdev_io->u.nvme_passthru.md_len = md_len;
2432 
2433 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2434 
2435 	spdk_bdev_io_submit(bdev_io);
2436 	return 0;
2437 }
2438 
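/*
 * Usage sketch (illustrative) for handling -ENOMEM from a submit call: the caller embeds
 * a struct spdk_bdev_io_wait_entry in its own request context and retries once a bdev_io
 * becomes available on this channel.  'ctx' and 'resubmit_request' are hypothetical
 * caller-side names.
 *
 *   ctx->bdev_io_wait.bdev = bdev;
 *   ctx->bdev_io_wait.cb_fn = resubmit_request;
 *   ctx->bdev_io_wait.cb_arg = ctx;
 *   spdk_bdev_queue_io_wait(bdev, ch, &ctx->bdev_io_wait);
 */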
2439 int
2440 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
2441 			struct spdk_bdev_io_wait_entry *entry)
2442 {
2443 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2444 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
2445 
2446 	if (bdev != entry->bdev) {
2447 		SPDK_ERRLOG("bdevs do not match\n");
2448 		return -EINVAL;
2449 	}
2450 
2451 	if (mgmt_ch->per_thread_cache_count > 0) {
2452 		SPDK_ERRLOG("Cannot queue io_wait if a spdk_bdev_io is available in the per-thread cache\n");
2453 		return -EINVAL;
2454 	}
2455 
2456 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
2457 	return 0;
2458 }
2459 
2460 static void
2461 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
2462 {
2463 	struct spdk_bdev *bdev = bdev_ch->bdev;
2464 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2465 	struct spdk_bdev_io *bdev_io;
2466 
2467 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
2468 		/*
2469 		 * Allow some more I/O to complete before retrying the nomem_io queue.
2470 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
2471 		 *  the context of a completion, because the resources for the I/O are
2472 		 *  not released until control returns to the bdev poller.  Also, we
2473 		 *  may require several small I/O to complete before a larger I/O
2474 		 *  (that requires splitting) can be submitted.
2475 		 */
2476 		return;
2477 	}
2478 
2479 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
2480 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
2481 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
2482 		bdev_io->internal.ch->io_outstanding++;
2483 		shared_resource->io_outstanding++;
2484 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
2485 		bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io);
2486 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
2487 			break;
2488 		}
2489 	}
2490 }
2491 
2492 static inline void
2493 _spdk_bdev_io_complete(void *ctx)
2494 {
2495 	struct spdk_bdev_io *bdev_io = ctx;
2496 
2497 	if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) {
2498 		/*
2499 		 * Send the completion to the thread that originally submitted the I/O,
2500 		 * which may not be the current thread in the case of QoS.
2501 		 */
2502 		if (bdev_io->internal.io_submit_ch) {
2503 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
2504 			bdev_io->internal.io_submit_ch = NULL;
2505 		}
2506 
2507 		/*
2508 		 * Defer completion to avoid potential infinite recursion if the
2509 		 * user's completion callback issues a new I/O.
2510 		 */
2511 		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel),
2512 				     _spdk_bdev_io_complete, bdev_io);
2513 		return;
2514 	}
2515 
2516 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
2517 		switch (bdev_io->type) {
2518 		case SPDK_BDEV_IO_TYPE_READ:
2519 			bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2520 			bdev_io->internal.ch->stat.num_read_ops++;
2521 			bdev_io->internal.ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->internal.submit_tsc);
2522 			break;
2523 		case SPDK_BDEV_IO_TYPE_WRITE:
2524 			bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2525 			bdev_io->internal.ch->stat.num_write_ops++;
2526 			bdev_io->internal.ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->internal.submit_tsc);
2527 			break;
2528 		default:
2529 			break;
2530 		}
2531 	}
2532 
2533 #ifdef SPDK_CONFIG_VTUNE
2534 	uint64_t now_tsc = spdk_get_ticks();
2535 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
2536 		uint64_t data[5];
2537 
2538 		data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops;
2539 		data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read;
2540 		data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops;
2541 		data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written;
2542 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
2543 			  bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0;
2544 
2545 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
2546 				   __itt_metadata_u64, 5, data);
2547 
2548 		bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat;
2549 		bdev_io->internal.ch->start_tsc = now_tsc;
2550 	}
2551 #endif
2552 
2553 	assert(bdev_io->internal.cb != NULL);
2554 	assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel));
2555 
2556 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
2557 			     bdev_io->internal.caller_ctx);
2558 }
2559 
2560 static void
2561 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
2562 {
2563 	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
2564 
2565 	if (bdev_io->u.reset.ch_ref != NULL) {
2566 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
2567 		bdev_io->u.reset.ch_ref = NULL;
2568 	}
2569 
2570 	_spdk_bdev_io_complete(bdev_io);
2571 }
2572 
2573 static void
2574 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
2575 {
2576 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2577 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
2578 
2579 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
2580 	if (!TAILQ_EMPTY(&ch->queued_resets)) {
2581 		_spdk_bdev_channel_start_reset(ch);
2582 	}
2583 
2584 	spdk_for_each_channel_continue(i, 0);
2585 }
2586 
2587 void
2588 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
2589 {
2590 	struct spdk_bdev *bdev = bdev_io->bdev;
2591 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
2592 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2593 
2594 	bdev_io->internal.status = status;
2595 
2596 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
2597 		bool unlock_channels = false;
2598 
2599 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
2600 			SPDK_ERRLOG("NOMEM returned for reset\n");
2601 		}
2602 		pthread_mutex_lock(&bdev->internal.mutex);
2603 		if (bdev_io == bdev->internal.reset_in_progress) {
2604 			bdev->internal.reset_in_progress = NULL;
2605 			unlock_channels = true;
2606 		}
2607 		pthread_mutex_unlock(&bdev->internal.mutex);
2608 
2609 		if (unlock_channels) {
2610 			spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel,
2611 					      bdev_io, _spdk_bdev_reset_complete);
2612 			return;
2613 		}
2614 	} else {
2615 		assert(bdev_ch->io_outstanding > 0);
2616 		assert(shared_resource->io_outstanding > 0);
2617 		bdev_ch->io_outstanding--;
2618 		shared_resource->io_outstanding--;
2619 
2620 		if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) {
2621 			TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
2622 			/*
2623 			 * Wait for some of the outstanding I/O to complete before we
2624 			 *  retry any of the nomem_io.  Normally we will wait for
2625 			 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
2626 			 *  depth channels we will instead wait for half to complete.
2627 			 */
2628 			shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
2629 							   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
2630 			return;
2631 		}
2632 
2633 		if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
2634 			_spdk_bdev_ch_retry_io(bdev_ch);
2635 		}
2636 	}
2637 
2638 	_spdk_bdev_io_complete(bdev_io);
2639 }
2640 
2641 void
2642 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
2643 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
2644 {
2645 	if (sc == SPDK_SCSI_STATUS_GOOD) {
2646 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
2647 	} else {
2648 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
2649 		bdev_io->internal.error.scsi.sc = sc;
2650 		bdev_io->internal.error.scsi.sk = sk;
2651 		bdev_io->internal.error.scsi.asc = asc;
2652 		bdev_io->internal.error.scsi.ascq = ascq;
2653 	}
2654 
2655 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
2656 }
2657 
2658 void
2659 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
2660 			     int *sc, int *sk, int *asc, int *ascq)
2661 {
2662 	assert(sc != NULL);
2663 	assert(sk != NULL);
2664 	assert(asc != NULL);
2665 	assert(ascq != NULL);
2666 
2667 	switch (bdev_io->internal.status) {
2668 	case SPDK_BDEV_IO_STATUS_SUCCESS:
2669 		*sc = SPDK_SCSI_STATUS_GOOD;
2670 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
2671 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
2672 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
2673 		break;
2674 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
2675 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
2676 		break;
2677 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
2678 		*sc = bdev_io->internal.error.scsi.sc;
2679 		*sk = bdev_io->internal.error.scsi.sk;
2680 		*asc = bdev_io->internal.error.scsi.asc;
2681 		*ascq = bdev_io->internal.error.scsi.ascq;
2682 		break;
2683 	default:
2684 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
2685 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
2686 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
2687 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
2688 		break;
2689 	}
2690 }
2691 
2692 void
2693 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
2694 {
2695 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
2696 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
2697 	} else {
2698 		bdev_io->internal.error.nvme.sct = sct;
2699 		bdev_io->internal.error.nvme.sc = sc;
2700 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
2701 	}
2702 
2703 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
2704 }
2705 
2706 void
2707 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
2708 {
2709 	assert(sct != NULL);
2710 	assert(sc != NULL);
2711 
2712 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
2713 		*sct = bdev_io->internal.error.nvme.sct;
2714 		*sc = bdev_io->internal.error.nvme.sc;
2715 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
2716 		*sct = SPDK_NVME_SCT_GENERIC;
2717 		*sc = SPDK_NVME_SC_SUCCESS;
2718 	} else {
2719 		*sct = SPDK_NVME_SCT_GENERIC;
2720 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
2721 	}
2722 }
2723 
2724 struct spdk_thread *
2725 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
2726 {
2727 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
2728 }
2729 
2730 static void
2731 _spdk_bdev_qos_config_type(struct spdk_bdev *bdev, uint64_t qos_set,
2732 			   enum spdk_bdev_qos_type qos_type)
2733 {
2734 	uint64_t	min_qos_set = 0;
2735 
2736 	switch (qos_type) {
2737 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2738 		min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
2739 		break;
2740 	case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT:
2741 		min_qos_set = SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC;
2742 		break;
2743 	default:
2744 		SPDK_ERRLOG("Unsupported QoS type.\n");
2745 		return;
2746 	}
2747 
2748 	if (qos_set % min_qos_set) {
2749 		SPDK_ERRLOG("Assigned QoS %" PRIu64 " on bdev %s is not a multiple of %" PRIu64 "\n",
2750 			    qos_set, bdev->name, min_qos_set);
2751 		SPDK_ERRLOG("Failed to enable QoS on bdev %s\n", bdev->name);
2752 		return;
2753 	}
2754 
2755 	if (!bdev->internal.qos) {
2756 		bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
2757 		if (!bdev->internal.qos) {
2758 			SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
2759 			return;
2760 		}
2761 	}
2762 
2763 	switch (qos_type) {
2764 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2765 		bdev->internal.qos->iops_rate_limit = qos_set;
2766 		break;
2767 	case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT:
2768 		bdev->internal.qos->byte_rate_limit = qos_set * 1024 * 1024;
2769 		break;
2770 	default:
2771 		break;
2772 	}
2773 
2774 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%" PRIu64 "\n",
2775 		      bdev->name, qos_type, qos_set);
2776 
2777 	return;
2778 }
2779 
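/*
 * Example configuration (illustrative; "Malloc0" is a hypothetical bdev name) parsed
 * below.  Each entry names a bdev and a per-second limit; IOPS values must be a multiple
 * of SPDK_BDEV_QOS_MIN_IOS_PER_SEC and bandwidth values (in MB/s) a multiple of
 * SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC.
 *
 *   [QoS]
 *     Limit_IOPS Malloc0 20000
 *     Limit_BWPS Malloc0 100
 */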
2780 static void
2781 _spdk_bdev_qos_config(struct spdk_bdev *bdev)
2782 {
2783 	struct spdk_conf_section	*sp = NULL;
2784 	const char			*val = NULL;
2785 	uint64_t			qos_set = 0;
2786 	int				i = 0, j = 0;
2787 
2788 	sp = spdk_conf_find_section(NULL, "QoS");
2789 	if (!sp) {
2790 		return;
2791 	}
2792 
2793 	while (j < SPDK_BDEV_QOS_NUM_TYPES) {
2794 		i = 0;
2795 		while (true) {
2796 			val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 0);
2797 			if (!val) {
2798 				break;
2799 			}
2800 
2801 			if (strcmp(bdev->name, val) != 0) {
2802 				i++;
2803 				continue;
2804 			}
2805 
2806 			val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 1);
2807 			if (val) {
2808 				qos_set = strtoull(val, NULL, 10);
2809 				_spdk_bdev_qos_config_type(bdev, qos_set, j);
2810 			}
2811 
2812 			break;
2813 		}
2814 
2815 		j++;
2816 	}
2817 
2818 	return;
2819 }
2820 
2821 static int
2822 spdk_bdev_init(struct spdk_bdev *bdev)
2823 {
2824 	assert(bdev->module != NULL);
2825 
2826 	if (!bdev->name) {
2827 		SPDK_ERRLOG("Bdev name is NULL\n");
2828 		return -EINVAL;
2829 	}
2830 
2831 	if (spdk_bdev_get_by_name(bdev->name)) {
2832 		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
2833 		return -EEXIST;
2834 	}
2835 
2836 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
2837 	bdev->internal.measured_queue_depth = UINT64_MAX;
2838 
2839 	TAILQ_INIT(&bdev->internal.open_descs);
2840 
2841 	TAILQ_INIT(&bdev->aliases);
2842 
2843 	bdev->internal.reset_in_progress = NULL;
2844 
2845 	_spdk_bdev_qos_config(bdev);
2846 
2847 	spdk_io_device_register(__bdev_to_io_dev(bdev),
2848 				spdk_bdev_channel_create, spdk_bdev_channel_destroy,
2849 				sizeof(struct spdk_bdev_channel));
2850 
2851 	pthread_mutex_init(&bdev->internal.mutex, NULL);
2852 	return 0;
2853 }
2854 
2855 static void
2856 spdk_bdev_destroy_cb(void *io_device)
2857 {
2858 	int			rc;
2859 	struct spdk_bdev	*bdev;
2860 	spdk_bdev_unregister_cb	cb_fn;
2861 	void			*cb_arg;
2862 
2863 	bdev = __bdev_from_io_dev(io_device);
2864 	cb_fn = bdev->internal.unregister_cb;
2865 	cb_arg = bdev->internal.unregister_ctx;
2866 
2867 	rc = bdev->fn_table->destruct(bdev->ctxt);
2868 	if (rc < 0) {
2869 		SPDK_ERRLOG("destruct failed\n");
2870 	}
2871 	if (rc <= 0 && cb_fn != NULL) {
2872 		cb_fn(cb_arg, rc);
2873 	}
2874 }
2875 
2876 
2877 static void
2878 spdk_bdev_fini(struct spdk_bdev *bdev)
2879 {
2880 	pthread_mutex_destroy(&bdev->internal.mutex);
2881 
2882 	free(bdev->internal.qos);
2883 
2884 	spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb);
2885 }
2886 
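/*
 * Starting a bdev makes it visible in the global list and then gives every registered
 * module a chance to examine it: examine_config() runs first (and must eventually call
 * spdk_bdev_module_examine_done()), and examine_disk() runs afterwards unless some
 * module has already claimed the bdev.
 */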
2887 static void
2888 spdk_bdev_start(struct spdk_bdev *bdev)
2889 {
2890 	struct spdk_bdev_module *module;
2891 	uint32_t action;
2892 
2893 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
2894 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
2895 
2896 	/* Examine configuration before initializing I/O */
2897 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2898 		if (module->examine_config) {
2899 			action = module->internal.action_in_progress;
2900 			module->internal.action_in_progress++;
2901 			module->examine_config(bdev);
2902 			if (action != module->internal.action_in_progress) {
2903 				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
2904 					    module->name);
2905 			}
2906 		}
2907 	}
2908 
2909 	if (bdev->internal.claim_module) {
2910 		return;
2911 	}
2912 
2913 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2914 		if (module->examine_disk) {
2915 			module->internal.action_in_progress++;
2916 			module->examine_disk(bdev);
2917 		}
2918 	}
2919 }
2920 
2921 int
2922 spdk_bdev_register(struct spdk_bdev *bdev)
2923 {
2924 	int rc = spdk_bdev_init(bdev);
2925 
2926 	if (rc == 0) {
2927 		spdk_bdev_start(bdev);
2928 	}
2929 
2930 	return rc;
2931 }
2932 
2933 static void
2934 spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev)
2935 {
2936 	struct spdk_bdev **bdevs;
2937 	struct spdk_bdev *base;
2938 	size_t i, j, k;
2939 	bool found;
2940 
2941 	/* Iterate over base bdevs to remove vbdev from them. */
2942 	for (i = 0; i < vbdev->internal.base_bdevs_cnt; i++) {
2943 		found = false;
2944 		base = vbdev->internal.base_bdevs[i];
2945 
2946 		for (j = 0; j < base->vbdevs_cnt; j++) {
2947 			if (base->vbdevs[j] != vbdev) {
2948 				continue;
2949 			}
2950 
2951 			for (k = j; k + 1 < base->vbdevs_cnt; k++) {
2952 				base->vbdevs[k] = base->vbdevs[k + 1];
2953 			}
2954 
2955 			base->vbdevs_cnt--;
2956 			if (base->vbdevs_cnt > 0) {
2957 				bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0]));
2958 				/* It would be odd if shrinking a memory block failed. */
2959 				assert(bdevs);
2960 				base->vbdevs = bdevs;
2961 			} else {
2962 				free(base->vbdevs);
2963 				base->vbdevs = NULL;
2964 			}
2965 
2966 			found = true;
2967 			break;
2968 		}
2969 
2970 		if (!found) {
2971 			SPDK_WARNLOG("Bdev '%s' is not a base bdev of '%s'.\n", base->name, vbdev->name);
2972 		}
2973 	}
2974 
2975 	free(vbdev->internal.base_bdevs);
2976 	vbdev->internal.base_bdevs = NULL;
2977 	vbdev->internal.base_bdevs_cnt = 0;
2978 }
2979 
2980 static int
2981 spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt)
2982 {
2983 	struct spdk_bdev **vbdevs;
2984 	struct spdk_bdev *base;
2985 	size_t i;
2986 
2987 	/* Adding base bdevs isn't supported (yet?). */
2988 	assert(vbdev->internal.base_bdevs_cnt == 0);
2989 
2990 	vbdev->internal.base_bdevs = malloc(cnt * sizeof(vbdev->internal.base_bdevs[0]));
2991 	if (!vbdev->internal.base_bdevs) {
2992 		SPDK_ERRLOG("%s - malloc() failed\n", vbdev->name);
2993 		return -ENOMEM;
2994 	}
2995 
2996 	memcpy(vbdev->internal.base_bdevs, base_bdevs, cnt * sizeof(vbdev->internal.base_bdevs[0]));
2997 	vbdev->internal.base_bdevs_cnt = cnt;
2998 
2999 	/* Iterate over base bdevs to add this vbdev to them. */
3000 	for (i = 0; i < cnt; i++) {
3001 		base = vbdev->internal.base_bdevs[i];
3002 
3003 		assert(base != NULL);
3004 		assert(base->internal.claim_module != NULL);
3005 
3006 		vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0]));
3007 		if (!vbdevs) {
3008 			SPDK_ERRLOG("%s - realloc() failed\n", base->name);
3009 			spdk_vbdev_remove_base_bdevs(vbdev);
3010 			return -ENOMEM;
3011 		}
3012 
3013 		vbdevs[base->vbdevs_cnt] = vbdev;
3014 		base->vbdevs = vbdevs;
3015 		base->vbdevs_cnt++;
3016 	}
3017 
3018 	return 0;
3019 }
3020 
3021 int
3022 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
3023 {
3024 	int rc;
3025 
3026 	rc = spdk_bdev_init(vbdev);
3027 	if (rc) {
3028 		return rc;
3029 	}
3030 
3031 	if (base_bdev_count == 0) {
3032 		spdk_bdev_start(vbdev);
3033 		return 0;
3034 	}
3035 
3036 	rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count);
3037 	if (rc) {
3038 		spdk_bdev_fini(vbdev);
3039 		return rc;
3040 	}
3041 
3042 	spdk_bdev_start(vbdev);
3043 	return 0;
3044 
3045 }
3046 
3047 void
3048 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
3049 {
3050 	if (bdev->internal.unregister_cb != NULL) {
3051 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
3052 	}
3053 }
3054 
3055 static void
3056 _remove_notify(void *arg)
3057 {
3058 	struct spdk_bdev_desc *desc = arg;
3059 
3060 	desc->remove_cb(desc->remove_ctx);
3061 }
3062 
3063 void
3064 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
3065 {
3066 	struct spdk_bdev_desc	*desc, *tmp;
3067 	bool			do_destruct = true;
3068 	struct spdk_thread	*thread;
3069 
3070 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);
3071 
3072 	thread = spdk_get_thread();
3073 	if (!thread) {
3074 		/* The user called this from a non-SPDK thread. */
3075 		if (cb_fn != NULL) {
3076 			cb_fn(cb_arg, -ENOTSUP);
3077 		}
3078 		return;
3079 	}
3080 
3081 	pthread_mutex_lock(&bdev->internal.mutex);
3082 
3083 	spdk_vbdev_remove_base_bdevs(bdev);
3084 
3085 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
3086 	bdev->internal.unregister_cb = cb_fn;
3087 	bdev->internal.unregister_ctx = cb_arg;
3088 
3089 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
3090 		if (desc->remove_cb) {
3091 			do_destruct = false;
3092 			/*
3093 			 * Defer invocation of the remove_cb to a separate message that will
3094 			 *  run later on this thread.  This ensures this context unwinds and
3095 			 *  we don't recursively unregister this bdev again if the remove_cb
3096 			 *  immediately closes its descriptor.
3097 			 */
3098 			if (!desc->remove_scheduled) {
3099 				/* Avoid scheduling removal of the same descriptor multiple times. */
3100 				desc->remove_scheduled = true;
3101 				spdk_thread_send_msg(thread, _remove_notify, desc);
3102 			}
3103 		}
3104 	}
3105 
3106 	if (!do_destruct) {
3107 		pthread_mutex_unlock(&bdev->internal.mutex);
3108 		return;
3109 	}
3110 
3111 	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
3112 	pthread_mutex_unlock(&bdev->internal.mutex);
3113 
3114 	spdk_bdev_fini(bdev);
3115 }
3116 
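/*
 * Usage sketch (illustrative): opening a descriptor and getting a per-thread channel.
 * 'hot_remove_cb' and 'ctx' are hypothetical caller-side names; the remove callback is
 * invoked when the bdev is being unregistered and should arrange for the descriptor to
 * be closed.
 *
 *   struct spdk_bdev_desc *desc;
 *   struct spdk_io_channel *ch;
 *
 *   rc = spdk_bdev_open(bdev, true, hot_remove_cb, ctx, &desc);
 *   if (rc == 0) {
 *           ch = spdk_bdev_get_io_channel(desc);
 *   }
 */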
3117 int
3118 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
3119 	       void *remove_ctx, struct spdk_bdev_desc **_desc)
3120 {
3121 	struct spdk_bdev_desc *desc;
3122 
3123 	desc = calloc(1, sizeof(*desc));
3124 	if (desc == NULL) {
3125 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
3126 		return -ENOMEM;
3127 	}
3128 
3129 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
3130 		      spdk_get_thread());
3131 
3132 	pthread_mutex_lock(&bdev->internal.mutex);
3133 
3134 	if (write && bdev->internal.claim_module) {
3135 		SPDK_ERRLOG("Could not open %s - %s module already claimed it\n",
3136 			    bdev->name, bdev->internal.claim_module->name);
3137 		free(desc);
3138 		pthread_mutex_unlock(&bdev->internal.mutex);
3139 		return -EPERM;
3140 	}
3141 
3142 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
3143 
3144 	desc->bdev = bdev;
3145 	desc->remove_cb = remove_cb;
3146 	desc->remove_ctx = remove_ctx;
3147 	desc->write = write;
3148 	*_desc = desc;
3149 
3150 	pthread_mutex_unlock(&bdev->internal.mutex);
3151 
3152 	return 0;
3153 }
3154 
3155 void
3156 spdk_bdev_close(struct spdk_bdev_desc *desc)
3157 {
3158 	struct spdk_bdev *bdev = desc->bdev;
3159 	bool do_unregister = false;
3160 
3161 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
3162 		      spdk_get_thread());
3163 
3164 	pthread_mutex_lock(&bdev->internal.mutex);
3165 
3166 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
3167 	free(desc);
3168 
3169 	/* If no more descriptors, kill QoS channel */
3170 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
3171 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
3172 			      bdev->name, spdk_get_thread());
3173 
3174 		if (spdk_bdev_qos_destroy(bdev)) {
3175 			/* There isn't anything we can do to recover here. Just let the
3176 			 * old QoS poller keep running. The QoS handling won't change
3177 			 * cores when the user allocates a new channel, but it won't break. */
3178 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
3179 		}
3180 	}
3181 
3182 	spdk_bdev_set_qd_sampling_period(bdev, 0);
3183 
3184 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
3185 		do_unregister = true;
3186 	}
3187 	pthread_mutex_unlock(&bdev->internal.mutex);
3188 
3189 	if (do_unregister == true) {
3190 		spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx);
3191 	}
3192 }
3193 
3194 int
3195 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
3196 			    struct spdk_bdev_module *module)
3197 {
3198 	if (bdev->internal.claim_module != NULL) {
3199 		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
3200 			    bdev->internal.claim_module->name);
3201 		return -EPERM;
3202 	}
3203 
3204 	if (desc && !desc->write) {
3205 		desc->write = true;
3206 	}
3207 
3208 	bdev->internal.claim_module = module;
3209 	return 0;
3210 }
3211 
3212 void
3213 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
3214 {
3215 	assert(bdev->internal.claim_module != NULL);
3216 	bdev->internal.claim_module = NULL;
3217 }
3218 
3219 struct spdk_bdev *
3220 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
3221 {
3222 	return desc->bdev;
3223 }
3224 
3225 void
3226 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
3227 {
3228 	struct iovec *iovs;
3229 	int iovcnt;
3230 
3231 	if (bdev_io == NULL) {
3232 		return;
3233 	}
3234 
3235 	switch (bdev_io->type) {
3236 	case SPDK_BDEV_IO_TYPE_READ:
3237 		iovs = bdev_io->u.bdev.iovs;
3238 		iovcnt = bdev_io->u.bdev.iovcnt;
3239 		break;
3240 	case SPDK_BDEV_IO_TYPE_WRITE:
3241 		iovs = bdev_io->u.bdev.iovs;
3242 		iovcnt = bdev_io->u.bdev.iovcnt;
3243 		break;
3244 	default:
3245 		iovs = NULL;
3246 		iovcnt = 0;
3247 		break;
3248 	}
3249 
3250 	if (iovp) {
3251 		*iovp = iovs;
3252 	}
3253 	if (iovcntp) {
3254 		*iovcntp = iovcnt;
3255 	}
3256 }
3257 
3258 void
3259 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
3260 {
3261 
3262 	if (spdk_bdev_module_list_find(bdev_module->name)) {
3263 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
3264 		assert(false);
3265 	}
3266 
3267 	if (bdev_module->async_init) {
3268 		bdev_module->internal.action_in_progress = 1;
3269 	}
3270 
3271 	/*
3272 	 * Modules with examine callbacks must be initialized first, so they are
3273 	 *  ready to handle examine callbacks from later modules that will
3274 	 *  register physical bdevs.
3275 	 */
3276 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
3277 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
3278 	} else {
3279 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
3280 	}
3281 }
3282 
3283 struct spdk_bdev_module *
3284 spdk_bdev_module_list_find(const char *name)
3285 {
3286 	struct spdk_bdev_module *bdev_module;
3287 
3288 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
3289 		if (strcmp(name, bdev_module->name) == 0) {
3290 			break;
3291 		}
3292 	}
3293 
3294 	return bdev_module;
3295 }
3296 
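/*
 * Completion callback used while a write-zeroes request is being emulated with regular
 * writes: each invocation resubmits the same bdev_io for the next chunk (at most
 * ZERO_BUFFER_SIZE bytes) until split_remaining_num_blocks reaches zero, at which point
 * the stored user callback takes over for the final completion.
 */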
3297 static void
3298 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3299 {
3300 	uint64_t len;
3301 
3302 	if (!success) {
3303 		bdev_io->internal.cb = bdev_io->u.bdev.stored_user_cb;
3304 		_spdk_bdev_io_complete(bdev_io);
3305 		return;
3306 	}
3307 
3308 	/* No need to repeat the error checking from write_zeroes_blocks; this request already passed those checks. */
3309 	len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks,
3310 		       ZERO_BUFFER_SIZE);
3311 
3312 	bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks;
3313 	bdev_io->u.bdev.iovs[0].iov_len = len;
3314 	bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev);
3315 	bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
3316 	bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks;
3317 
3318 	/* If this round completes the I/O, switch the callback back to the original user callback. */
3319 	if (bdev_io->u.bdev.split_remaining_num_blocks == 0) {
3320 		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb);
3321 	} else {
3322 		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
3323 	}
3324 	spdk_bdev_io_submit(bdev_io);
3325 }
3326 
3327 struct set_qos_limit_ctx {
3328 	void (*cb_fn)(void *cb_arg, int status);
3329 	void *cb_arg;
3330 	struct spdk_bdev *bdev;
3331 };
3332 
3333 static void
3334 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
3335 {
3336 	pthread_mutex_lock(&ctx->bdev->internal.mutex);
3337 	ctx->bdev->internal.qos_mod_in_progress = false;
3338 	pthread_mutex_unlock(&ctx->bdev->internal.mutex);
3339 
3340 	ctx->cb_fn(ctx->cb_arg, status);
3341 	free(ctx);
3342 }
3343 
3344 static void
3345 _spdk_bdev_disable_qos_done(void *cb_arg)
3346 {
3347 	struct set_qos_limit_ctx *ctx = cb_arg;
3348 	struct spdk_bdev *bdev = ctx->bdev;
3349 	struct spdk_bdev_io *bdev_io;
3350 	struct spdk_bdev_qos *qos;
3351 
3352 	pthread_mutex_lock(&bdev->internal.mutex);
3353 	qos = bdev->internal.qos;
3354 	bdev->internal.qos = NULL;
3355 	pthread_mutex_unlock(&bdev->internal.mutex);
3356 
3357 	while (!TAILQ_EMPTY(&qos->queued)) {
3358 		/* Send queued I/O back to their original thread for resubmission. */
3359 		bdev_io = TAILQ_FIRST(&qos->queued);
3360 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
3361 
3362 		if (bdev_io->internal.io_submit_ch) {
3363 			/*
3364 			 * Channel was changed when sending it to the QoS thread - change it back
3365 			 *  before sending it back to the original thread.
3366 			 */
3367 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
3368 			bdev_io->internal.io_submit_ch = NULL;
3369 		}
3370 
3371 		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel),
3372 				     _spdk_bdev_io_submit, bdev_io);
3373 	}
3374 
3375 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
3376 	spdk_poller_unregister(&qos->poller);
3377 
3378 	free(qos);
3379 
3380 	_spdk_bdev_set_qos_limit_done(ctx, 0);
3381 }
3382 
3383 static void
3384 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
3385 {
3386 	void *io_device = spdk_io_channel_iter_get_io_device(i);
3387 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
3388 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
3389 	struct spdk_thread *thread;
3390 
3391 	pthread_mutex_lock(&bdev->internal.mutex);
3392 	thread = bdev->internal.qos->thread;
3393 	pthread_mutex_unlock(&bdev->internal.mutex);
3394 
3395 	spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx);
3396 }
3397 
3398 static void
3399 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
3400 {
3401 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
3402 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
3403 
3404 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
3405 
3406 	spdk_for_each_channel_continue(i, 0);
3407 }
3408 
3409 static void
3410 _spdk_bdev_update_qos_limit_iops_msg(void *cb_arg)
3411 {
3412 	struct set_qos_limit_ctx *ctx = cb_arg;
3413 	struct spdk_bdev *bdev = ctx->bdev;
3414 
3415 	pthread_mutex_lock(&bdev->internal.mutex);
3416 	spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
3417 	pthread_mutex_unlock(&bdev->internal.mutex);
3418 
3419 	_spdk_bdev_set_qos_limit_done(ctx, 0);
3420 }
3421 
3422 static void
3423 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
3424 {
3425 	void *io_device = spdk_io_channel_iter_get_io_device(i);
3426 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
3427 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
3428 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
3429 
3430 	pthread_mutex_lock(&bdev->internal.mutex);
3431 	_spdk_bdev_enable_qos(bdev, bdev_ch);
3432 	pthread_mutex_unlock(&bdev->internal.mutex);
3433 	spdk_for_each_channel_continue(i, 0);
3434 }
3435 
3436 static void
3437 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
3438 {
3439 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
3440 
3441 	_spdk_bdev_set_qos_limit_done(ctx, status);
3442 }
3443 
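/*
 * Changing the IOPS limit is an asynchronous operation with three cases: a non-zero limit
 * on a bdev without QoS enables it on every channel, a non-zero limit on a QoS-enabled
 * bdev is forwarded to the QoS thread as an update, and a limit of 0 tears QoS down and
 * resubmits any queued I/O on its original thread.  Only one modification may be in
 * flight at a time; concurrent calls fail with -EAGAIN.
 */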
3444 void
3445 spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec,
3446 			     void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
3447 {
3448 	struct set_qos_limit_ctx *ctx;
3449 
3450 	if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) {
3451 		SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n",
3452 			    ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC);
3453 		cb_fn(cb_arg, -EINVAL);
3454 		return;
3455 	}
3456 
3457 	ctx = calloc(1, sizeof(*ctx));
3458 	if (ctx == NULL) {
3459 		cb_fn(cb_arg, -ENOMEM);
3460 		return;
3461 	}
3462 
3463 	ctx->cb_fn = cb_fn;
3464 	ctx->cb_arg = cb_arg;
3465 	ctx->bdev = bdev;
3466 
3467 	pthread_mutex_lock(&bdev->internal.mutex);
3468 	if (bdev->internal.qos_mod_in_progress) {
3469 		pthread_mutex_unlock(&bdev->internal.mutex);
3470 		free(ctx);
3471 		cb_fn(cb_arg, -EAGAIN);
3472 		return;
3473 	}
3474 	bdev->internal.qos_mod_in_progress = true;
3475 
3476 	if (ios_per_sec > 0) {
3477 		if (bdev->internal.qos == NULL) {
3478 			/* Enabling */
3479 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
3480 			if (!bdev->internal.qos) {
3481 				pthread_mutex_unlock(&bdev->internal.mutex);
3482 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
3483 				free(ctx);
3484 				cb_fn(cb_arg, -ENOMEM);
3485 				return;
3486 			}
3487 
3488 			bdev->internal.qos->iops_rate_limit = ios_per_sec;
3489 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
3490 					      _spdk_bdev_enable_qos_msg, ctx,
3491 					      _spdk_bdev_enable_qos_done);
3492 		} else {
3493 			/* Updating */
3494 			bdev->internal.qos->iops_rate_limit = ios_per_sec;
3495 			spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx);
3496 		}
3497 	} else {
3498 		if (bdev->internal.qos != NULL) {
3499 			/* Disabling */
3500 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
3501 					      _spdk_bdev_disable_qos_msg, ctx,
3502 					      _spdk_bdev_disable_qos_msg_done);
3503 		} else {
3504 			pthread_mutex_unlock(&bdev->internal.mutex);
3505 			_spdk_bdev_set_qos_limit_done(ctx, 0);
3506 			return;
3507 		}
3508 	}
3509 
3510 	pthread_mutex_unlock(&bdev->internal.mutex);
3511 }
3512 
3513 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)
3514