xref: /spdk/lib/bdev/bdev.c (revision f56b2300633de2070005dd0fa244edb2a7a060d2)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/bdev.h"
37 #include "spdk/conf.h"
38 
39 #include "spdk/env.h"
40 #include "spdk/event.h"
41 #include "spdk/thread.h"
42 #include "spdk/likely.h"
43 #include "spdk/queue.h"
44 #include "spdk/nvme_spec.h"
45 #include "spdk/scsi_spec.h"
46 #include "spdk/util.h"
47 
48 #include "spdk/bdev_module.h"
49 #include "spdk_internal/log.h"
50 #include "spdk/string.h"
51 
52 #ifdef SPDK_CONFIG_VTUNE
53 #include "ittnotify.h"
54 #include "ittnotify_types.h"
55 int __itt_init_ittlib(const char *, __itt_group_id);
56 #endif
57 
58 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
59 #define SPDK_BDEV_IO_CACHE_SIZE			256
60 #define BUF_SMALL_POOL_SIZE			8192
61 #define BUF_LARGE_POOL_SIZE			1024
62 #define NOMEM_THRESHOLD_COUNT			8
63 #define ZERO_BUFFER_SIZE			0x100000
64 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
65 #define SPDK_BDEV_SEC_TO_USEC			1000000ULL
66 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
67 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
68 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000
69 #define SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC	10
70 
71 enum spdk_bdev_qos_type {
72 	SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT = 0,
73 	SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT,
74 	SPDK_BDEV_QOS_NUM_TYPES /* Keep last */
75 };
76 
77 static const char *qos_type_str[SPDK_BDEV_QOS_NUM_TYPES] = {"Limit_IOPS", "Limit_BWPS"};
78 
79 struct spdk_bdev_mgr {
80 	struct spdk_mempool *bdev_io_pool;
81 
82 	struct spdk_mempool *buf_small_pool;
83 	struct spdk_mempool *buf_large_pool;
84 
85 	void *zero_buffer;
86 
87 	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;
88 
89 	TAILQ_HEAD(, spdk_bdev) bdevs;
90 
91 	bool init_complete;
92 	bool module_init_complete;
93 
94 #ifdef SPDK_CONFIG_VTUNE
95 	__itt_domain	*domain;
96 #endif
97 };
98 
99 static struct spdk_bdev_mgr g_bdev_mgr = {
100 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
101 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
102 	.init_complete = false,
103 	.module_init_complete = false,
104 };
105 
106 static struct spdk_bdev_opts	g_bdev_opts = {
107 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
108 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
109 };
110 
111 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
112 static void			*g_init_cb_arg = NULL;
113 
114 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
115 static void			*g_fini_cb_arg = NULL;
116 static struct spdk_thread	*g_fini_thread = NULL;
117 
118 struct spdk_bdev_qos {
119 	/** Rate limit, in I/O per second */
120 	uint64_t iops_rate_limit;
121 
122 	/** Rate limit, in bytes per second */
123 	uint64_t byte_rate_limit;
124 
125 	/** The channel that all I/O are funneled through */
126 	struct spdk_bdev_channel *ch;
127 
128 	/** The thread on which the poller is running. */
129 	struct spdk_thread *thread;
130 
131 	/** Queue of I/O waiting to be issued. */
132 	bdev_io_tailq_t queued;
133 
134 	/** Maximum number of I/Os allowed to be issued in one timeslice (e.g., 1ms);
135 	 *  only valid for the master channel, which manages the outstanding I/Os. */
136 	uint64_t max_ios_per_timeslice;
137 
138 	/** Maximum number of bytes allowed to be issued in one timeslice (e.g., 1ms);
139 	 *  only valid for the master channel, which manages the outstanding I/Os. */
140 	uint64_t max_byte_per_timeslice;
141 
142 	/** Number of I/Os submitted in the current timeslice (e.g., 1ms) */
143 	uint64_t io_submitted_this_timeslice;
144 
145 	/** Number of bytes submitted in the current timeslice (e.g., 1ms) */
146 	uint64_t byte_submitted_this_timeslice;
147 
148 	/** Poller that processes queued I/O commands each timeslice. */
149 	struct spdk_poller *poller;
150 };
151 
152 struct spdk_bdev_mgmt_channel {
153 	bdev_io_stailq_t need_buf_small;
154 	bdev_io_stailq_t need_buf_large;
155 
156 	/*
157 	 * Each thread keeps a cache of bdev_io - this allows
158 	 *  bdev threads which are *not* DPDK threads to still
159 	 *  benefit from a per-thread bdev_io cache.  Without
160 	 *  this, non-DPDK threads fetching from the mempool
161 	 *  incur a cmpxchg on get and put.
162 	 */
163 	bdev_io_stailq_t per_thread_cache;
164 	uint32_t	per_thread_cache_count;
165 	uint32_t	bdev_io_cache_size;
166 
167 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
168 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
169 };
170 
171 /*
172  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
173  * will queue their I/O awaiting retry here. This makes it possible to retry sending
174  * I/O to one bdev after I/O from another bdev completes.
175  */
176 struct spdk_bdev_shared_resource {
177 	/* The bdev management channel */
178 	struct spdk_bdev_mgmt_channel *mgmt_ch;
179 
180 	/*
181 	 * Count of I/O submitted to bdev module and waiting for completion.
182 	 * Incremented before submit_request() is called on an spdk_bdev_io.
183 	 */
184 	uint64_t		io_outstanding;
185 
186 	/*
187 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
188 	 *  on this channel.
189 	 */
190 	bdev_io_tailq_t		nomem_io;
191 
192 	/*
193 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
194 	 */
195 	uint64_t		nomem_threshold;
196 
197 	/* I/O channel allocated by a bdev module */
198 	struct spdk_io_channel	*shared_ch;
199 
200 	/* Refcount of bdev channels using this resource */
201 	uint32_t		ref;
202 
203 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
204 };
205 
206 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
207 #define BDEV_CH_QOS_ENABLED		(1 << 1)
208 
209 struct spdk_bdev_channel {
210 	struct spdk_bdev	*bdev;
211 
212 	/* The channel for the underlying device */
213 	struct spdk_io_channel	*channel;
214 
215 	/* Per-io_device, per-thread data */
216 	struct spdk_bdev_shared_resource *shared_resource;
217 
218 	struct spdk_bdev_io_stat stat;
219 
220 	/*
221 	 * Count of I/O submitted through this channel and waiting for completion.
222 	 * Incremented before submit_request() is called on an spdk_bdev_io.
223 	 */
224 	uint64_t		io_outstanding;
225 
226 	bdev_io_tailq_t		queued_resets;
227 
228 	uint32_t		flags;
229 
230 #ifdef SPDK_CONFIG_VTUNE
231 	uint64_t		start_tsc;
232 	uint64_t		interval_tsc;
233 	__itt_string_handle	*handle;
234 	struct spdk_bdev_io_stat prev_stat;
235 #endif
236 
237 };
238 
239 struct spdk_bdev_desc {
240 	struct spdk_bdev		*bdev;
241 	spdk_bdev_remove_cb_t		remove_cb;
242 	void				*remove_ctx;
243 	bool				remove_scheduled;
244 	bool				write;
245 	TAILQ_ENTRY(spdk_bdev_desc)	link;
246 };
247 
248 struct spdk_bdev_iostat_ctx {
249 	struct spdk_bdev_io_stat *stat;
250 	spdk_bdev_get_device_stat_cb cb;
251 	void *cb_arg;
252 };
253 
254 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
255 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
256 
257 static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
258 
259 void
260 spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
261 {
262 	*opts = g_bdev_opts;
263 }
264 
265 int
266 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
267 {
268 	uint32_t min_pool_size;
269 
270 	/*
271 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
272 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
273 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
274 	 */
275 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
276 	if (opts->bdev_io_pool_size < min_pool_size) {
277 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
278 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
279 			    spdk_thread_get_count());
280 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
281 		return -1;
282 	}
283 
284 	g_bdev_opts = *opts;
285 	return 0;
286 }
287 
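/*
 * Usage sketch (illustrative, not called anywhere in this file): an application
 * can enlarge the bdev_io pool before spdk_bdev_initialize() runs. The function
 * and field names are the ones defined above; the chosen sizes are arbitrary
 * examples and error handling is minimal.
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts);
 *	opts.bdev_io_pool_size = 128 * 1024;
 *	opts.bdev_io_cache_size = 512;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("pool size too small for cache size and thread count\n");
 *	}
 */
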
288 struct spdk_bdev *
289 spdk_bdev_first(void)
290 {
291 	struct spdk_bdev *bdev;
292 
293 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
294 	if (bdev) {
295 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
296 	}
297 
298 	return bdev;
299 }
300 
301 struct spdk_bdev *
302 spdk_bdev_next(struct spdk_bdev *prev)
303 {
304 	struct spdk_bdev *bdev;
305 
306 	bdev = TAILQ_NEXT(prev, internal.link);
307 	if (bdev) {
308 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
309 	}
310 
311 	return bdev;
312 }
313 
314 static struct spdk_bdev *
315 _bdev_next_leaf(struct spdk_bdev *bdev)
316 {
317 	while (bdev != NULL) {
318 		if (bdev->internal.claim_module == NULL) {
319 			return bdev;
320 		} else {
321 			bdev = TAILQ_NEXT(bdev, internal.link);
322 		}
323 	}
324 
325 	return bdev;
326 }
327 
328 struct spdk_bdev *
329 spdk_bdev_first_leaf(void)
330 {
331 	struct spdk_bdev *bdev;
332 
333 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
334 
335 	if (bdev) {
336 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
337 	}
338 
339 	return bdev;
340 }
341 
342 struct spdk_bdev *
343 spdk_bdev_next_leaf(struct spdk_bdev *prev)
344 {
345 	struct spdk_bdev *bdev;
346 
347 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
348 
349 	if (bdev) {
350 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
351 	}
352 
353 	return bdev;
354 }
355 
356 struct spdk_bdev *
357 spdk_bdev_get_by_name(const char *bdev_name)
358 {
359 	struct spdk_bdev_alias *tmp;
360 	struct spdk_bdev *bdev = spdk_bdev_first();
361 
362 	while (bdev != NULL) {
363 		if (strcmp(bdev_name, bdev->name) == 0) {
364 			return bdev;
365 		}
366 
367 		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
368 			if (strcmp(bdev_name, tmp->alias) == 0) {
369 				return bdev;
370 			}
371 		}
372 
373 		bdev = spdk_bdev_next(bdev);
374 	}
375 
376 	return NULL;
377 }
378 
379 void
380 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
381 {
382 	struct iovec *iovs;
383 
384 	iovs = bdev_io->u.bdev.iovs;
385 
386 	assert(iovs != NULL);
387 	assert(bdev_io->u.bdev.iovcnt >= 1);
388 
389 	iovs[0].iov_base = buf;
390 	iovs[0].iov_len = len;
391 }
392 
393 static void
394 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
395 {
396 	struct spdk_mempool *pool;
397 	struct spdk_bdev_io *tmp;
398 	void *buf, *aligned_buf;
399 	bdev_io_stailq_t *stailq;
400 	struct spdk_bdev_mgmt_channel *ch;
401 
402 	assert(bdev_io->u.bdev.iovcnt == 1);
403 
404 	buf = bdev_io->internal.buf;
405 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
406 
407 	bdev_io->internal.buf = NULL;
408 
409 	if (bdev_io->internal.buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
410 		pool = g_bdev_mgr.buf_small_pool;
411 		stailq = &ch->need_buf_small;
412 	} else {
413 		pool = g_bdev_mgr.buf_large_pool;
414 		stailq = &ch->need_buf_large;
415 	}
416 
417 	if (STAILQ_EMPTY(stailq)) {
418 		spdk_mempool_put(pool, buf);
419 	} else {
420 		tmp = STAILQ_FIRST(stailq);
421 
422 		aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL);
423 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, tmp->internal.buf_len);
424 
425 		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
426 		tmp->internal.buf = buf;
427 		tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp);
428 	}
429 }
430 
431 void
432 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
433 {
434 	struct spdk_mempool *pool;
435 	bdev_io_stailq_t *stailq;
436 	void *buf, *aligned_buf;
437 	struct spdk_bdev_mgmt_channel *mgmt_ch;
438 
439 	assert(cb != NULL);
440 	assert(bdev_io->u.bdev.iovs != NULL);
441 
442 	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
443 		/* Buffer already present */
444 		cb(bdev_io->internal.ch->channel, bdev_io);
445 		return;
446 	}
447 
448 	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
449 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
450 
451 	bdev_io->internal.buf_len = len;
452 	bdev_io->internal.get_buf_cb = cb;
453 	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
454 		pool = g_bdev_mgr.buf_small_pool;
455 		stailq = &mgmt_ch->need_buf_small;
456 	} else {
457 		pool = g_bdev_mgr.buf_large_pool;
458 		stailq = &mgmt_ch->need_buf_large;
459 	}
460 
461 	buf = spdk_mempool_get(pool);
462 
463 	if (!buf) {
464 		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
465 	} else {
466 		aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL);
467 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
468 
469 		bdev_io->internal.buf = buf;
470 		bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io);
471 	}
472 }
473 
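/*
 * Usage sketch (illustrative): a bdev module that needs a data buffer for a READ
 * typically defers the work through spdk_bdev_io_get_buf() from its
 * submit_request() path. Names prefixed with "example_" are hypothetical.
 *
 *	static void
 *	example_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *	{
 *		// bdev_io->u.bdev.iovs[0] now points at a buffer of the requested length
 *		example_submit_read(ch, bdev_io);
 *	}
 *
 *	// inside the module's submit_request() for SPDK_BDEV_IO_TYPE_READ:
 *	spdk_bdev_io_get_buf(bdev_io, example_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */
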
474 static int
475 spdk_bdev_module_get_max_ctx_size(void)
476 {
477 	struct spdk_bdev_module *bdev_module;
478 	int max_bdev_module_size = 0;
479 
480 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
481 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
482 			max_bdev_module_size = bdev_module->get_ctx_size();
483 		}
484 	}
485 
486 	return max_bdev_module_size;
487 }
488 
489 void
490 spdk_bdev_config_text(FILE *fp)
491 {
492 	struct spdk_bdev_module *bdev_module;
493 
494 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
495 		if (bdev_module->config_text) {
496 			bdev_module->config_text(fp);
497 		}
498 	}
499 }
500 
501 void
502 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
503 {
504 	struct spdk_bdev_module *bdev_module;
505 	struct spdk_bdev *bdev;
506 
507 	assert(w != NULL);
508 
509 	spdk_json_write_array_begin(w);
510 
511 	spdk_json_write_object_begin(w);
512 	spdk_json_write_named_string(w, "method", "set_bdev_options");
513 	spdk_json_write_name(w, "params");
514 	spdk_json_write_object_begin(w);
515 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
516 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
517 	spdk_json_write_object_end(w);
518 	spdk_json_write_object_end(w);
519 
520 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
521 		if (bdev_module->config_json) {
522 			bdev_module->config_json(w);
523 		}
524 	}
525 
526 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
527 		spdk_bdev_config_json(bdev, w);
528 	}
529 
530 	spdk_json_write_array_end(w);
531 }
532 
533 static int
534 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
535 {
536 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
537 	struct spdk_bdev_io *bdev_io;
538 	uint32_t i;
539 
540 	STAILQ_INIT(&ch->need_buf_small);
541 	STAILQ_INIT(&ch->need_buf_large);
542 
543 	STAILQ_INIT(&ch->per_thread_cache);
544 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
545 
546 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
547 	ch->per_thread_cache_count = 0;
548 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
549 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
550 		assert(bdev_io != NULL);
551 		ch->per_thread_cache_count++;
552 		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
553 	}
554 
555 	TAILQ_INIT(&ch->shared_resources);
556 	TAILQ_INIT(&ch->io_wait_queue);
557 
558 	return 0;
559 }
560 
561 static void
562 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
563 {
564 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
565 	struct spdk_bdev_io *bdev_io;
566 
567 	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
568 		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
569 	}
570 
571 	if (!TAILQ_EMPTY(&ch->shared_resources)) {
572 		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
573 	}
574 
575 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
576 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
577 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
578 		ch->per_thread_cache_count--;
579 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
580 	}
581 
582 	assert(ch->per_thread_cache_count == 0);
583 }
584 
585 static void
586 spdk_bdev_init_complete(int rc)
587 {
588 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
589 	void *cb_arg = g_init_cb_arg;
590 	struct spdk_bdev_module *m;
591 
592 	g_bdev_mgr.init_complete = true;
593 	g_init_cb_fn = NULL;
594 	g_init_cb_arg = NULL;
595 
596 	/*
597 	 * For modules that need to know when subsystem init is complete,
598 	 * inform them now.
599 	 */
600 	if (rc == 0) {
601 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
602 			if (m->init_complete) {
603 				m->init_complete();
604 			}
605 		}
606 	}
607 
608 	cb_fn(cb_arg, rc);
609 }
610 
611 static void
612 spdk_bdev_module_action_complete(void)
613 {
614 	struct spdk_bdev_module *m;
615 
616 	/*
617 	 * Don't finish bdev subsystem initialization if
618 	 * module pre-initialization is still in progress, or
619 	 * the subsystem has already been initialized.
620 	 */
621 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
622 		return;
623 	}
624 
625 	/*
626 	 * Check all bdev modules for inits/examinations in progress. If any
627 	 * exist, return immediately since we cannot finish bdev subsystem
628 	 * initialization until all are completed.
629 	 */
630 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
631 		if (m->internal.action_in_progress > 0) {
632 			return;
633 		}
634 	}
635 
636 	/*
637 	 * Modules already finished initialization - now that all
638 	 * the bdev modules have finished their asynchronous I/O
639 	 * processing, the entire bdev layer can be marked as complete.
640 	 */
641 	spdk_bdev_init_complete(0);
642 }
643 
644 static void
645 spdk_bdev_module_action_done(struct spdk_bdev_module *module)
646 {
647 	assert(module->internal.action_in_progress > 0);
648 	module->internal.action_in_progress--;
649 	spdk_bdev_module_action_complete();
650 }
651 
652 void
653 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
654 {
655 	spdk_bdev_module_action_done(module);
656 }
657 
658 void
659 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
660 {
661 	spdk_bdev_module_action_done(module);
662 }
663 
664 static int
665 spdk_bdev_modules_init(void)
666 {
667 	struct spdk_bdev_module *module;
668 	int rc = 0;
669 
670 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
671 		rc = module->module_init();
672 		if (rc != 0) {
673 			break;
674 		}
675 	}
676 
677 	g_bdev_mgr.module_init_complete = true;
678 	return rc;
679 }
680 
681 
682 static void
683 spdk_bdev_init_failed_complete(void *cb_arg)
684 {
685 	spdk_bdev_init_complete(-1);
686 }
687 
688 static void
689 spdk_bdev_init_failed(void *cb_arg)
690 {
691 	spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL);
692 }
693 
694 void
695 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
696 {
697 	struct spdk_conf_section *sp;
698 	struct spdk_bdev_opts bdev_opts;
699 	int32_t bdev_io_pool_size, bdev_io_cache_size;
700 	int cache_size;
701 	int rc = 0;
702 	char mempool_name[32];
703 
704 	assert(cb_fn != NULL);
705 
706 	sp = spdk_conf_find_section(NULL, "Bdev");
707 	if (sp != NULL) {
708 		spdk_bdev_get_opts(&bdev_opts);
709 
710 		bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
711 		if (bdev_io_pool_size >= 0) {
712 			bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
713 		}
714 
715 		bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
716 		if (bdev_io_cache_size >= 0) {
717 			bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
718 		}
719 
720 		if (spdk_bdev_set_opts(&bdev_opts)) {
721 			spdk_bdev_init_complete(-1);
722 			return;
723 		}
724 
725 		assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
726 	}
727 
728 	g_init_cb_fn = cb_fn;
729 	g_init_cb_arg = cb_arg;
730 
731 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
732 
733 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
734 				  g_bdev_opts.bdev_io_pool_size,
735 				  sizeof(struct spdk_bdev_io) +
736 				  spdk_bdev_module_get_max_ctx_size(),
737 				  0,
738 				  SPDK_ENV_SOCKET_ID_ANY);
739 
740 	if (g_bdev_mgr.bdev_io_pool == NULL) {
741 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
742 		spdk_bdev_init_complete(-1);
743 		return;
744 	}
745 
746 	/**
747 	 * Ensure no more than half of the total buffers end up in local caches, by
748 	 *   using spdk_thread_get_count() to determine how many local caches we need
749 	 *   to account for.
750 	 */
751 	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count());
752 	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());
753 
754 	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
755 				    BUF_SMALL_POOL_SIZE,
756 				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
757 				    cache_size,
758 				    SPDK_ENV_SOCKET_ID_ANY);
759 	if (!g_bdev_mgr.buf_small_pool) {
760 		SPDK_ERRLOG("create rbuf small pool failed\n");
761 		spdk_bdev_init_complete(-1);
762 		return;
763 	}
764 
765 	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count());
766 	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());
767 
768 	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
769 				    BUF_LARGE_POOL_SIZE,
770 				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
771 				    cache_size,
772 				    SPDK_ENV_SOCKET_ID_ANY);
773 	if (!g_bdev_mgr.buf_large_pool) {
774 		SPDK_ERRLOG("create rbuf large pool failed\n");
775 		spdk_bdev_init_complete(-1);
776 		return;
777 	}
778 
779 	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
780 				 NULL);
781 	if (!g_bdev_mgr.zero_buffer) {
782 		SPDK_ERRLOG("create bdev zero buffer failed\n");
783 		spdk_bdev_init_complete(-1);
784 		return;
785 	}
786 
787 #ifdef SPDK_CONFIG_VTUNE
788 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
789 #endif
790 
791 	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
792 				spdk_bdev_mgmt_channel_destroy,
793 				sizeof(struct spdk_bdev_mgmt_channel));
794 
795 	rc = spdk_bdev_modules_init();
796 	if (rc != 0) {
797 		SPDK_ERRLOG("bdev modules init failed\n");
798 		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL);
799 		return;
800 	}
801 
802 	spdk_bdev_module_action_complete();
803 }
804 
805 static void
806 spdk_bdev_mgr_unregister_cb(void *io_device)
807 {
808 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
809 
810 	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
811 		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
812 			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
813 			    g_bdev_opts.bdev_io_pool_size);
814 	}
815 
816 	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
817 		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
818 			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
819 			    BUF_SMALL_POOL_SIZE);
820 		assert(false);
821 	}
822 
823 	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
824 		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
825 			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
826 			    BUF_LARGE_POOL_SIZE);
827 		assert(false);
828 	}
829 
830 	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
831 	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
832 	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
833 	spdk_dma_free(g_bdev_mgr.zero_buffer);
834 
835 	cb_fn(g_fini_cb_arg);
836 	g_fini_cb_fn = NULL;
837 	g_fini_cb_arg = NULL;
838 }
839 
840 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
841 
842 static void
843 spdk_bdev_module_finish_iter(void *arg)
844 {
845 	struct spdk_bdev_module *bdev_module;
846 
847 	/* Start iterating from the last touched module */
848 	if (!g_resume_bdev_module) {
849 		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
850 	} else {
851 		bdev_module = TAILQ_NEXT(g_resume_bdev_module, internal.tailq);
852 	}
853 
854 	while (bdev_module) {
855 		if (bdev_module->async_fini) {
856 			/* Save our place so we can resume later. We must
857 			 * save the variable here, before calling module_fini()
858 			 * below, because in some cases the module may immediately
859 			 * call spdk_bdev_module_finish_done() and re-enter
860 			 * this function to continue iterating. */
861 			g_resume_bdev_module = bdev_module;
862 		}
863 
864 		if (bdev_module->module_fini) {
865 			bdev_module->module_fini();
866 		}
867 
868 		if (bdev_module->async_fini) {
869 			return;
870 		}
871 
872 		bdev_module = TAILQ_NEXT(bdev_module, internal.tailq);
873 	}
874 
875 	g_resume_bdev_module = NULL;
876 	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
877 }
878 
879 void
880 spdk_bdev_module_finish_done(void)
881 {
882 	if (spdk_get_thread() != g_fini_thread) {
883 		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
884 	} else {
885 		spdk_bdev_module_finish_iter(NULL);
886 	}
887 }
888 
889 static void
890 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
891 {
892 	struct spdk_bdev *bdev = cb_arg;
893 
894 	if (bdeverrno && bdev) {
895 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
896 			     bdev->name);
897 
898 		/*
899 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
900 		 *  bdev; try to continue by manually removing this bdev from the list and moving on
901 		 *  to the next bdev in the list.
902 		 */
903 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
904 	}
905 
906 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
907 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
908 		/*
909 		 * Bdev module finish need to be deffered as we might be in the middle of some context
910 		 * Bdev module finish needs to be deferred as we might be in the middle of some context
911 		 * after returning.
912 		 */
913 		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
914 		return;
915 	}
916 
917 	/*
918 	 * Unregister the first bdev in the list.
919 	 *
920 	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
921 	 *  calling the remove_cb of the descriptors first.
922 	 *
923 	 * Once this bdev and all of its open descriptors have been cleaned up, this function
924 	 *  will be called again via the unregister completion callback to continue the cleanup
925 	 *  process with the next bdev.
926 	 */
927 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
928 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
929 	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
930 }
931 
932 void
933 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
934 {
935 	assert(cb_fn != NULL);
936 
937 	g_fini_thread = spdk_get_thread();
938 
939 	g_fini_cb_fn = cb_fn;
940 	g_fini_cb_arg = cb_arg;
941 
942 	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
943 }
944 
945 static struct spdk_bdev_io *
946 spdk_bdev_get_io(struct spdk_bdev_channel *channel)
947 {
948 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
949 	struct spdk_bdev_io *bdev_io;
950 
951 	if (ch->per_thread_cache_count > 0) {
952 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
953 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
954 		ch->per_thread_cache_count--;
955 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
956 		/*
957 		 * Don't try to look for bdev_ios in the global pool if there are
958 		 * waiters on bdev_ios - we don't want this caller to jump the line.
959 		 */
960 		bdev_io = NULL;
961 	} else {
962 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
963 	}
964 
965 	return bdev_io;
966 }
967 
968 void
969 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
970 {
971 	struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
972 
973 	assert(bdev_io != NULL);
974 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
975 
976 	if (bdev_io->internal.buf != NULL) {
977 		spdk_bdev_io_put_buf(bdev_io);
978 	}
979 
980 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
981 		ch->per_thread_cache_count++;
982 		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
983 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
984 			struct spdk_bdev_io_wait_entry *entry;
985 
986 			entry = TAILQ_FIRST(&ch->io_wait_queue);
987 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
988 			entry->cb_fn(entry->cb_arg);
989 		}
990 	} else {
991 		/* We should never have a full cache with entries on the io wait queue. */
992 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
993 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
994 	}
995 }
996 
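/*
 * Usage sketch (illustrative): when a submission call such as
 * spdk_bdev_read_blocks() fails with -ENOMEM because this pool/cache is empty,
 * the caller can register a wait entry and retry once a bdev_io is freed. This
 * assumes the spdk_bdev_queue_io_wait() helper and struct spdk_bdev_io_wait_entry
 * declared in include/spdk/bdev.h for this revision; "example_retry" and
 * "example_ctx" are hypothetical.
 *
 *	static void
 *	example_retry(void *arg)
 *	{
 *		struct example_ctx *ctx = arg;
 *
 *		// re-issue the I/O that previously got -ENOMEM
 *		example_submit(ctx);
 *	}
 *
 *	ctx->wait_entry.bdev = bdev;
 *	ctx->wait_entry.cb_fn = example_retry;
 *	ctx->wait_entry.cb_arg = ctx;
 *	spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->wait_entry);
 */
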
997 static uint64_t
998 _spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
999 {
1000 	struct spdk_bdev	*bdev = bdev_io->bdev;
1001 
1002 	switch (bdev_io->type) {
1003 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
1004 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1005 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1006 		return bdev_io->u.nvme_passthru.nbytes;
1007 	case SPDK_BDEV_IO_TYPE_READ:
1008 	case SPDK_BDEV_IO_TYPE_WRITE:
1009 	case SPDK_BDEV_IO_TYPE_UNMAP:
1010 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1011 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
1012 	default:
1013 		return 0;
1014 	}
1015 }
1016 
1017 static void
1018 _spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
1019 {
1020 	struct spdk_bdev_io		*bdev_io = NULL;
1021 	struct spdk_bdev		*bdev = ch->bdev;
1022 	struct spdk_bdev_qos		*qos = bdev->internal.qos;
1023 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
1024 
1025 	while (!TAILQ_EMPTY(&qos->queued)) {
1026 		if (qos->max_ios_per_timeslice > 0 &&
1027 		    qos->io_submitted_this_timeslice >= qos->max_ios_per_timeslice) {
1028 			break;
1029 		}
1030 
1031 		if (qos->max_byte_per_timeslice > 0 &&
1032 		    qos->byte_submitted_this_timeslice >= qos->max_byte_per_timeslice) {
1033 			break;
1034 		}
1035 
1036 		bdev_io = TAILQ_FIRST(&qos->queued);
1037 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
1038 		qos->io_submitted_this_timeslice++;
1039 		qos->byte_submitted_this_timeslice += _spdk_bdev_get_io_size_in_byte(bdev_io);
1040 		ch->io_outstanding++;
1041 		shared_resource->io_outstanding++;
1042 		bdev->fn_table->submit_request(ch->channel, bdev_io);
1043 	}
1044 }
1045 
1046 static void
1047 _spdk_bdev_io_submit(void *ctx)
1048 {
1049 	struct spdk_bdev_io *bdev_io = ctx;
1050 	struct spdk_bdev *bdev = bdev_io->bdev;
1051 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1052 	struct spdk_io_channel *ch = bdev_ch->channel;
1053 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1054 
1055 	bdev_io->internal.submit_tsc = spdk_get_ticks();
1056 	bdev_ch->io_outstanding++;
1057 	shared_resource->io_outstanding++;
1058 	bdev_io->internal.in_submit_request = true;
1059 	if (spdk_likely(bdev_ch->flags == 0)) {
1060 		if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
1061 			bdev->fn_table->submit_request(ch, bdev_io);
1062 		} else {
1063 			bdev_ch->io_outstanding--;
1064 			shared_resource->io_outstanding--;
1065 			TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
1066 		}
1067 	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
1068 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1069 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
1070 		bdev_ch->io_outstanding--;
1071 		shared_resource->io_outstanding--;
1072 		TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
1073 		_spdk_bdev_qos_io_submit(bdev_ch);
1074 	} else {
1075 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
1076 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1077 	}
1078 	bdev_io->internal.in_submit_request = false;
1079 }
1080 
1081 static void
1082 spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
1083 {
1084 	struct spdk_bdev *bdev = bdev_io->bdev;
1085 	struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
1086 
1087 	assert(thread != NULL);
1088 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
1089 
1090 	if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) {
1091 		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
1092 			_spdk_bdev_io_submit(bdev_io);
1093 		} else {
1094 			bdev_io->internal.io_submit_ch = bdev_io->internal.ch;
1095 			bdev_io->internal.ch = bdev->internal.qos->ch;
1096 			spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io);
1097 		}
1098 	} else {
1099 		_spdk_bdev_io_submit(bdev_io);
1100 	}
1101 }
1102 
1103 static void
1104 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
1105 {
1106 	struct spdk_bdev *bdev = bdev_io->bdev;
1107 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1108 	struct spdk_io_channel *ch = bdev_ch->channel;
1109 
1110 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
1111 
1112 	bdev_io->internal.in_submit_request = true;
1113 	bdev->fn_table->submit_request(ch, bdev_io);
1114 	bdev_io->internal.in_submit_request = false;
1115 }
1116 
1117 static void
1118 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
1119 		  struct spdk_bdev *bdev, void *cb_arg,
1120 		  spdk_bdev_io_completion_cb cb)
1121 {
1122 	bdev_io->bdev = bdev;
1123 	bdev_io->internal.caller_ctx = cb_arg;
1124 	bdev_io->internal.cb = cb;
1125 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1126 	bdev_io->internal.in_submit_request = false;
1127 	bdev_io->internal.buf = NULL;
1128 	bdev_io->internal.io_submit_ch = NULL;
1129 }
1130 
1131 static bool
1132 _spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
1133 {
1134 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
1135 }
1136 
1137 bool
1138 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
1139 {
1140 	bool supported;
1141 
1142 	supported = _spdk_bdev_io_type_supported(bdev, io_type);
1143 
1144 	if (!supported) {
1145 		switch (io_type) {
1146 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
1147 			/* The bdev layer will emulate write zeroes as long as write is supported. */
1148 			supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
1149 			break;
1150 		default:
1151 			break;
1152 		}
1153 	}
1154 
1155 	return supported;
1156 }
1157 
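/*
 * Usage sketch (illustrative): callers can probe capabilities before issuing
 * optional I/O types; note that WRITE_ZEROES reports as supported whenever
 * WRITE is, because the bdev layer emulates it (see above).
 *
 *	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
 *		// safe to issue unmap requests on this bdev
 *	}
 */
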
1158 int
1159 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1160 {
1161 	if (bdev->fn_table->dump_info_json) {
1162 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
1163 	}
1164 
1165 	return 0;
1166 }
1167 
1168 void
1169 spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1170 {
1171 	assert(bdev != NULL);
1172 	assert(w != NULL);
1173 
1174 	if (bdev->fn_table->write_config_json) {
1175 		bdev->fn_table->write_config_json(bdev, w);
1176 	} else {
1177 		spdk_json_write_object_begin(w);
1178 		spdk_json_write_named_string(w, "name", bdev->name);
1179 		spdk_json_write_object_end(w);
1180 	}
1181 }
1182 
1183 static void
1184 spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
1185 {
1186 	uint64_t max_ios_per_timeslice = 0, max_byte_per_timeslice = 0;
1187 
1188 	if (qos->iops_rate_limit > 0) {
1189 		max_ios_per_timeslice = qos->iops_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
1190 					SPDK_BDEV_SEC_TO_USEC;
1191 		qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice,
1192 						      SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
1193 	}
1194 
1195 	if (qos->byte_rate_limit > 0) {
1196 		max_byte_per_timeslice = qos->byte_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
1197 					 SPDK_BDEV_SEC_TO_USEC;
1198 		qos->max_byte_per_timeslice = spdk_max(max_byte_per_timeslice,
1199 						       SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE);
1200 	}
1201 }
1202 
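/*
 * Worked example: with iops_rate_limit = 20000 and a 1000 usec timeslice,
 * max_ios_per_timeslice = 20000 * 1000 / 1000000 = 20 I/Os per timeslice.
 * With byte_rate_limit = 100 * 1024 * 1024 (100 MiB/s), max_byte_per_timeslice
 * = 104857600 * 1000 / 1000000 = 104857 bytes. Results below the
 * SPDK_BDEV_QOS_MIN_*_PER_TIMESLICE floors are clamped up by spdk_max() above.
 */
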
1203 static int
1204 spdk_bdev_channel_poll_qos(void *arg)
1205 {
1206 	struct spdk_bdev_qos *qos = arg;
1207 
1208 	/* Reset for next round of rate limiting */
1209 	qos->io_submitted_this_timeslice = 0;
1210 
1211 	/* If more bytes were sent in the last timeslice, allow fewer in this one */
1212 	if (qos->byte_submitted_this_timeslice > qos->max_byte_per_timeslice) {
1213 		qos->byte_submitted_this_timeslice -= qos->max_byte_per_timeslice;
1214 	} else {
1215 		qos->byte_submitted_this_timeslice = 0;
1216 	}
1217 
1218 	_spdk_bdev_qos_io_submit(qos->ch);
1219 
1220 	return -1;
1221 }
1222 
1223 static void
1224 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
1225 {
1226 	struct spdk_bdev_shared_resource *shared_resource;
1227 
1228 	if (!ch) {
1229 		return;
1230 	}
1231 
1232 	if (ch->channel) {
1233 		spdk_put_io_channel(ch->channel);
1234 	}
1235 
1236 	assert(ch->io_outstanding == 0);
1237 
1238 	shared_resource = ch->shared_resource;
1239 	if (shared_resource) {
1240 		assert(ch->io_outstanding == 0);
1241 		assert(shared_resource->ref > 0);
1242 		shared_resource->ref--;
1243 		if (shared_resource->ref == 0) {
1244 			assert(shared_resource->io_outstanding == 0);
1245 			TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
1246 			spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
1247 			free(shared_resource);
1248 		}
1249 	}
1250 }
1251 
1252 /* Caller must hold bdev->internal.mutex. */
1253 static void
1254 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
1255 {
1256 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1257 
1258 	/* Rate limiting is enabled on this bdev */
1259 	if (qos) {
1260 		if (qos->ch == NULL) {
1261 			struct spdk_io_channel *io_ch;
1262 
1263 			SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
1264 				      bdev->name, spdk_get_thread());
1265 
1266 			/* No qos channel has been selected, so set one up */
1267 
1268 			/* Take another reference to ch */
1269 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
1270 			qos->ch = ch;
1271 
1272 			qos->thread = spdk_io_channel_get_thread(io_ch);
1273 
1274 			TAILQ_INIT(&qos->queued);
1275 			spdk_bdev_qos_update_max_quota_per_timeslice(qos);
1276 			qos->io_submitted_this_timeslice = 0;
1277 			qos->byte_submitted_this_timeslice = 0;
1278 
1279 			qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos,
1280 							   qos,
1281 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
1282 		}
1283 
1284 		ch->flags |= BDEV_CH_QOS_ENABLED;
1285 	}
1286 }
1287 
1288 static int
1289 spdk_bdev_channel_create(void *io_device, void *ctx_buf)
1290 {
1291 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
1292 	struct spdk_bdev_channel	*ch = ctx_buf;
1293 	struct spdk_io_channel		*mgmt_io_ch;
1294 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
1295 	struct spdk_bdev_shared_resource *shared_resource;
1296 
1297 	ch->bdev = bdev;
1298 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
1299 	if (!ch->channel) {
1300 		return -1;
1301 	}
1302 
1303 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
1304 	if (!mgmt_io_ch) {
1305 		return -1;
1306 	}
1307 
1308 	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
1309 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
1310 		if (shared_resource->shared_ch == ch->channel) {
1311 			spdk_put_io_channel(mgmt_io_ch);
1312 			shared_resource->ref++;
1313 			break;
1314 		}
1315 	}
1316 
1317 	if (shared_resource == NULL) {
1318 		shared_resource = calloc(1, sizeof(*shared_resource));
1319 		if (shared_resource == NULL) {
1320 			spdk_put_io_channel(mgmt_io_ch);
1321 			return -1;
1322 		}
1323 
1324 		shared_resource->mgmt_ch = mgmt_ch;
1325 		shared_resource->io_outstanding = 0;
1326 		TAILQ_INIT(&shared_resource->nomem_io);
1327 		shared_resource->nomem_threshold = 0;
1328 		shared_resource->shared_ch = ch->channel;
1329 		shared_resource->ref = 1;
1330 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
1331 	}
1332 
1333 	memset(&ch->stat, 0, sizeof(ch->stat));
1334 	ch->stat.ticks_rate = spdk_get_ticks_hz();
1335 	ch->io_outstanding = 0;
1336 	TAILQ_INIT(&ch->queued_resets);
1337 	ch->flags = 0;
1338 	ch->shared_resource = shared_resource;
1339 
1340 #ifdef SPDK_CONFIG_VTUNE
1341 	{
1342 		char *name;
1343 		__itt_init_ittlib(NULL, 0);
1344 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
1345 		if (!name) {
1346 			_spdk_bdev_channel_destroy_resource(ch);
1347 			return -1;
1348 		}
1349 		ch->handle = __itt_string_handle_create(name);
1350 		free(name);
1351 		ch->start_tsc = spdk_get_ticks();
1352 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
1353 		memset(&ch->prev_stat, 0, sizeof(ch->prev_stat));
1354 	}
1355 #endif
1356 
1357 	pthread_mutex_lock(&bdev->internal.mutex);
1358 	_spdk_bdev_enable_qos(bdev, ch);
1359 	pthread_mutex_unlock(&bdev->internal.mutex);
1360 
1361 	return 0;
1362 }
1363 
1364 /*
1365  * Abort I/O that are waiting on a data buffer.  These types of I/O are
1366  *  linked using the spdk_bdev_io internal.buf_link STAILQ_ENTRY.
1367  */
1368 static void
1369 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
1370 {
1371 	bdev_io_stailq_t tmp;
1372 	struct spdk_bdev_io *bdev_io;
1373 
1374 	STAILQ_INIT(&tmp);
1375 
1376 	while (!STAILQ_EMPTY(queue)) {
1377 		bdev_io = STAILQ_FIRST(queue);
1378 		STAILQ_REMOVE_HEAD(queue, internal.buf_link);
1379 		if (bdev_io->internal.ch == ch) {
1380 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1381 		} else {
1382 			STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link);
1383 		}
1384 	}
1385 
1386 	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
1387 }
1388 
1389 /*
1390  * Abort I/O that are queued waiting for submission.  These types of I/O are
1391  *  linked using the spdk_bdev_io internal.link TAILQ_ENTRY.
1392  */
1393 static void
1394 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
1395 {
1396 	struct spdk_bdev_io *bdev_io, *tmp;
1397 
1398 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
1399 		if (bdev_io->internal.ch == ch) {
1400 			TAILQ_REMOVE(queue, bdev_io, internal.link);
1401 			/*
1402 			 * spdk_bdev_io_complete() assumes that the completed I/O had
1403 			 *  been submitted to the bdev module.  Since in this case it
1404 			 *  hadn't, bump io_outstanding to account for the decrement
1405 			 *  that spdk_bdev_io_complete() will do.
1406 			 */
1407 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
1408 				ch->io_outstanding++;
1409 				ch->shared_resource->io_outstanding++;
1410 			}
1411 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1412 		}
1413 	}
1414 }
1415 
1416 static void
1417 spdk_bdev_qos_channel_destroy(void *cb_arg)
1418 {
1419 	struct spdk_bdev_qos *qos = cb_arg;
1420 
1421 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
1422 	spdk_poller_unregister(&qos->poller);
1423 
1424 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos);
1425 
1426 	free(qos);
1427 }
1428 
1429 static int
1430 spdk_bdev_qos_destroy(struct spdk_bdev *bdev)
1431 {
1432 	/*
1433 	 * Cleanly shutting down the QoS poller is tricky, because
1434 	 * during the asynchronous operation the user could open
1435 	 * a new descriptor and create a new channel, spawning
1436 	 * a new QoS poller.
1437 	 *
1438 	 * The strategy is to create a new QoS structure here and swap it
1439 	 * in. The shutdown path then continues to refer to the old one
1440 	 * until it completes and then releases it.
1441 	 */
1442 	struct spdk_bdev_qos *new_qos, *old_qos;
1443 
1444 	old_qos = bdev->internal.qos;
1445 
1446 	new_qos = calloc(1, sizeof(*new_qos));
1447 	if (!new_qos) {
1448 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
1449 		return -ENOMEM;
1450 	}
1451 
1452 	/* Copy the old QoS data into the newly allocated structure */
1453 	memcpy(new_qos, old_qos, sizeof(*new_qos));
1454 
1455 	/* Zero out the key parts of the QoS structure */
1456 	new_qos->ch = NULL;
1457 	new_qos->thread = NULL;
1458 	new_qos->max_ios_per_timeslice = 0;
1459 	new_qos->max_byte_per_timeslice = 0;
1460 	new_qos->io_submitted_this_timeslice = 0;
1461 	new_qos->byte_submitted_this_timeslice = 0;
1462 	new_qos->poller = NULL;
1463 	TAILQ_INIT(&new_qos->queued);
1464 
1465 	bdev->internal.qos = new_qos;
1466 
1467 	if (old_qos->thread == NULL) {
1468 		free(old_qos);
1469 	} else {
1470 		spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy,
1471 				     old_qos);
1472 	}
1473 
1474 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
1475 	 * been destroyed yet. The destruction path will end up waiting for the final
1476 	 * channel to be put before it releases resources. */
1477 
1478 	return 0;
1479 }
1480 
1481 static void
1482 _spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
1483 {
1484 	total->bytes_read += add->bytes_read;
1485 	total->num_read_ops += add->num_read_ops;
1486 	total->bytes_written += add->bytes_written;
1487 	total->num_write_ops += add->num_write_ops;
1488 	total->read_latency_ticks += add->read_latency_ticks;
1489 	total->write_latency_ticks += add->write_latency_ticks;
1490 }
1491 
1492 static void
1493 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
1494 {
1495 	struct spdk_bdev_channel	*ch = ctx_buf;
1496 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
1497 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
1498 
1499 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
1500 		      spdk_get_thread());
1501 
1502 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
1503 	pthread_mutex_lock(&ch->bdev->internal.mutex);
1504 	_spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat);
1505 	pthread_mutex_unlock(&ch->bdev->internal.mutex);
1506 
1507 	mgmt_ch = shared_resource->mgmt_ch;
1508 
1509 	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
1510 	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch);
1511 	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
1512 	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);
1513 
1514 	_spdk_bdev_channel_destroy_resource(ch);
1515 }
1516 
1517 int
1518 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
1519 {
1520 	struct spdk_bdev_alias *tmp;
1521 
1522 	if (alias == NULL) {
1523 		SPDK_ERRLOG("Empty alias passed\n");
1524 		return -EINVAL;
1525 	}
1526 
1527 	if (spdk_bdev_get_by_name(alias)) {
1528 		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
1529 		return -EEXIST;
1530 	}
1531 
1532 	tmp = calloc(1, sizeof(*tmp));
1533 	if (tmp == NULL) {
1534 		SPDK_ERRLOG("Unable to allocate alias\n");
1535 		return -ENOMEM;
1536 	}
1537 
1538 	tmp->alias = strdup(alias);
1539 	if (tmp->alias == NULL) {
1540 		free(tmp);
1541 		SPDK_ERRLOG("Unable to allocate alias\n");
1542 		return -ENOMEM;
1543 	}
1544 
1545 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
1546 
1547 	return 0;
1548 }
1549 
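/*
 * Usage sketch (illustrative): aliases act as alternate lookup names, so a bdev
 * registered as "Nvme0n1" could also be found under a friendlier label. The
 * alias string below is an arbitrary example.
 *
 *	if (spdk_bdev_alias_add(bdev, "data-disk-0") == 0) {
 *		assert(spdk_bdev_get_by_name("data-disk-0") == bdev);
 *		spdk_bdev_alias_del(bdev, "data-disk-0");
 *	}
 */
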
1550 int
1551 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
1552 {
1553 	struct spdk_bdev_alias *tmp;
1554 
1555 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
1556 		if (strcmp(alias, tmp->alias) == 0) {
1557 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
1558 			free(tmp->alias);
1559 			free(tmp);
1560 			return 0;
1561 		}
1562 	}
1563 
1564 	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);
1565 
1566 	return -ENOENT;
1567 }
1568 
1569 struct spdk_io_channel *
1570 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
1571 {
1572 	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
1573 }
1574 
1575 const char *
1576 spdk_bdev_get_name(const struct spdk_bdev *bdev)
1577 {
1578 	return bdev->name;
1579 }
1580 
1581 const char *
1582 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
1583 {
1584 	return bdev->product_name;
1585 }
1586 
1587 const struct spdk_bdev_aliases_list *
1588 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
1589 {
1590 	return &bdev->aliases;
1591 }
1592 
1593 uint32_t
1594 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
1595 {
1596 	return bdev->blocklen;
1597 }
1598 
1599 uint64_t
1600 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
1601 {
1602 	return bdev->blockcnt;
1603 }
1604 
1605 uint64_t
1606 spdk_bdev_get_qos_ios_per_sec(struct spdk_bdev *bdev)
1607 {
1608 	uint64_t iops_rate_limit = 0;
1609 
1610 	pthread_mutex_lock(&bdev->internal.mutex);
1611 	if (bdev->internal.qos) {
1612 		iops_rate_limit = bdev->internal.qos->iops_rate_limit;
1613 	}
1614 	pthread_mutex_unlock(&bdev->internal.mutex);
1615 
1616 	return iops_rate_limit;
1617 }
1618 
1619 size_t
1620 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
1621 {
1622 	/* TODO: push this logic down to the bdev modules */
1623 	if (bdev->need_aligned_buffer) {
1624 		return bdev->blocklen;
1625 	}
1626 
1627 	return 1;
1628 }
1629 
1630 uint32_t
1631 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
1632 {
1633 	return bdev->optimal_io_boundary;
1634 }
1635 
1636 bool
1637 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
1638 {
1639 	return bdev->write_cache;
1640 }
1641 
1642 const struct spdk_uuid *
1643 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
1644 {
1645 	return &bdev->uuid;
1646 }
1647 
1648 uint64_t
1649 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
1650 {
1651 	return bdev->internal.measured_queue_depth;
1652 }
1653 
1654 uint64_t
1655 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
1656 {
1657 	return bdev->internal.period;
1658 }
1659 
1660 uint64_t
1661 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
1662 {
1663 	return bdev->internal.weighted_io_time;
1664 }
1665 
1666 uint64_t
1667 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
1668 {
1669 	return bdev->internal.io_time;
1670 }
1671 
1672 static void
1673 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status)
1674 {
1675 	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
1676 
1677 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
1678 
1679 	if (bdev->internal.measured_queue_depth) {
1680 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
1681 	}
1682 }
1683 
1684 static void
1685 _calculate_measured_qd(struct spdk_io_channel_iter *i)
1686 {
1687 	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
1688 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
1689 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch);
1690 
1691 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
1692 	spdk_for_each_channel_continue(i, 0);
1693 }
1694 
1695 static int
1696 spdk_bdev_calculate_measured_queue_depth(void *ctx)
1697 {
1698 	struct spdk_bdev *bdev = ctx;
1699 	bdev->internal.temporary_queue_depth = 0;
1700 	spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev,
1701 			      _calculate_measured_qd_cpl);
1702 	return 0;
1703 }
1704 
1705 void
1706 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
1707 {
1708 	bdev->internal.period = period;
1709 
1710 	if (bdev->internal.qd_poller != NULL) {
1711 		spdk_poller_unregister(&bdev->internal.qd_poller);
1712 		bdev->internal.measured_queue_depth = UINT64_MAX;
1713 	}
1714 
1715 	if (period != 0) {
1716 		bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev,
1717 					   period);
1718 	}
1719 }
1720 
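/*
 * Usage sketch (illustrative): enable queue-depth sampling with a 1000 usec
 * period (the value is passed straight to spdk_poller_register() above, so it
 * is in microseconds), then read the most recent measurement later. After the
 * sampling poller is unregistered, the measured depth is reset to UINT64_MAX.
 *
 *	spdk_bdev_set_qd_sampling_period(bdev, 1000);
 *	...
 *	uint64_t qd = spdk_bdev_get_qd(bdev);
 */
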
1721 int
1722 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
1723 {
1724 	int ret;
1725 
1726 	pthread_mutex_lock(&bdev->internal.mutex);
1727 
1728 	/* bdev has open descriptors */
1729 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
1730 	    bdev->blockcnt > size) {
1731 		ret = -EBUSY;
1732 	} else {
1733 		bdev->blockcnt = size;
1734 		ret = 0;
1735 	}
1736 
1737 	pthread_mutex_unlock(&bdev->internal.mutex);
1738 
1739 	return ret;
1740 }
1741 
1742 /*
1743  * Convert I/O offset and length from bytes to blocks.
1744  *
1745  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
1746  */
1747 static uint64_t
1748 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
1749 			  uint64_t num_bytes, uint64_t *num_blocks)
1750 {
1751 	uint32_t block_size = bdev->blocklen;
1752 
1753 	*offset_blocks = offset_bytes / block_size;
1754 	*num_blocks = num_bytes / block_size;
1755 
1756 	return (offset_bytes % block_size) | (num_bytes % block_size);
1757 }
1758 
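/*
 * Worked example: with blocklen = 512, offset_bytes = 4096 and num_bytes = 8192
 * yield offset_blocks = 8 and num_blocks = 16, and the function returns 0.
 * offset_bytes = 4097 would return a non-zero value (4097 % 512 == 1), so the
 * byte-based wrappers below reject it with -EINVAL.
 */
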
1759 static bool
1760 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
1761 {
1762 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; this indicates
1763 	 * an overflow and hence that the offset has wrapped around */
1764 	if (offset_blocks + num_blocks < offset_blocks) {
1765 		return false;
1766 	}
1767 
1768 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
1769 	if (offset_blocks + num_blocks > bdev->blockcnt) {
1770 		return false;
1771 	}
1772 
1773 	return true;
1774 }
1775 
1776 int
1777 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1778 	       void *buf, uint64_t offset, uint64_t nbytes,
1779 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
1780 {
1781 	uint64_t offset_blocks, num_blocks;
1782 
1783 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1784 		return -EINVAL;
1785 	}
1786 
1787 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
1788 }
1789 
1790 int
1791 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1792 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
1793 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
1794 {
1795 	struct spdk_bdev *bdev = desc->bdev;
1796 	struct spdk_bdev_io *bdev_io;
1797 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1798 
1799 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1800 		return -EINVAL;
1801 	}
1802 
1803 	bdev_io = spdk_bdev_get_io(channel);
1804 	if (!bdev_io) {
1805 		return -ENOMEM;
1806 	}
1807 
1808 	bdev_io->internal.ch = channel;
1809 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
1810 	bdev_io->u.bdev.iovs = &bdev_io->iov;
1811 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1812 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
1813 	bdev_io->u.bdev.iovcnt = 1;
1814 	bdev_io->u.bdev.num_blocks = num_blocks;
1815 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1816 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1817 
1818 	spdk_bdev_io_submit(bdev_io);
1819 	return 0;
1820 }
1821 
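/*
 * Usage sketch (illustrative): a single-buffer read on an already opened
 * descriptor. "example_read_done", "desc", "io_ch" and "buf" are assumed to be
 * set up by the caller; the spdk_bdev_io must be released in the completion
 * callback.
 *
 *	static void
 *	example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_read_blocks(desc, io_ch, buf, 0, 8, example_read_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// queue a wait entry and retry later (see spdk_bdev_free_io above)
 *	}
 */
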
1822 int
1823 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1824 		struct iovec *iov, int iovcnt,
1825 		uint64_t offset, uint64_t nbytes,
1826 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1827 {
1828 	uint64_t offset_blocks, num_blocks;
1829 
1830 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1831 		return -EINVAL;
1832 	}
1833 
1834 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
1835 }
1836 
1837 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1838 			   struct iovec *iov, int iovcnt,
1839 			   uint64_t offset_blocks, uint64_t num_blocks,
1840 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
1841 {
1842 	struct spdk_bdev *bdev = desc->bdev;
1843 	struct spdk_bdev_io *bdev_io;
1844 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1845 
1846 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1847 		return -EINVAL;
1848 	}
1849 
1850 	bdev_io = spdk_bdev_get_io(channel);
1851 	if (!bdev_io) {
1852 		return -ENOMEM;
1853 	}
1854 
1855 	bdev_io->internal.ch = channel;
1856 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
1857 	bdev_io->u.bdev.iovs = iov;
1858 	bdev_io->u.bdev.iovcnt = iovcnt;
1859 	bdev_io->u.bdev.num_blocks = num_blocks;
1860 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1861 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1862 
1863 	spdk_bdev_io_submit(bdev_io);
1864 	return 0;
1865 }
1866 
1867 int
1868 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1869 		void *buf, uint64_t offset, uint64_t nbytes,
1870 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1871 {
1872 	uint64_t offset_blocks, num_blocks;
1873 
1874 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1875 		return -EINVAL;
1876 	}
1877 
1878 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
1879 }
1880 
1881 int
1882 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1883 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
1884 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1885 {
1886 	struct spdk_bdev *bdev = desc->bdev;
1887 	struct spdk_bdev_io *bdev_io;
1888 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1889 
1890 	if (!desc->write) {
1891 		return -EBADF;
1892 	}
1893 
1894 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1895 		return -EINVAL;
1896 	}
1897 
1898 	bdev_io = spdk_bdev_get_io(channel);
1899 	if (!bdev_io) {
1900 		return -ENOMEM;
1901 	}
1902 
1903 	bdev_io->internal.ch = channel;
1904 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
1905 	bdev_io->u.bdev.iovs = &bdev_io->iov;
1906 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1907 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
1908 	bdev_io->u.bdev.iovcnt = 1;
1909 	bdev_io->u.bdev.num_blocks = num_blocks;
1910 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1911 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1912 
1913 	spdk_bdev_io_submit(bdev_io);
1914 	return 0;
1915 }
1916 
1917 int
1918 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1919 		 struct iovec *iov, int iovcnt,
1920 		 uint64_t offset, uint64_t len,
1921 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
1922 {
1923 	uint64_t offset_blocks, num_blocks;
1924 
1925 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
1926 		return -EINVAL;
1927 	}
1928 
1929 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
1930 }
1931 
1932 int
1933 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1934 			struct iovec *iov, int iovcnt,
1935 			uint64_t offset_blocks, uint64_t num_blocks,
1936 			spdk_bdev_io_completion_cb cb, void *cb_arg)
1937 {
1938 	struct spdk_bdev *bdev = desc->bdev;
1939 	struct spdk_bdev_io *bdev_io;
1940 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1941 
1942 	if (!desc->write) {
1943 		return -EBADF;
1944 	}
1945 
1946 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1947 		return -EINVAL;
1948 	}
1949 
1950 	bdev_io = spdk_bdev_get_io(channel);
1951 	if (!bdev_io) {
1952 		return -ENOMEM;
1953 	}
1954 
1955 	bdev_io->internal.ch = channel;
1956 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
1957 	bdev_io->u.bdev.iovs = iov;
1958 	bdev_io->u.bdev.iovcnt = iovcnt;
1959 	bdev_io->u.bdev.num_blocks = num_blocks;
1960 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1961 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1962 
1963 	spdk_bdev_io_submit(bdev_io);
1964 	return 0;
1965 }
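
/*
 * Usage sketch (illustrative): a two-element scatter-gather write. The header_buf and
 * data_buf pointers and the write_complete() callback are hypothetical; both buffers
 * must be DMA-able and their combined length must equal num_blocks * blocklen.
 *
 *	struct iovec iov[2];
 *
 *	iov[0].iov_base = header_buf;
 *	iov[0].iov_len = 512;
 *	iov[1].iov_base = data_buf;
 *	iov[1].iov_len = 3584;
 *
 *	rc = spdk_bdev_writev_blocks(desc, ch, iov, 2, offset_blocks,
 *				     4096 / spdk_bdev_get_block_size(bdev),
 *				     write_complete, NULL);
 */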
1966 
1967 int
1968 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1969 		       uint64_t offset, uint64_t len,
1970 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1971 {
1972 	uint64_t offset_blocks, num_blocks;
1973 
1974 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
1975 		return -EINVAL;
1976 	}
1977 
1978 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
1979 }
1980 
1981 int
1982 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1983 			      uint64_t offset_blocks, uint64_t num_blocks,
1984 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
1985 {
1986 	struct spdk_bdev *bdev = desc->bdev;
1987 	struct spdk_bdev_io *bdev_io;
1988 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1989 	uint64_t len;
1990 	bool split_request = false;
1991 
1992 	if (!desc->write) {
1993 		return -EBADF;
1994 	}
1995 
1996 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1997 		return -EINVAL;
1998 	}
1999 
2000 	bdev_io = spdk_bdev_get_io(channel);
2001 
2002 	if (!bdev_io) {
2003 		return -ENOMEM;
2004 	}
2005 
2006 	bdev_io->internal.ch = channel;
2007 	bdev_io->u.bdev.offset_blocks = offset_blocks;
2008 
2009 	if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
2010 		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
2011 		bdev_io->u.bdev.num_blocks = num_blocks;
2012 		bdev_io->u.bdev.iovs = NULL;
2013 		bdev_io->u.bdev.iovcnt = 0;
2014 
2015 	} else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
2016 		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);
2017 
2018 		len = spdk_bdev_get_block_size(bdev) * num_blocks;
2019 
2020 		if (len > ZERO_BUFFER_SIZE) {
2021 			split_request = true;
2022 			len = ZERO_BUFFER_SIZE;
2023 		}
2024 
2025 		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
2026 		bdev_io->u.bdev.iovs = &bdev_io->iov;
2027 		bdev_io->u.bdev.iovs[0].iov_base = g_bdev_mgr.zero_buffer;
2028 		bdev_io->u.bdev.iovs[0].iov_len = len;
2029 		bdev_io->u.bdev.iovcnt = 1;
2030 		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
2031 		bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
2032 		bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
2033 	} else {
2034 		spdk_bdev_free_io(bdev_io);
2035 		return -ENOTSUP;
2036 	}
2037 
2038 	if (split_request) {
2039 		bdev_io->u.bdev.stored_user_cb = cb;
2040 		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
2041 	} else {
2042 		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2043 	}
2044 	spdk_bdev_io_submit(bdev_io);
2045 	return 0;
2046 }
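
/*
 * Illustrative note on the fallback path above: on a bdev with 512-byte blocks and no
 * native WRITE ZEROES support, zeroing 4096 blocks (2 MiB) exceeds the 1 MiB zero
 * buffer, so the request is issued as two chained 1 MiB writes driven by
 * spdk_bdev_write_zeroes_split(). The caller's side is unchanged:
 *
 *	rc = spdk_bdev_write_zeroes_blocks(desc, ch, 0, 4096, zeroes_done, NULL);
 *
 * where zeroes_done is a hypothetical spdk_bdev_io_completion_cb.
 */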
2047 
2048 int
2049 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2050 		uint64_t offset, uint64_t nbytes,
2051 		spdk_bdev_io_completion_cb cb, void *cb_arg)
2052 {
2053 	uint64_t offset_blocks, num_blocks;
2054 
2055 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
2056 		return -EINVAL;
2057 	}
2058 
2059 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
2060 }
2061 
2062 int
2063 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2064 		       uint64_t offset_blocks, uint64_t num_blocks,
2065 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
2066 {
2067 	struct spdk_bdev *bdev = desc->bdev;
2068 	struct spdk_bdev_io *bdev_io;
2069 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2070 
2071 	if (!desc->write) {
2072 		return -EBADF;
2073 	}
2074 
2075 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
2076 		return -EINVAL;
2077 	}
2078 
2079 	if (num_blocks == 0) {
2080 		SPDK_ERRLOG("Can't unmap 0 bytes\n");
2081 		return -EINVAL;
2082 	}
2083 
2084 	bdev_io = spdk_bdev_get_io(channel);
2085 	if (!bdev_io) {
2086 		return -ENOMEM;
2087 	}
2088 
2089 	bdev_io->internal.ch = channel;
2090 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
2091 
2092 	bdev_io->u.bdev.iovs = &bdev_io->iov;
2093 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
2094 	bdev_io->u.bdev.iovs[0].iov_len = 0;
2095 	bdev_io->u.bdev.iovcnt = 1;
2096 
2097 	bdev_io->u.bdev.offset_blocks = offset_blocks;
2098 	bdev_io->u.bdev.num_blocks = num_blocks;
2099 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2100 
2101 	spdk_bdev_io_submit(bdev_io);
2102 	return 0;
2103 }
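
/*
 * Usage sketch (illustrative; unmap_done() is hypothetical): deallocate the first 1024
 * blocks of the bdev. Whether unmapped blocks read back as zeroes is backend dependent.
 * An -EINVAL return means the range is out of bounds or num_blocks was 0.
 *
 *	rc = spdk_bdev_unmap_blocks(desc, ch, 0, 1024, unmap_done, NULL);
 */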
2104 
2105 int
2106 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2107 		uint64_t offset, uint64_t length,
2108 		spdk_bdev_io_completion_cb cb, void *cb_arg)
2109 {
2110 	uint64_t offset_blocks, num_blocks;
2111 
2112 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
2113 		return -EINVAL;
2114 	}
2115 
2116 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
2117 }
2118 
2119 int
2120 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2121 		       uint64_t offset_blocks, uint64_t num_blocks,
2122 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
2123 {
2124 	struct spdk_bdev *bdev = desc->bdev;
2125 	struct spdk_bdev_io *bdev_io;
2126 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2127 
2128 	if (!desc->write) {
2129 		return -EBADF;
2130 	}
2131 
2132 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
2133 		return -EINVAL;
2134 	}
2135 
2136 	bdev_io = spdk_bdev_get_io(channel);
2137 	if (!bdev_io) {
2138 		return -ENOMEM;
2139 	}
2140 
2141 	bdev_io->internal.ch = channel;
2142 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
2143 	bdev_io->u.bdev.iovs = NULL;
2144 	bdev_io->u.bdev.iovcnt = 0;
2145 	bdev_io->u.bdev.offset_blocks = offset_blocks;
2146 	bdev_io->u.bdev.num_blocks = num_blocks;
2147 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2148 
2149 	spdk_bdev_io_submit(bdev_io);
2150 	return 0;
2151 }
2152 
2153 static void
2154 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
2155 {
2156 	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
2157 	struct spdk_bdev_io *bdev_io;
2158 
2159 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
2160 	TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
2161 	spdk_bdev_io_submit_reset(bdev_io);
2162 }
2163 
2164 static void
2165 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
2166 {
2167 	struct spdk_io_channel		*ch;
2168 	struct spdk_bdev_channel	*channel;
2169 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
2170 	struct spdk_bdev_shared_resource *shared_resource;
2171 	bdev_io_tailq_t			tmp_queued;
2172 
2173 	TAILQ_INIT(&tmp_queued);
2174 
2175 	ch = spdk_io_channel_iter_get_channel(i);
2176 	channel = spdk_io_channel_get_ctx(ch);
2177 	shared_resource = channel->shared_resource;
2178 	mgmt_channel = shared_resource->mgmt_ch;
2179 
2180 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
2181 
2182 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
2183 		/* The QoS object is always valid and readable while
2184 		 * the channel flag is set, so the lock here should not
2185 		 * be necessary. We're not in the fast path though, so
2186 		 * just take it anyway. */
2187 		pthread_mutex_lock(&channel->bdev->internal.mutex);
2188 		if (channel->bdev->internal.qos->ch == channel) {
2189 			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
2190 		}
2191 		pthread_mutex_unlock(&channel->bdev->internal.mutex);
2192 	}
2193 
2194 	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel);
2195 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
2196 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
2197 	_spdk_bdev_abort_queued_io(&tmp_queued, channel);
2198 
2199 	spdk_for_each_channel_continue(i, 0);
2200 }
2201 
2202 static void
2203 _spdk_bdev_start_reset(void *ctx)
2204 {
2205 	struct spdk_bdev_channel *ch = ctx;
2206 
2207 	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
2208 			      ch, _spdk_bdev_reset_dev);
2209 }
2210 
2211 static void
2212 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
2213 {
2214 	struct spdk_bdev *bdev = ch->bdev;
2215 
2216 	assert(!TAILQ_EMPTY(&ch->queued_resets));
2217 
2218 	pthread_mutex_lock(&bdev->internal.mutex);
2219 	if (bdev->internal.reset_in_progress == NULL) {
2220 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
2221 		/*
2222 		 * Take a channel reference for the target bdev for the life of this
2223 		 *  reset.  This guards against the channel getting destroyed while
2224 		 *  spdk_for_each_channel() calls related to this reset IO are in
2225 		 *  progress.  We will release the reference when this reset is
2226 		 *  completed.
2227 		 */
2228 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
2229 		_spdk_bdev_start_reset(ch);
2230 	}
2231 	pthread_mutex_unlock(&bdev->internal.mutex);
2232 }
2233 
2234 int
2235 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2236 		spdk_bdev_io_completion_cb cb, void *cb_arg)
2237 {
2238 	struct spdk_bdev *bdev = desc->bdev;
2239 	struct spdk_bdev_io *bdev_io;
2240 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2241 
2242 	bdev_io = spdk_bdev_get_io(channel);
2243 	if (!bdev_io) {
2244 		return -ENOMEM;
2245 	}
2246 
2247 	bdev_io->internal.ch = channel;
2248 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
2249 	bdev_io->u.reset.ch_ref = NULL;
2250 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2251 
2252 	pthread_mutex_lock(&bdev->internal.mutex);
2253 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
2254 	pthread_mutex_unlock(&bdev->internal.mutex);
2255 
2256 	_spdk_bdev_channel_start_reset(channel);
2257 
2258 	return 0;
2259 }
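
/*
 * Usage sketch (illustrative; reset_done() is hypothetical). A reset freezes every
 * channel of the bdev, aborts I/O still queued inside the bdev layer, and is then
 * forwarded to the module. Only one reset per bdev runs at a time; later resets are
 * queued behind it.
 *
 *	rc = spdk_bdev_reset(desc, ch, reset_done, NULL);
 */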
2260 
2261 void
2262 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
2263 		      struct spdk_bdev_io_stat *stat)
2264 {
2265 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2266 
2267 	*stat = channel->stat;
2268 }
2269 
2270 static void
2271 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status)
2272 {
2273 	void *io_device = spdk_io_channel_iter_get_io_device(i);
2274 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
2275 
2276 	bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat,
2277 			    bdev_iostat_ctx->cb_arg, 0);
2278 	free(bdev_iostat_ctx);
2279 }
2280 
2281 static void
2282 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i)
2283 {
2284 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
2285 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
2286 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2287 
2288 	_spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat);
2289 	spdk_for_each_channel_continue(i, 0);
2290 }
2291 
2292 void
2293 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
2294 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
2295 {
2296 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
2297 
2298 	assert(bdev != NULL);
2299 	assert(stat != NULL);
2300 	assert(cb != NULL);
2301 
2302 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
2303 	if (bdev_iostat_ctx == NULL) {
2304 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
2305 		cb(bdev, stat, cb_arg, -ENOMEM);
2306 		return;
2307 	}
2308 
2309 	bdev_iostat_ctx->stat = stat;
2310 	bdev_iostat_ctx->cb = cb;
2311 	bdev_iostat_ctx->cb_arg = cb_arg;
2312 
2313 	/* Start with the statistics from previously deleted channels. */
2314 	pthread_mutex_lock(&bdev->internal.mutex);
2315 	_spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat);
2316 	pthread_mutex_unlock(&bdev->internal.mutex);
2317 
2318 	/* Then iterate and add the statistics from each existing channel. */
2319 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
2320 			      _spdk_bdev_get_each_channel_stat,
2321 			      bdev_iostat_ctx,
2322 			      _spdk_bdev_get_device_stat_done);
2323 }
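
/*
 * Usage sketch (illustrative; stat_done() and the heap-allocated stat buffer are
 * assumptions). The callback runs once every channel has been visited, so the stat
 * buffer must remain valid until then:
 *
 *	static void
 *	stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, void *cb_arg, int rc)
 *	{
 *		if (rc == 0) {
 *			printf("%s: %" PRIu64 " reads, %" PRIu64 " writes\n",
 *			       spdk_bdev_get_name(bdev), stat->num_read_ops, stat->num_write_ops);
 *		}
 *		free(stat);
 *	}
 *
 *	struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));
 *	spdk_bdev_get_device_stat(bdev, stat, stat_done, NULL);
 */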
2324 
2325 int
2326 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2327 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
2328 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
2329 {
2330 	struct spdk_bdev *bdev = desc->bdev;
2331 	struct spdk_bdev_io *bdev_io;
2332 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2333 
2334 	if (!desc->write) {
2335 		return -EBADF;
2336 	}
2337 
2338 	bdev_io = spdk_bdev_get_io(channel);
2339 	if (!bdev_io) {
2340 		return -ENOMEM;
2341 	}
2342 
2343 	bdev_io->internal.ch = channel;
2344 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
2345 	bdev_io->u.nvme_passthru.cmd = *cmd;
2346 	bdev_io->u.nvme_passthru.buf = buf;
2347 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2348 	bdev_io->u.nvme_passthru.md_buf = NULL;
2349 	bdev_io->u.nvme_passthru.md_len = 0;
2350 
2351 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2352 
2353 	spdk_bdev_io_submit(bdev_io);
2354 	return 0;
2355 }
2356 
2357 int
2358 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2359 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
2360 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
2361 {
2362 	struct spdk_bdev *bdev = desc->bdev;
2363 	struct spdk_bdev_io *bdev_io;
2364 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2365 
2366 	if (!desc->write) {
2367 		/*
2368 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
2369 		 *  to easily determine if the command is a read or write, but for now just
2370 		 *  do not allow io_passthru with a read-only descriptor.
2371 		 */
2372 		return -EBADF;
2373 	}
2374 
2375 	bdev_io = spdk_bdev_get_io(channel);
2376 	if (!bdev_io) {
2377 		return -ENOMEM;
2378 	}
2379 
2380 	bdev_io->internal.ch = channel;
2381 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
2382 	bdev_io->u.nvme_passthru.cmd = *cmd;
2383 	bdev_io->u.nvme_passthru.buf = buf;
2384 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2385 	bdev_io->u.nvme_passthru.md_buf = NULL;
2386 	bdev_io->u.nvme_passthru.md_len = 0;
2387 
2388 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2389 
2390 	spdk_bdev_io_submit(bdev_io);
2391 	return 0;
2392 }
2393 
2394 int
2395 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2396 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
2397 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
2398 {
2399 	struct spdk_bdev *bdev = desc->bdev;
2400 	struct spdk_bdev_io *bdev_io;
2401 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2402 
2403 	if (!desc->write) {
2404 		/*
2405 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
2406 		 *  to easily determine if the command is a read or write, but for now just
2407 		 *  do not allow io_passthru with a read-only descriptor.
2408 		 */
2409 		return -EBADF;
2410 	}
2411 
2412 	bdev_io = spdk_bdev_get_io(channel);
2413 	if (!bdev_io) {
2414 		return -ENOMEM;
2415 	}
2416 
2417 	bdev_io->internal.ch = channel;
2418 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
2419 	bdev_io->u.nvme_passthru.cmd = *cmd;
2420 	bdev_io->u.nvme_passthru.buf = buf;
2421 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2422 	bdev_io->u.nvme_passthru.md_buf = md_buf;
2423 	bdev_io->u.nvme_passthru.md_len = md_len;
2424 
2425 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2426 
2427 	spdk_bdev_io_submit(bdev_io);
2428 	return 0;
2429 }
2430 
2431 int
2432 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
2433 			struct spdk_bdev_io_wait_entry *entry)
2434 {
2435 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2436 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
2437 
2438 	if (bdev != entry->bdev) {
2439 		SPDK_ERRLOG("bdevs do not match\n");
2440 		return -EINVAL;
2441 	}
2442 
2443 	if (mgmt_ch->per_thread_cache_count > 0) {
2444 		SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
2445 		return -EINVAL;
2446 	}
2447 
2448 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
2449 	return 0;
2450 }
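
/*
 * Usage sketch for the -ENOMEM retry pattern (illustrative; struct my_ctx and
 * resubmit_read() are assumptions). The wait entry carries the bdev plus a callback
 * that re-attempts the submission once a spdk_bdev_io is returned to this thread:
 *
 *	struct my_ctx {
 *		struct spdk_bdev_io_wait_entry	bdev_io_wait;
 *		struct spdk_bdev_desc		*desc;
 *		struct spdk_io_channel		*ch;
 *	};
 *
 *	static void
 *	resubmit_read(void *arg)
 *	{
 *		struct my_ctx *ctx = arg;
 *
 *		(re-issue the original spdk_bdev_read_blocks() call here)
 *	}
 *
 *	if (rc == -ENOMEM) {
 *		ctx->bdev_io_wait.bdev = bdev;
 *		ctx->bdev_io_wait.cb_fn = resubmit_read;
 *		ctx->bdev_io_wait.cb_arg = ctx;
 *		spdk_bdev_queue_io_wait(bdev, ctx->ch, &ctx->bdev_io_wait);
 *	}
 */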
2451 
2452 static void
2453 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
2454 {
2455 	struct spdk_bdev *bdev = bdev_ch->bdev;
2456 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2457 	struct spdk_bdev_io *bdev_io;
2458 
2459 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
2460 		/*
2461 		 * Allow some more I/O to complete before retrying the nomem_io queue.
2462 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
2463 		 *  the context of a completion, because the resources for the I/O are
2464 		 *  not released until control returns to the bdev poller.  Also, we
2465 		 *  may require several small I/O to complete before a larger I/O
2466 		 *  (that requires splitting) can be submitted.
2467 		 */
2468 		return;
2469 	}
2470 
2471 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
2472 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
2473 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
2474 		bdev_io->internal.ch->io_outstanding++;
2475 		shared_resource->io_outstanding++;
2476 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
2477 		bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io);
2478 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
2479 			break;
2480 		}
2481 	}
2482 }
2483 
2484 static inline void
2485 _spdk_bdev_io_complete(void *ctx)
2486 {
2487 	struct spdk_bdev_io *bdev_io = ctx;
2488 
2489 	if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) {
2490 		/*
2491 		 * Send the completion to the thread that originally submitted the I/O,
2492 		 * which may not be the current thread in the case of QoS.
2493 		 */
2494 		if (bdev_io->internal.io_submit_ch) {
2495 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
2496 			bdev_io->internal.io_submit_ch = NULL;
2497 		}
2498 
2499 		/*
2500 		 * Defer completion to avoid potential infinite recursion if the
2501 		 * user's completion callback issues a new I/O.
2502 		 */
2503 		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel),
2504 				     _spdk_bdev_io_complete, bdev_io);
2505 		return;
2506 	}
2507 
2508 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
2509 		switch (bdev_io->type) {
2510 		case SPDK_BDEV_IO_TYPE_READ:
2511 			bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2512 			bdev_io->internal.ch->stat.num_read_ops++;
2513 			bdev_io->internal.ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->internal.submit_tsc);
2514 			break;
2515 		case SPDK_BDEV_IO_TYPE_WRITE:
2516 			bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2517 			bdev_io->internal.ch->stat.num_write_ops++;
2518 			bdev_io->internal.ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->internal.submit_tsc);
2519 			break;
2520 		default:
2521 			break;
2522 		}
2523 	}
2524 
2525 #ifdef SPDK_CONFIG_VTUNE
2526 	uint64_t now_tsc = spdk_get_ticks();
2527 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
2528 		uint64_t data[5];
2529 
2530 		data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops;
2531 		data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read;
2532 		data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops;
2533 		data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written;
2534 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
2535 			  bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0;
2536 
2537 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
2538 				   __itt_metadata_u64, 5, data);
2539 
2540 		bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat;
2541 		bdev_io->internal.ch->start_tsc = now_tsc;
2542 	}
2543 #endif
2544 
2545 	assert(bdev_io->internal.cb != NULL);
2546 	assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel));
2547 
2548 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
2549 			     bdev_io->internal.caller_ctx);
2550 }
2551 
2552 static void
2553 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
2554 {
2555 	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
2556 
2557 	if (bdev_io->u.reset.ch_ref != NULL) {
2558 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
2559 		bdev_io->u.reset.ch_ref = NULL;
2560 	}
2561 
2562 	_spdk_bdev_io_complete(bdev_io);
2563 }
2564 
2565 static void
2566 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
2567 {
2568 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2569 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
2570 
2571 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
2572 	if (!TAILQ_EMPTY(&ch->queued_resets)) {
2573 		_spdk_bdev_channel_start_reset(ch);
2574 	}
2575 
2576 	spdk_for_each_channel_continue(i, 0);
2577 }
2578 
2579 void
2580 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
2581 {
2582 	struct spdk_bdev *bdev = bdev_io->bdev;
2583 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
2584 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2585 
2586 	bdev_io->internal.status = status;
2587 
2588 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
2589 		bool unlock_channels = false;
2590 
2591 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
2592 			SPDK_ERRLOG("NOMEM returned for reset\n");
2593 		}
2594 		pthread_mutex_lock(&bdev->internal.mutex);
2595 		if (bdev_io == bdev->internal.reset_in_progress) {
2596 			bdev->internal.reset_in_progress = NULL;
2597 			unlock_channels = true;
2598 		}
2599 		pthread_mutex_unlock(&bdev->internal.mutex);
2600 
2601 		if (unlock_channels) {
2602 			spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel,
2603 					      bdev_io, _spdk_bdev_reset_complete);
2604 			return;
2605 		}
2606 	} else {
2607 		assert(bdev_ch->io_outstanding > 0);
2608 		assert(shared_resource->io_outstanding > 0);
2609 		bdev_ch->io_outstanding--;
2610 		shared_resource->io_outstanding--;
2611 
2612 		if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) {
2613 			TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
2614 			/*
2615 			 * Wait for some of the outstanding I/O to complete before we
2616 			 *  retry any of the nomem_io.  Normally we will wait for
2617 			 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
2618 			 *  depth channels we will instead wait for half to complete.
2619 			 */
2620 			shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
2621 							   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
2622 			return;
2623 		}
2624 
2625 		if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
2626 			_spdk_bdev_ch_retry_io(bdev_ch);
2627 		}
2628 	}
2629 
2630 	_spdk_bdev_io_complete(bdev_io);
2631 }
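
/*
 * Module-side sketch (illustrative; my_backend_done() is hypothetical): a backend's own
 * completion handler maps its result onto a bdev status and calls
 * spdk_bdev_io_complete(). Completing with SPDK_BDEV_IO_STATUS_NOMEM makes the bdev
 * layer park the I/O on the nomem_io queue and retry it later, as handled above.
 *
 *	static void
 *	my_backend_done(void *cb_arg, int backend_rc)
 *	{
 *		struct spdk_bdev_io *bdev_io = cb_arg;
 *
 *		spdk_bdev_io_complete(bdev_io, backend_rc == 0 ?
 *				      SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
 *	}
 */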
2632 
2633 void
2634 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
2635 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
2636 {
2637 	if (sc == SPDK_SCSI_STATUS_GOOD) {
2638 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
2639 	} else {
2640 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
2641 		bdev_io->internal.error.scsi.sc = sc;
2642 		bdev_io->internal.error.scsi.sk = sk;
2643 		bdev_io->internal.error.scsi.asc = asc;
2644 		bdev_io->internal.error.scsi.ascq = ascq;
2645 	}
2646 
2647 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
2648 }
2649 
2650 void
2651 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
2652 			     int *sc, int *sk, int *asc, int *ascq)
2653 {
2654 	assert(sc != NULL);
2655 	assert(sk != NULL);
2656 	assert(asc != NULL);
2657 	assert(ascq != NULL);
2658 
2659 	switch (bdev_io->internal.status) {
2660 	case SPDK_BDEV_IO_STATUS_SUCCESS:
2661 		*sc = SPDK_SCSI_STATUS_GOOD;
2662 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
2663 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
2664 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
2665 		break;
2666 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
2667 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
2668 		break;
2669 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
2670 		*sc = bdev_io->internal.error.scsi.sc;
2671 		*sk = bdev_io->internal.error.scsi.sk;
2672 		*asc = bdev_io->internal.error.scsi.asc;
2673 		*ascq = bdev_io->internal.error.scsi.ascq;
2674 		break;
2675 	default:
2676 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
2677 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
2678 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
2679 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
2680 		break;
2681 	}
2682 }
2683 
2684 void
2685 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
2686 {
2687 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
2688 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
2689 	} else {
2690 		bdev_io->internal.error.nvme.sct = sct;
2691 		bdev_io->internal.error.nvme.sc = sc;
2692 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
2693 	}
2694 
2695 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
2696 }
2697 
2698 void
2699 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
2700 {
2701 	assert(sct != NULL);
2702 	assert(sc != NULL);
2703 
2704 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
2705 		*sct = bdev_io->internal.error.nvme.sct;
2706 		*sc = bdev_io->internal.error.nvme.sc;
2707 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
2708 		*sct = SPDK_NVME_SCT_GENERIC;
2709 		*sc = SPDK_NVME_SC_SUCCESS;
2710 	} else {
2711 		*sct = SPDK_NVME_SCT_GENERIC;
2712 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
2713 	}
2714 }
2715 
2716 struct spdk_thread *
2717 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
2718 {
2719 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
2720 }
2721 
2722 static void
2723 _spdk_bdev_qos_config_type(struct spdk_bdev *bdev, uint64_t qos_set,
2724 			   enum spdk_bdev_qos_type qos_type)
2725 {
2726 	uint64_t	min_qos_set = 0;
2727 
2728 	switch (qos_type) {
2729 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2730 		min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
2731 		break;
2732 	case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT:
2733 		min_qos_set = SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC;
2734 		break;
2735 	default:
2736 		SPDK_ERRLOG("Unsupported QoS type.\n");
2737 		return;
2738 	}
2739 
2740 	if (qos_set % min_qos_set) {
2741 		SPDK_ERRLOG("Assigned QoS %" PRIu64 " on bdev %s is not multiple of %lu\n",
2742 			    qos_set, bdev->name, min_qos_set);
2743 		SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name);
2744 		return;
2745 	}
2746 
2747 	if (!bdev->internal.qos) {
2748 		bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
2749 		if (!bdev->internal.qos) {
2750 			SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
2751 			return;
2752 		}
2753 	}
2754 
2755 	switch (qos_type) {
2756 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2757 		bdev->internal.qos->iops_rate_limit = qos_set;
2758 		break;
2759 	case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT:
2760 		bdev->internal.qos->byte_rate_limit = qos_set * 1024 * 1024;
2761 		break;
2762 	default:
2763 		break;
2764 	}
2765 
2766 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%" PRIu64 "\n",
2767 		      bdev->name, qos_type, qos_set);
2768 
2769 	return;
2770 }
2771 
2772 static void
2773 _spdk_bdev_qos_config(struct spdk_bdev *bdev)
2774 {
2775 	struct spdk_conf_section	*sp = NULL;
2776 	const char			*val = NULL;
2777 	uint64_t			qos_set = 0;
2778 	int				i = 0, j = 0;
2779 
2780 	sp = spdk_conf_find_section(NULL, "QoS");
2781 	if (!sp) {
2782 		return;
2783 	}
2784 
2785 	while (j < SPDK_BDEV_QOS_NUM_TYPES) {
2786 		i = 0;
2787 		while (true) {
2788 			val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 0);
2789 			if (!val) {
2790 				break;
2791 			}
2792 
2793 			if (strcmp(bdev->name, val) != 0) {
2794 				i++;
2795 				continue;
2796 			}
2797 
2798 			val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 1);
2799 			if (val) {
2800 				qos_set = strtoull(val, NULL, 10);
2801 				_spdk_bdev_qos_config_type(bdev, qos_set, j);
2802 			}
2803 
2804 			break;
2805 		}
2806 
2807 		j++;
2808 	}
2809 
2810 	return;
2811 }
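
/*
 * Illustrative configuration snippet matching the parsing above (the bdev name
 * "Malloc0" is hypothetical). Values must be multiples of the per-type minimums
 * enforced in _spdk_bdev_qos_config_type(): 10000 IOPS and 10 MB/s.
 *
 *	[QoS]
 *	  Limit_IOPS Malloc0 20000
 *	  Limit_BWPS Malloc0 100
 */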
2812 
2813 static int
2814 spdk_bdev_init(struct spdk_bdev *bdev)
2815 {
2816 	assert(bdev->module != NULL);
2817 
2818 	if (!bdev->name) {
2819 		SPDK_ERRLOG("Bdev name is NULL\n");
2820 		return -EINVAL;
2821 	}
2822 
2823 	if (spdk_bdev_get_by_name(bdev->name)) {
2824 		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
2825 		return -EEXIST;
2826 	}
2827 
2828 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
2829 	bdev->internal.measured_queue_depth = UINT64_MAX;
2830 
2831 	TAILQ_INIT(&bdev->internal.open_descs);
2832 
2833 	TAILQ_INIT(&bdev->aliases);
2834 
2835 	bdev->internal.reset_in_progress = NULL;
2836 
2837 	_spdk_bdev_qos_config(bdev);
2838 
2839 	spdk_io_device_register(__bdev_to_io_dev(bdev),
2840 				spdk_bdev_channel_create, spdk_bdev_channel_destroy,
2841 				sizeof(struct spdk_bdev_channel));
2842 
2843 	pthread_mutex_init(&bdev->internal.mutex, NULL);
2844 	return 0;
2845 }
2846 
2847 static void
2848 spdk_bdev_destroy_cb(void *io_device)
2849 {
2850 	int			rc;
2851 	struct spdk_bdev	*bdev;
2852 	spdk_bdev_unregister_cb	cb_fn;
2853 	void			*cb_arg;
2854 
2855 	bdev = __bdev_from_io_dev(io_device);
2856 	cb_fn = bdev->internal.unregister_cb;
2857 	cb_arg = bdev->internal.unregister_ctx;
2858 
2859 	rc = bdev->fn_table->destruct(bdev->ctxt);
2860 	if (rc < 0) {
2861 		SPDK_ERRLOG("destruct failed\n");
2862 	}
2863 	if (rc <= 0 && cb_fn != NULL) {
2864 		cb_fn(cb_arg, rc);
2865 	}
2866 }
2867 
2868 
2869 static void
2870 spdk_bdev_fini(struct spdk_bdev *bdev)
2871 {
2872 	pthread_mutex_destroy(&bdev->internal.mutex);
2873 
2874 	free(bdev->internal.qos);
2875 
2876 	spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb);
2877 }
2878 
2879 static void
2880 spdk_bdev_start(struct spdk_bdev *bdev)
2881 {
2882 	struct spdk_bdev_module *module;
2883 	uint32_t action;
2884 
2885 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
2886 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
2887 
2888 	/* Examine configuration before initializing I/O */
2889 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2890 		if (module->examine_config) {
2891 			action = module->internal.action_in_progress;
2892 			module->internal.action_in_progress++;
2893 			module->examine_config(bdev);
2894 			if (action != module->internal.action_in_progress) {
2895 				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
2896 					    module->name);
2897 			}
2898 		}
2899 	}
2900 
2901 	if (bdev->internal.claim_module) {
2902 		return;
2903 	}
2904 
2905 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2906 		if (module->examine_disk) {
2907 			module->internal.action_in_progress++;
2908 			module->examine_disk(bdev);
2909 		}
2910 	}
2911 }
2912 
2913 int
2914 spdk_bdev_register(struct spdk_bdev *bdev)
2915 {
2916 	int rc = spdk_bdev_init(bdev);
2917 
2918 	if (rc == 0) {
2919 		spdk_bdev_start(bdev);
2920 	}
2921 
2922 	return rc;
2923 }
2924 
2925 static void
2926 spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev)
2927 {
2928 	struct spdk_bdev **bdevs;
2929 	struct spdk_bdev *base;
2930 	size_t i, j, k;
2931 	bool found;
2932 
2933 	/* Iterate over base bdevs to remove vbdev from them. */
2934 	for (i = 0; i < vbdev->internal.base_bdevs_cnt; i++) {
2935 		found = false;
2936 		base = vbdev->internal.base_bdevs[i];
2937 
2938 		for (j = 0; j < base->vbdevs_cnt; j++) {
2939 			if (base->vbdevs[j] != vbdev) {
2940 				continue;
2941 			}
2942 
2943 			for (k = j; k + 1 < base->vbdevs_cnt; k++) {
2944 				base->vbdevs[k] = base->vbdevs[k + 1];
2945 			}
2946 
2947 			base->vbdevs_cnt--;
2948 			if (base->vbdevs_cnt > 0) {
2949 				bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0]));
2950 				/* It would be odd if shrinking a memory block failed. */
2951 				assert(bdevs);
2952 				base->vbdevs = bdevs;
2953 			} else {
2954 				free(base->vbdevs);
2955 				base->vbdevs = NULL;
2956 			}
2957 
2958 			found = true;
2959 			break;
2960 		}
2961 
2962 		if (!found) {
2963 			SPDK_WARNLOG("Bdev '%s' is not a base bdev of '%s'.\n", base->name, vbdev->name);
2964 		}
2965 	}
2966 
2967 	free(vbdev->internal.base_bdevs);
2968 	vbdev->internal.base_bdevs = NULL;
2969 	vbdev->internal.base_bdevs_cnt = 0;
2970 }
2971 
2972 static int
2973 spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt)
2974 {
2975 	struct spdk_bdev **vbdevs;
2976 	struct spdk_bdev *base;
2977 	size_t i;
2978 
2979 	/* Adding base bdevs isn't supported (yet?). */
2980 	assert(vbdev->internal.base_bdevs_cnt == 0);
2981 
2982 	vbdev->internal.base_bdevs = malloc(cnt * sizeof(vbdev->internal.base_bdevs[0]));
2983 	if (!vbdev->internal.base_bdevs) {
2984 		SPDK_ERRLOG("%s - malloc() failed\n", vbdev->name);
2985 		return -ENOMEM;
2986 	}
2987 
2988 	memcpy(vbdev->internal.base_bdevs, base_bdevs, cnt * sizeof(vbdev->internal.base_bdevs[0]));
2989 	vbdev->internal.base_bdevs_cnt = cnt;
2990 
2991 	/* Iterate over base bdevs to add this vbdev to them. */
2992 	for (i = 0; i < cnt; i++) {
2993 		base = vbdev->internal.base_bdevs[i];
2994 
2995 		assert(base != NULL);
2996 		assert(base->internal.claim_module != NULL);
2997 
2998 		vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0]));
2999 		if (!vbdevs) {
3000 			SPDK_ERRLOG("%s - realloc() failed\n", base->name);
3001 			spdk_vbdev_remove_base_bdevs(vbdev);
3002 			return -ENOMEM;
3003 		}
3004 
3005 		vbdevs[base->vbdevs_cnt] = vbdev;
3006 		base->vbdevs = vbdevs;
3007 		base->vbdevs_cnt++;
3008 	}
3009 
3010 	return 0;
3011 }
3012 
3013 int
3014 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
3015 {
3016 	int rc;
3017 
3018 	rc = spdk_bdev_init(vbdev);
3019 	if (rc) {
3020 		return rc;
3021 	}
3022 
3023 	if (base_bdev_count == 0) {
3024 		spdk_bdev_start(vbdev);
3025 		return 0;
3026 	}
3027 
3028 	rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count);
3029 	if (rc) {
3030 		spdk_bdev_fini(vbdev);
3031 		return rc;
3032 	}
3033 
3034 	spdk_bdev_start(vbdev);
3035 	return 0;
3036 
3037 }
3038 
3039 void
3040 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
3041 {
3042 	if (bdev->internal.unregister_cb != NULL) {
3043 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
3044 	}
3045 }
3046 
3047 static void
3048 _remove_notify(void *arg)
3049 {
3050 	struct spdk_bdev_desc *desc = arg;
3051 
3052 	desc->remove_cb(desc->remove_ctx);
3053 }
3054 
3055 void
3056 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
3057 {
3058 	struct spdk_bdev_desc	*desc, *tmp;
3059 	bool			do_destruct = true;
3060 	struct spdk_thread	*thread;
3061 
3062 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);
3063 
3064 	thread = spdk_get_thread();
3065 	if (!thread) {
3066 		/* The user called this from a non-SPDK thread. */
3067 		if (cb_fn != NULL) {
3068 			cb_fn(cb_arg, -ENOTSUP);
3069 		}
3070 		return;
3071 	}
3072 
3073 	pthread_mutex_lock(&bdev->internal.mutex);
3074 
3075 	spdk_vbdev_remove_base_bdevs(bdev);
3076 
3077 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
3078 	bdev->internal.unregister_cb = cb_fn;
3079 	bdev->internal.unregister_ctx = cb_arg;
3080 
3081 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
3082 		if (desc->remove_cb) {
3083 			do_destruct = false;
3084 			/*
3085 			 * Defer invocation of the remove_cb to a separate message that will
3086 			 *  run later on this thread.  This ensures this context unwinds and
3087 			 *  we don't recursively unregister this bdev again if the remove_cb
3088 			 *  immediately closes its descriptor.
3089 			 */
3090 			if (!desc->remove_scheduled) {
3091 				/* Avoid scheduling removal of the same descriptor multiple times. */
3092 				desc->remove_scheduled = true;
3093 				spdk_thread_send_msg(thread, _remove_notify, desc);
3094 			}
3095 		}
3096 	}
3097 
3098 	if (!do_destruct) {
3099 		pthread_mutex_unlock(&bdev->internal.mutex);
3100 		return;
3101 	}
3102 
3103 	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
3104 	pthread_mutex_unlock(&bdev->internal.mutex);
3105 
3106 	spdk_bdev_fini(bdev);
3107 }
3108 
3109 int
3110 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
3111 	       void *remove_ctx, struct spdk_bdev_desc **_desc)
3112 {
3113 	struct spdk_bdev_desc *desc;
3114 
3115 	desc = calloc(1, sizeof(*desc));
3116 	if (desc == NULL) {
3117 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
3118 		return -ENOMEM;
3119 	}
3120 
3121 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
3122 		      spdk_get_thread());
3123 
3124 	pthread_mutex_lock(&bdev->internal.mutex);
3125 
3126 	if (write && bdev->internal.claim_module) {
3127 		SPDK_ERRLOG("Could not open %s - %s module already claimed it\n",
3128 			    bdev->name, bdev->internal.claim_module->name);
3129 		free(desc);
3130 		pthread_mutex_unlock(&bdev->internal.mutex);
3131 		return -EPERM;
3132 	}
3133 
3134 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
3135 
3136 	desc->bdev = bdev;
3137 	desc->remove_cb = remove_cb;
3138 	desc->remove_ctx = remove_ctx;
3139 	desc->write = write;
3140 	*_desc = desc;
3141 
3142 	pthread_mutex_unlock(&bdev->internal.mutex);
3143 
3144 	return 0;
3145 }
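
/*
 * Usage sketch (illustrative; "Malloc0", g_desc and hot_remove_cb() are assumptions).
 * The remove callback fires if the bdev is unregistered while the descriptor is open;
 * closing the descriptor from that callback is the expected response and lets the
 * deferred unregistration finish. After opening, per-thread I/O channels are obtained
 * with spdk_bdev_get_io_channel().
 *
 *	static struct spdk_bdev_desc *g_desc;
 *
 *	static void
 *	hot_remove_cb(void *remove_ctx)
 *	{
 *		spdk_bdev_close(g_desc);
 *		g_desc = NULL;
 *	}
 *
 *	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *	int rc = spdk_bdev_open(bdev, true, hot_remove_cb, NULL, &g_desc);
 */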
3146 
3147 void
3148 spdk_bdev_close(struct spdk_bdev_desc *desc)
3149 {
3150 	struct spdk_bdev *bdev = desc->bdev;
3151 	bool do_unregister = false;
3152 
3153 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
3154 		      spdk_get_thread());
3155 
3156 	pthread_mutex_lock(&bdev->internal.mutex);
3157 
3158 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
3159 	free(desc);
3160 
3161 	/* If no more descriptors, kill QoS channel */
3162 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
3163 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
3164 			      bdev->name, spdk_get_thread());
3165 
3166 		if (spdk_bdev_qos_destroy(bdev)) {
3167 			/* There isn't anything we can do to recover here. Just let the
3168 			 * old QoS poller keep running. The QoS handling won't change
3169 			 * cores when the user allocates a new channel, but it won't break. */
3170 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
3171 		}
3172 	}
3173 
3174 	spdk_bdev_set_qd_sampling_period(bdev, 0);
3175 
3176 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
3177 		do_unregister = true;
3178 	}
3179 	pthread_mutex_unlock(&bdev->internal.mutex);
3180 
3181 	if (do_unregister == true) {
3182 		spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx);
3183 	}
3184 }
3185 
3186 int
3187 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
3188 			    struct spdk_bdev_module *module)
3189 {
3190 	if (bdev->internal.claim_module != NULL) {
3191 		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
3192 			    bdev->internal.claim_module->name);
3193 		return -EPERM;
3194 	}
3195 
3196 	if (desc && !desc->write) {
3197 		desc->write = true;
3198 	}
3199 
3200 	bdev->internal.claim_module = module;
3201 	return 0;
3202 }
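
/*
 * Illustrative sketch for a virtual bdev module (my_module, base_bdev and base_desc are
 * hypothetical): claiming the base bdev prevents any other writer from opening it, and
 * the claim is dropped again with spdk_bdev_module_release_bdev() during teardown.
 *
 *	rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, &my_module);
 *	if (rc == -EPERM) {
 *		SPDK_ERRLOG("base bdev %s is already claimed\n", spdk_bdev_get_name(base_bdev));
 *	}
 */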
3203 
3204 void
3205 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
3206 {
3207 	assert(bdev->internal.claim_module != NULL);
3208 	bdev->internal.claim_module = NULL;
3209 }
3210 
3211 struct spdk_bdev *
3212 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
3213 {
3214 	return desc->bdev;
3215 }
3216 
3217 void
3218 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
3219 {
3220 	struct iovec *iovs;
3221 	int iovcnt;
3222 
3223 	if (bdev_io == NULL) {
3224 		return;
3225 	}
3226 
3227 	switch (bdev_io->type) {
3228 	case SPDK_BDEV_IO_TYPE_READ:
3229 		iovs = bdev_io->u.bdev.iovs;
3230 		iovcnt = bdev_io->u.bdev.iovcnt;
3231 		break;
3232 	case SPDK_BDEV_IO_TYPE_WRITE:
3233 		iovs = bdev_io->u.bdev.iovs;
3234 		iovcnt = bdev_io->u.bdev.iovcnt;
3235 		break;
3236 	default:
3237 		iovs = NULL;
3238 		iovcnt = 0;
3239 		break;
3240 	}
3241 
3242 	if (iovp) {
3243 		*iovp = iovs;
3244 	}
3245 	if (iovcntp) {
3246 		*iovcntp = iovcnt;
3247 	}
3248 }
3249 
3250 void
3251 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
3252 {
3253 
3254 	if (spdk_bdev_module_list_find(bdev_module->name)) {
3255 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
3256 		assert(false);
3257 	}
3258 
3259 	if (bdev_module->async_init) {
3260 		bdev_module->internal.action_in_progress = 1;
3261 	}
3262 
3263 	/*
3264 	 * Modules with examine callbacks must be initialized first, so they are
3265 	 *  ready to handle examine callbacks from later modules that will
3266 	 *  register physical bdevs.
3267 	 */
3268 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
3269 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
3270 	} else {
3271 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
3272 	}
3273 }
3274 
3275 struct spdk_bdev_module *
3276 spdk_bdev_module_list_find(const char *name)
3277 {
3278 	struct spdk_bdev_module *bdev_module;
3279 
3280 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
3281 		if (strcmp(name, bdev_module->name) == 0) {
3282 			break;
3283 		}
3284 	}
3285 
3286 	return bdev_module;
3287 }
3288 
3289 static void
3290 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3291 {
3292 	uint64_t len;
3293 
3294 	if (!success) {
3295 		bdev_io->internal.cb = bdev_io->u.bdev.stored_user_cb;
3296 		_spdk_bdev_io_complete(bdev_io);
3297 		return;
3298 	}
3299 
3300 	/* No need to repeat the error checking from spdk_bdev_write_zeroes_blocks(); this request already passed those checks. */
3301 	len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks,
3302 		       ZERO_BUFFER_SIZE);
3303 
3304 	bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks;
3305 	bdev_io->u.bdev.iovs[0].iov_len = len;
3306 	bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev);
3307 	bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
3308 	bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks;
3309 
3310 	/* If this round completes the I/O, change the callback back to the original user callback. */
3311 	if (bdev_io->u.bdev.split_remaining_num_blocks == 0) {
3312 		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb);
3313 	} else {
3314 		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
3315 	}
3316 	spdk_bdev_io_submit(bdev_io);
3317 }
3318 
3319 struct set_qos_limit_ctx {
3320 	void (*cb_fn)(void *cb_arg, int status);
3321 	void *cb_arg;
3322 	struct spdk_bdev *bdev;
3323 };
3324 
3325 static void
3326 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
3327 {
3328 	pthread_mutex_lock(&ctx->bdev->internal.mutex);
3329 	ctx->bdev->internal.qos_mod_in_progress = false;
3330 	pthread_mutex_unlock(&ctx->bdev->internal.mutex);
3331 
3332 	ctx->cb_fn(ctx->cb_arg, status);
3333 	free(ctx);
3334 }
3335 
3336 static void
3337 _spdk_bdev_disable_qos_done(void *cb_arg)
3338 {
3339 	struct set_qos_limit_ctx *ctx = cb_arg;
3340 	struct spdk_bdev *bdev = ctx->bdev;
3341 	struct spdk_bdev_io *bdev_io;
3342 	struct spdk_bdev_qos *qos;
3343 
3344 	pthread_mutex_lock(&bdev->internal.mutex);
3345 	qos = bdev->internal.qos;
3346 	bdev->internal.qos = NULL;
3347 	pthread_mutex_unlock(&bdev->internal.mutex);
3348 
3349 	while (!TAILQ_EMPTY(&qos->queued)) {
3350 		/* Send queued I/O back to their original thread for resubmission. */
3351 		bdev_io = TAILQ_FIRST(&qos->queued);
3352 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
3353 
3354 		if (bdev_io->internal.io_submit_ch) {
3355 			/*
3356 			 * Channel was changed when sending it to the QoS thread - change it back
3357 			 *  before sending it back to the original thread.
3358 			 */
3359 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
3360 			bdev_io->internal.io_submit_ch = NULL;
3361 		}
3362 
3363 		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel),
3364 				     _spdk_bdev_io_submit, bdev_io);
3365 	}
3366 
3367 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
3368 	spdk_poller_unregister(&qos->poller);
3369 
3370 	free(qos);
3371 
3372 	_spdk_bdev_set_qos_limit_done(ctx, 0);
3373 }
3374 
3375 static void
3376 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
3377 {
3378 	void *io_device = spdk_io_channel_iter_get_io_device(i);
3379 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
3380 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
3381 	struct spdk_thread *thread;
3382 
3383 	pthread_mutex_lock(&bdev->internal.mutex);
3384 	thread = bdev->internal.qos->thread;
3385 	pthread_mutex_unlock(&bdev->internal.mutex);
3386 
3387 	spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx);
3388 }
3389 
3390 static void
3391 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
3392 {
3393 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
3394 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
3395 
3396 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
3397 
3398 	spdk_for_each_channel_continue(i, 0);
3399 }
3400 
3401 static void
3402 _spdk_bdev_update_qos_limit_iops_msg(void *cb_arg)
3403 {
3404 	struct set_qos_limit_ctx *ctx = cb_arg;
3405 	struct spdk_bdev *bdev = ctx->bdev;
3406 
3407 	pthread_mutex_lock(&bdev->internal.mutex);
3408 	spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
3409 	pthread_mutex_unlock(&bdev->internal.mutex);
3410 
3411 	_spdk_bdev_set_qos_limit_done(ctx, 0);
3412 }
3413 
3414 static void
3415 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
3416 {
3417 	void *io_device = spdk_io_channel_iter_get_io_device(i);
3418 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
3419 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
3420 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
3421 
3422 	pthread_mutex_lock(&bdev->internal.mutex);
3423 	_spdk_bdev_enable_qos(bdev, bdev_ch);
3424 	pthread_mutex_unlock(&bdev->internal.mutex);
3425 	spdk_for_each_channel_continue(i, 0);
3426 }
3427 
3428 static void
3429 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
3430 {
3431 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
3432 
3433 	_spdk_bdev_set_qos_limit_done(ctx, status);
3434 }
3435 
3436 void
3437 spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec,
3438 			     void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
3439 {
3440 	struct set_qos_limit_ctx *ctx;
3441 
3442 	if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) {
3443 		SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n",
3444 			    ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC);
3445 		cb_fn(cb_arg, -EINVAL);
3446 		return;
3447 	}
3448 
3449 	ctx = calloc(1, sizeof(*ctx));
3450 	if (ctx == NULL) {
3451 		cb_fn(cb_arg, -ENOMEM);
3452 		return;
3453 	}
3454 
3455 	ctx->cb_fn = cb_fn;
3456 	ctx->cb_arg = cb_arg;
3457 	ctx->bdev = bdev;
3458 
3459 	pthread_mutex_lock(&bdev->internal.mutex);
3460 	if (bdev->internal.qos_mod_in_progress) {
3461 		pthread_mutex_unlock(&bdev->internal.mutex);
3462 		free(ctx);
3463 		cb_fn(cb_arg, -EAGAIN);
3464 		return;
3465 	}
3466 	bdev->internal.qos_mod_in_progress = true;
3467 
3468 	if (ios_per_sec > 0) {
3469 		if (bdev->internal.qos == NULL) {
3470 			/* Enabling */
3471 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
3472 			if (!bdev->internal.qos) {
3473 				pthread_mutex_unlock(&bdev->internal.mutex);
3474 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
3475 				free(ctx);
3476 				cb_fn(cb_arg, -ENOMEM);
3477 				return;
3478 			}
3479 
3480 			bdev->internal.qos->iops_rate_limit = ios_per_sec;
3481 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
3482 					      _spdk_bdev_enable_qos_msg, ctx,
3483 					      _spdk_bdev_enable_qos_done);
3484 		} else {
3485 			/* Updating */
3486 			bdev->internal.qos->iops_rate_limit = ios_per_sec;
3487 			spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx);
3488 		}
3489 	} else {
3490 		if (bdev->internal.qos != NULL) {
3491 			/* Disabling */
3492 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
3493 					      _spdk_bdev_disable_qos_msg, ctx,
3494 					      _spdk_bdev_disable_qos_msg_done);
3495 		} else {
3496 			pthread_mutex_unlock(&bdev->internal.mutex);
3497 			_spdk_bdev_set_qos_limit_done(ctx, 0);
3498 			return;
3499 		}
3500 	}
3501 
3502 	pthread_mutex_unlock(&bdev->internal.mutex);
3503 }
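
/*
 * Usage sketch (illustrative; qos_done() is hypothetical). The limit must be a multiple
 * of SPDK_BDEV_QOS_MIN_IOS_PER_SEC (10000), and passing 0 disables rate limiting again:
 *
 *	static void
 *	qos_done(void *cb_arg, int status)
 *	{
 *		if (status != 0) {
 *			SPDK_ERRLOG("Setting the QoS IOPS limit failed: %d\n", status);
 *		}
 *	}
 *
 *	spdk_bdev_set_qos_limit_iops(bdev, 20000, qos_done, NULL);
 */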
3504 
3505 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)
3506