xref: /spdk/lib/bdev/bdev.c (revision 9f583911fd21e6bc0bbd482f2e9a40560e70781e)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
5  *   Copyright (c) Intel Corporation.
6  *   All rights reserved.
7  *
8  *   Redistribution and use in source and binary forms, with or without
9  *   modification, are permitted provided that the following conditions
10  *   are met:
11  *
12  *     * Redistributions of source code must retain the above copyright
13  *       notice, this list of conditions and the following disclaimer.
14  *     * Redistributions in binary form must reproduce the above copyright
15  *       notice, this list of conditions and the following disclaimer in
16  *       the documentation and/or other materials provided with the
17  *       distribution.
18  *     * Neither the name of Intel Corporation nor the names of its
19  *       contributors may be used to endorse or promote products derived
20  *       from this software without specific prior written permission.
21  *
22  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include "spdk/stdinc.h"
36 
37 #include "spdk/bdev.h"
38 #include "spdk/conf.h"
39 
40 #include "spdk/env.h"
41 #include "spdk/event.h"
42 #include "spdk/thread.h"
43 #include "spdk/likely.h"
44 #include "spdk/queue.h"
45 #include "spdk/nvme_spec.h"
46 #include "spdk/scsi_spec.h"
47 #include "spdk/util.h"
48 
49 #include "spdk/bdev_module.h"
50 #include "spdk_internal/log.h"
51 #include "spdk/string.h"
52 
53 #ifdef SPDK_CONFIG_VTUNE
54 #include "ittnotify.h"
55 #include "ittnotify_types.h"
56 int __itt_init_ittlib(const char *, __itt_group_id);
57 #endif
58 
59 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024)
60 #define SPDK_BDEV_IO_CACHE_SIZE			256
61 #define BUF_SMALL_POOL_SIZE			8192
62 #define BUF_LARGE_POOL_SIZE			1024
63 #define NOMEM_THRESHOLD_COUNT			8
64 #define ZERO_BUFFER_SIZE			0x100000
65 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
66 #define SPDK_BDEV_SEC_TO_USEC			1000000ULL
67 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
68 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
69 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		10000
70 #define SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC	10
71 
72 enum spdk_bdev_qos_type {
73 	SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT = 0,
74 	SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT,
75 	SPDK_BDEV_QOS_NUM_TYPES /* Keep last */
76 };
77 
78 static const char *qos_type_str[SPDK_BDEV_QOS_NUM_TYPES] = {"Limit_IOPS", "Limit_BWPS"};
79 
80 struct spdk_bdev_mgr {
81 	struct spdk_mempool *bdev_io_pool;
82 
83 	struct spdk_mempool *buf_small_pool;
84 	struct spdk_mempool *buf_large_pool;
85 
86 	void *zero_buffer;
87 
88 	TAILQ_HEAD(, spdk_bdev_module) bdev_modules;
89 
90 	TAILQ_HEAD(, spdk_bdev) bdevs;
91 
92 	bool init_complete;
93 	bool module_init_complete;
94 
95 #ifdef SPDK_CONFIG_VTUNE
96 	__itt_domain	*domain;
97 #endif
98 };
99 
100 static struct spdk_bdev_mgr g_bdev_mgr = {
101 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
102 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
103 	.init_complete = false,
104 	.module_init_complete = false,
105 };
106 
107 static struct spdk_bdev_opts	g_bdev_opts = {
108 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
109 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
110 };
111 
112 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
113 static void			*g_init_cb_arg = NULL;
114 
115 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
116 static void			*g_fini_cb_arg = NULL;
117 static struct spdk_thread	*g_fini_thread = NULL;
118 
119 struct spdk_bdev_qos {
120 	/** Rate limit, in I/O per second */
121 	uint64_t iops_rate_limit;
122 
123 	/** Rate limit, in bytes per second */
124 	uint64_t byte_rate_limit;
125 
126 	/** The channel that all I/O are funneled through */
127 	struct spdk_bdev_channel *ch;
128 
129 	/** The thread on which the poller is running. */
130 	struct spdk_thread *thread;
131 
132 	/** Queue of I/O waiting to be issued. */
133 	bdev_io_tailq_t queued;
134 
135 	/** Maximum allowed IOs to be issued in one timeslice (e.g., 1ms) and
136 	 *  only valid for the master channel which manages the outstanding IOs. */
137 	uint64_t max_ios_per_timeslice;
138 
139 	/** Maximum allowed bytes to be issued in one timeslice (e.g., 1ms) and
140 	 *  only valid for the master channel which manages the outstanding IOs. */
141 	uint64_t max_byte_per_timeslice;
142 
143 	/** Submitted IO in one timeslice (e.g., 1ms) */
144 	uint64_t io_submitted_this_timeslice;
145 
146 	/** Submitted bytes in one timeslice (e.g., 1ms) */
147 	uint64_t byte_submitted_this_timeslice;
148 
149 	/** Poller that processes queued I/O commands each time slice. */
150 	struct spdk_poller *poller;
151 };
152 
153 struct spdk_bdev_mgmt_channel {
154 	bdev_io_stailq_t need_buf_small;
155 	bdev_io_stailq_t need_buf_large;
156 
157 	/*
158 	 * Each thread keeps a cache of bdev_io - this allows
159 	 *  bdev threads which are *not* DPDK threads to still
160 	 *  benefit from a per-thread bdev_io cache.  Without
161 	 *  this, non-DPDK threads fetching from the mempool
162 	 *  incur a cmpxchg on get and put.
163 	 */
164 	bdev_io_stailq_t per_thread_cache;
165 	uint32_t	per_thread_cache_count;
166 	uint32_t	bdev_io_cache_size;
167 
168 	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
169 };
170 
171 /*
172  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
173  * will queue their I/O awaiting retry here. This makes it possible to retry sending
174  * I/O to one bdev after I/O from another bdev completes.
175  */
176 struct spdk_bdev_shared_resource {
177 	/* The bdev management channel */
178 	struct spdk_bdev_mgmt_channel *mgmt_ch;
179 
180 	/*
181 	 * Count of I/O submitted to bdev module and waiting for completion.
182 	 * Incremented before submit_request() is called on an spdk_bdev_io.
183 	 */
184 	uint64_t		io_outstanding;
185 
186 	/*
187 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
188 	 *  on this channel.
189 	 */
190 	bdev_io_tailq_t		nomem_io;
191 
192 	/*
193 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
194 	 */
195 	uint64_t		nomem_threshold;
196 
197 	/* I/O channel allocated by a bdev module */
198 	struct spdk_io_channel	*shared_ch;
199 
200 	/* Refcount of bdev channels using this resource */
201 	uint32_t		ref;
202 
203 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
204 };
205 
206 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
207 #define BDEV_CH_QOS_ENABLED		(1 << 1)
208 
209 struct spdk_bdev_channel {
210 	struct spdk_bdev	*bdev;
211 
212 	/* The channel for the underlying device */
213 	struct spdk_io_channel	*channel;
214 
215 	/* Per io_device per thread data */
216 	struct spdk_bdev_shared_resource *shared_resource;
217 
218 	struct spdk_bdev_io_stat stat;
219 
220 	/*
221 	 * Count of I/O submitted through this channel and waiting for completion.
222 	 * Incremented before submit_request() is called on an spdk_bdev_io.
223 	 */
224 	uint64_t		io_outstanding;
225 
226 	bdev_io_tailq_t		queued_resets;
227 
228 	uint32_t		flags;
229 
230 #ifdef SPDK_CONFIG_VTUNE
231 	uint64_t		start_tsc;
232 	uint64_t		interval_tsc;
233 	__itt_string_handle	*handle;
234 	struct spdk_bdev_io_stat prev_stat;
235 #endif
236 
237 };
238 
239 struct spdk_bdev_desc {
240 	struct spdk_bdev		*bdev;
241 	spdk_bdev_remove_cb_t		remove_cb;
242 	void				*remove_ctx;
243 	bool				write;
244 	TAILQ_ENTRY(spdk_bdev_desc)	link;
245 };
246 
247 struct spdk_bdev_iostat_ctx {
248 	struct spdk_bdev_io_stat *stat;
249 	spdk_bdev_get_device_stat_cb cb;
250 	void *cb_arg;
251 };
252 
253 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
254 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
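
/*
 * Note: the one-byte offset above likely exists so that the pointer registered as the
 * bdev's io_device can never collide with the bdev pointer itself, which a bdev module
 * may register as an io_device of its own; __bdev_from_io_dev() simply undoes the offset.
 */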
255 
256 static void spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
257 
258 void
259 spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
260 {
261 	*opts = g_bdev_opts;
262 }
263 
264 int
265 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
266 {
267 	if (opts->bdev_io_pool_size < opts->bdev_io_cache_size * spdk_thread_get_count()) {
268 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
269 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
270 			    spdk_thread_get_count());
271 		return -1;
272 	}
273 
274 	g_bdev_opts = *opts;
275 	return 0;
276 }
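
/*
 * Illustrative sketch (added for clarity; the function name and values are hypothetical):
 * how an application could adjust the global bdev options before spdk_bdev_initialize().
 */
static int
example_tune_bdev_opts(void)
{
	struct spdk_bdev_opts opts;

	/* Start from the current defaults so unrelated fields keep their values. */
	spdk_bdev_get_opts(&opts);

	opts.bdev_io_pool_size = 32 * 1024;	/* hypothetical pool size */
	opts.bdev_io_cache_size = 128;		/* hypothetical per-thread cache size */

	/* Rejected (-1) if the pool cannot cover one cache per thread, as checked above. */
	return spdk_bdev_set_opts(&opts);
}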
277 
278 struct spdk_bdev *
279 spdk_bdev_first(void)
280 {
281 	struct spdk_bdev *bdev;
282 
283 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
284 	if (bdev) {
285 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
286 	}
287 
288 	return bdev;
289 }
290 
291 struct spdk_bdev *
292 spdk_bdev_next(struct spdk_bdev *prev)
293 {
294 	struct spdk_bdev *bdev;
295 
296 	bdev = TAILQ_NEXT(prev, link);
297 	if (bdev) {
298 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
299 	}
300 
301 	return bdev;
302 }
303 
304 static struct spdk_bdev *
305 _bdev_next_leaf(struct spdk_bdev *bdev)
306 {
307 	while (bdev != NULL) {
308 		if (bdev->claim_module == NULL) {
309 			return bdev;
310 		} else {
311 			bdev = TAILQ_NEXT(bdev, link);
312 		}
313 	}
314 
315 	return bdev;
316 }
317 
318 struct spdk_bdev *
319 spdk_bdev_first_leaf(void)
320 {
321 	struct spdk_bdev *bdev;
322 
323 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
324 
325 	if (bdev) {
326 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
327 	}
328 
329 	return bdev;
330 }
331 
332 struct spdk_bdev *
333 spdk_bdev_next_leaf(struct spdk_bdev *prev)
334 {
335 	struct spdk_bdev *bdev;
336 
337 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, link));
338 
339 	if (bdev) {
340 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
341 	}
342 
343 	return bdev;
344 }
345 
346 struct spdk_bdev *
347 spdk_bdev_get_by_name(const char *bdev_name)
348 {
349 	struct spdk_bdev_alias *tmp;
350 	struct spdk_bdev *bdev = spdk_bdev_first();
351 
352 	while (bdev != NULL) {
353 		if (strcmp(bdev_name, bdev->name) == 0) {
354 			return bdev;
355 		}
356 
357 		TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
358 			if (strcmp(bdev_name, tmp->alias) == 0) {
359 				return bdev;
360 			}
361 		}
362 
363 		bdev = spdk_bdev_next(bdev);
364 	}
365 
366 	return NULL;
367 }
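
/*
 * Illustrative sketch (added for clarity; the bdev name and function name are
 * hypothetical): looking a bdev up by name or alias and reporting its geometry.
 */
static struct spdk_bdev *
example_lookup_bdev(void)
{
	struct spdk_bdev *bdev;

	bdev = spdk_bdev_get_by_name("Malloc0");
	if (bdev == NULL) {
		SPDK_ERRLOG("bdev Malloc0 not found\n");
		return NULL;
	}

	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "bdev %s: %" PRIu64 " blocks of %" PRIu32 " bytes\n",
		      spdk_bdev_get_name(bdev), spdk_bdev_get_num_blocks(bdev),
		      spdk_bdev_get_block_size(bdev));

	return bdev;
}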
368 
369 static void
370 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf)
371 {
372 	assert(bdev_io->get_buf_cb != NULL);
373 	assert(buf != NULL);
374 	assert(bdev_io->u.bdev.iovs != NULL);
375 
376 	bdev_io->buf = buf;
377 	bdev_io->u.bdev.iovs[0].iov_base = (void *)((unsigned long)((char *)buf + 512) & ~511UL);
378 	bdev_io->u.bdev.iovs[0].iov_len = bdev_io->buf_len;
379 	bdev_io->get_buf_cb(bdev_io->ch->channel, bdev_io);
380 }
381 
382 static void
383 spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
384 {
385 	struct spdk_mempool *pool;
386 	struct spdk_bdev_io *tmp;
387 	void *buf;
388 	bdev_io_stailq_t *stailq;
389 	struct spdk_bdev_mgmt_channel *ch;
390 
391 	assert(bdev_io->u.bdev.iovcnt == 1);
392 
393 	buf = bdev_io->buf;
394 	ch = bdev_io->ch->shared_resource->mgmt_ch;
395 
396 	if (bdev_io->buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
397 		pool = g_bdev_mgr.buf_small_pool;
398 		stailq = &ch->need_buf_small;
399 	} else {
400 		pool = g_bdev_mgr.buf_large_pool;
401 		stailq = &ch->need_buf_large;
402 	}
403 
404 	if (STAILQ_EMPTY(stailq)) {
405 		spdk_mempool_put(pool, buf);
406 	} else {
407 		tmp = STAILQ_FIRST(stailq);
408 		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
409 		spdk_bdev_io_set_buf(tmp, buf);
410 	}
411 }
412 
413 void
414 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
415 {
416 	struct spdk_mempool *pool;
417 	bdev_io_stailq_t *stailq;
418 	void *buf = NULL;
419 	struct spdk_bdev_mgmt_channel *mgmt_ch;
420 
421 	assert(cb != NULL);
422 	assert(bdev_io->u.bdev.iovs != NULL);
423 
424 	if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) {
425 		/* Buffer already present */
426 		cb(bdev_io->ch->channel, bdev_io);
427 		return;
428 	}
429 
430 	assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE);
431 	mgmt_ch = bdev_io->ch->shared_resource->mgmt_ch;
432 
433 	bdev_io->buf_len = len;
434 	bdev_io->get_buf_cb = cb;
435 	if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) {
436 		pool = g_bdev_mgr.buf_small_pool;
437 		stailq = &mgmt_ch->need_buf_small;
438 	} else {
439 		pool = g_bdev_mgr.buf_large_pool;
440 		stailq = &mgmt_ch->need_buf_large;
441 	}
442 
443 	buf = spdk_mempool_get(pool);
444 
445 	if (!buf) {
446 		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
447 	} else {
448 		spdk_bdev_io_set_buf(bdev_io, buf);
449 	}
450 }
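
/*
 * Illustrative sketch (added for clarity; function names are hypothetical): the typical
 * caller pattern for spdk_bdev_io_get_buf().  A bdev module handling a READ that needs
 * a data buffer defers the transfer until one of the pooled buffers is available; the
 * callback signature matches spdk_bdev_io_get_buf_cb as invoked above.
 */
static void
example_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	/* bdev_io->u.bdev.iovs[0] now points to a buffer of the requested length;
	 * the module would start the actual read into it here. */
}

static void
example_submit_read(struct spdk_bdev_io *bdev_io)
{
	spdk_bdev_io_get_buf(bdev_io, example_read_get_buf_cb,
			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
}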
451 
452 static int
453 spdk_bdev_module_get_max_ctx_size(void)
454 {
455 	struct spdk_bdev_module *bdev_module;
456 	int max_bdev_module_size = 0;
457 
458 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
459 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
460 			max_bdev_module_size = bdev_module->get_ctx_size();
461 		}
462 	}
463 
464 	return max_bdev_module_size;
465 }
466 
467 void
468 spdk_bdev_config_text(FILE *fp)
469 {
470 	struct spdk_bdev_module *bdev_module;
471 
472 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
473 		if (bdev_module->config_text) {
474 			bdev_module->config_text(fp);
475 		}
476 	}
477 }
478 
479 void
480 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
481 {
482 	struct spdk_bdev_module *bdev_module;
483 	struct spdk_bdev *bdev;
484 
485 	assert(w != NULL);
486 
487 	spdk_json_write_array_begin(w);
488 
489 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
490 		if (bdev_module->config_json) {
491 			bdev_module->config_json(w);
492 		}
493 	}
494 
495 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, link) {
496 		spdk_bdev_config_json(bdev, w);
497 	}
498 
499 	spdk_json_write_array_end(w);
500 }
501 
502 static int
503 spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
504 {
505 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
506 
507 	STAILQ_INIT(&ch->need_buf_small);
508 	STAILQ_INIT(&ch->need_buf_large);
509 
510 	STAILQ_INIT(&ch->per_thread_cache);
511 	ch->per_thread_cache_count = 0;
512 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
513 
514 	TAILQ_INIT(&ch->shared_resources);
515 
516 	return 0;
517 }
518 
519 static void
520 spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
521 {
522 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
523 	struct spdk_bdev_io *bdev_io;
524 
525 	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
526 		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
527 	}
528 
529 	if (!TAILQ_EMPTY(&ch->shared_resources)) {
530 		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
531 	}
532 
533 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
534 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
535 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
536 		ch->per_thread_cache_count--;
537 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
538 	}
539 
540 	assert(ch->per_thread_cache_count == 0);
541 }
542 
543 static void
544 spdk_bdev_init_complete(int rc)
545 {
546 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
547 	void *cb_arg = g_init_cb_arg;
548 	struct spdk_bdev_module *m;
549 
550 	g_bdev_mgr.init_complete = true;
551 	g_init_cb_fn = NULL;
552 	g_init_cb_arg = NULL;
553 
554 	/*
555 	 * For modules that need to know when subsystem init is complete,
556 	 * inform them now.
557 	 */
558 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
559 		if (m->init_complete) {
560 			m->init_complete();
561 		}
562 	}
563 
564 	cb_fn(cb_arg, rc);
565 }
566 
567 static void
568 spdk_bdev_module_action_complete(void)
569 {
570 	struct spdk_bdev_module *m;
571 
572 	/*
573 	 * Don't finish bdev subsystem initialization if
574 	 * module pre-initialization is still in progress, or
575 	 * the subsystem has already been initialized.
576 	 */
577 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
578 		return;
579 	}
580 
581 	/*
582 	 * Check all bdev modules for inits/examinations in progress. If any
583 	 * exist, return immediately since we cannot finish bdev subsystem
584 	 * initialization until all are completed.
585 	 */
586 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, tailq) {
587 		if (m->action_in_progress > 0) {
588 			return;
589 		}
590 	}
591 
592 	/*
593 	 * Modules already finished initialization - now that all
594 	 * the bdev modules have finished their asynchronous I/O
595 	 * processing, the entire bdev layer can be marked as complete.
596 	 */
597 	spdk_bdev_init_complete(0);
598 }
599 
600 static void
601 spdk_bdev_module_action_done(struct spdk_bdev_module *module)
602 {
603 	assert(module->action_in_progress > 0);
604 	module->action_in_progress--;
605 	spdk_bdev_module_action_complete();
606 }
607 
608 void
609 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
610 {
611 	spdk_bdev_module_action_done(module);
612 }
613 
614 void
615 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
616 {
617 	spdk_bdev_module_action_done(module);
618 }
619 
620 static int
621 spdk_bdev_modules_init(void)
622 {
623 	struct spdk_bdev_module *module;
624 	int rc = 0;
625 
626 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
627 		rc = module->module_init();
628 		if (rc != 0) {
629 			break;
630 		}
631 	}
632 
633 	g_bdev_mgr.module_init_complete = true;
634 	return rc;
635 }
636 
637 void
638 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
639 {
640 	struct spdk_conf_section *sp;
641 	struct spdk_bdev_opts bdev_opts;
642 	int32_t bdev_io_pool_size, bdev_io_cache_size;
643 	int cache_size;
644 	int rc = 0;
645 	char mempool_name[32];
646 
647 	assert(cb_fn != NULL);
648 
649 	sp = spdk_conf_find_section(NULL, "Bdev");
650 	if (sp != NULL) {
651 		spdk_bdev_get_opts(&bdev_opts);
652 
653 		bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
654 		if (bdev_io_pool_size >= 0) {
655 			bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
656 		}
657 
658 		bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
659 		if (bdev_io_cache_size >= 0) {
660 			bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
661 		}
662 
663 		if (spdk_bdev_set_opts(&bdev_opts)) {
664 			spdk_bdev_init_complete(-1);
665 			return;
666 		}
667 
668 		assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
669 	}
670 
671 	g_init_cb_fn = cb_fn;
672 	g_init_cb_arg = cb_arg;
673 
674 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
675 
676 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
677 				  g_bdev_opts.bdev_io_pool_size,
678 				  sizeof(struct spdk_bdev_io) +
679 				  spdk_bdev_module_get_max_ctx_size(),
680 				  0,
681 				  SPDK_ENV_SOCKET_ID_ANY);
682 
683 	if (g_bdev_mgr.bdev_io_pool == NULL) {
684 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
685 		spdk_bdev_init_complete(-1);
686 		return;
687 	}
688 
689 	/**
690 	 * Ensure no more than half of the total buffers end up in local caches, by
691 	 *   using spdk_thread_get_count() to determine how many local caches we need
692 	 *   to account for.
693 	 */
694 	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count());
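	/*
	 * Worked example (illustrative): with BUF_SMALL_POOL_SIZE = 8192 and 4 threads,
	 * cache_size = 8192 / (2 * 4) = 1024 buffers per thread, so the per-thread caches
	 * together hold at most 4096 buffers, i.e. half of the pool.
	 */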
695 	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());
696 
697 	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
698 				    BUF_SMALL_POOL_SIZE,
699 				    SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512,
700 				    cache_size,
701 				    SPDK_ENV_SOCKET_ID_ANY);
702 	if (!g_bdev_mgr.buf_small_pool) {
703 		SPDK_ERRLOG("create rbuf small pool failed\n");
704 		spdk_bdev_init_complete(-1);
705 		return;
706 	}
707 
708 	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count());
709 	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());
710 
711 	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
712 				    BUF_LARGE_POOL_SIZE,
713 				    SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512,
714 				    cache_size,
715 				    SPDK_ENV_SOCKET_ID_ANY);
716 	if (!g_bdev_mgr.buf_large_pool) {
717 		SPDK_ERRLOG("create rbuf large pool failed\n");
718 		spdk_bdev_init_complete(-1);
719 		return;
720 	}
721 
722 	g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
723 				 NULL);
724 	if (!g_bdev_mgr.zero_buffer) {
725 		SPDK_ERRLOG("create bdev zero buffer failed\n");
726 		spdk_bdev_init_complete(-1);
727 		return;
728 	}
729 
730 #ifdef SPDK_CONFIG_VTUNE
731 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
732 #endif
733 
734 	spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create,
735 				spdk_bdev_mgmt_channel_destroy,
736 				sizeof(struct spdk_bdev_mgmt_channel));
737 
738 	rc = spdk_bdev_modules_init();
739 	if (rc != 0) {
740 		SPDK_ERRLOG("bdev modules init failed\n");
741 		spdk_bdev_init_complete(-1);
742 		return;
743 	}
744 
745 	spdk_bdev_module_action_complete();
746 }
747 
748 static void
749 spdk_bdev_mgr_unregister_cb(void *io_device)
750 {
751 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
752 
753 	if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
754 		SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
755 			    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
756 			    g_bdev_opts.bdev_io_pool_size);
757 	}
758 
759 	if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
760 		SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
761 			    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
762 			    BUF_SMALL_POOL_SIZE);
763 		assert(false);
764 	}
765 
766 	if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
767 		SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
768 			    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
769 			    BUF_LARGE_POOL_SIZE);
770 		assert(false);
771 	}
772 
773 	spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
774 	spdk_mempool_free(g_bdev_mgr.buf_small_pool);
775 	spdk_mempool_free(g_bdev_mgr.buf_large_pool);
776 	spdk_dma_free(g_bdev_mgr.zero_buffer);
777 
778 	cb_fn(g_fini_cb_arg);
779 	g_fini_cb_fn = NULL;
780 	g_fini_cb_arg = NULL;
781 }
782 
783 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
784 
785 static void
786 spdk_bdev_module_finish_iter(void *arg)
787 {
788 	struct spdk_bdev_module *bdev_module;
789 
790 	/* Start iterating from the last touched module */
791 	if (!g_resume_bdev_module) {
792 		bdev_module = TAILQ_FIRST(&g_bdev_mgr.bdev_modules);
793 	} else {
794 		bdev_module = TAILQ_NEXT(g_resume_bdev_module, tailq);
795 	}
796 
797 	while (bdev_module) {
798 		if (bdev_module->async_fini) {
799 			/* Save our place so we can resume later. We must
800 			 * save the variable here, before calling module_fini()
801 			 * below, because in some cases the module may immediately
802 			 * call spdk_bdev_module_finish_done() and re-enter
803 			 * this function to continue iterating. */
804 			g_resume_bdev_module = bdev_module;
805 		}
806 
807 		if (bdev_module->module_fini) {
808 			bdev_module->module_fini();
809 		}
810 
811 		if (bdev_module->async_fini) {
812 			return;
813 		}
814 
815 		bdev_module = TAILQ_NEXT(bdev_module, tailq);
816 	}
817 
818 	g_resume_bdev_module = NULL;
819 	spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb);
820 }
821 
822 void
823 spdk_bdev_module_finish_done(void)
824 {
825 	if (spdk_get_thread() != g_fini_thread) {
826 		spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL);
827 	} else {
828 		spdk_bdev_module_finish_iter(NULL);
829 	}
830 }
831 
832 static void
833 _spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
834 {
835 	struct spdk_bdev *bdev = cb_arg;
836 
837 	if (bdeverrno && bdev) {
838 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
839 			     bdev->name);
840 
841 		/*
842 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
843 		 *  bdev; try to recover by manually removing this bdev from the list and
844 		 *  continuing with the next bdev in the list.
845 		 */
846 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
847 	}
848 
849 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
850 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
851 		/*
852 		 * Bdev module finish needs to be deferred as we might be in the middle of some context
853 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
854 		 * after returning.
855 		 */
856 		spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL);
857 		return;
858 	}
859 
860 	/*
861 	 * Unregister the first bdev in the list.
862 	 *
863 	 * spdk_bdev_unregister() will handle the case where the bdev has open descriptors by
864 	 *  calling the remove_cb of the descriptors first.
865 	 *
866 	 * Once this bdev and all of its open descriptors have been cleaned up, this function
867 	 *  will be called again via the unregister completion callback to continue the cleanup
868 	 *  process with the next bdev.
869 	 */
870 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
871 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
872 	spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev);
873 }
874 
875 void
876 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
877 {
878 	assert(cb_fn != NULL);
879 
880 	g_fini_thread = spdk_get_thread();
881 
882 	g_fini_cb_fn = cb_fn;
883 	g_fini_cb_arg = cb_arg;
884 
885 	_spdk_bdev_finish_unregister_bdevs_iter(NULL, 0);
886 }
887 
888 static struct spdk_bdev_io *
889 spdk_bdev_get_io(struct spdk_bdev_channel *channel)
890 {
891 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
892 	struct spdk_bdev_io *bdev_io;
893 
894 	if (ch->per_thread_cache_count > 0) {
895 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
896 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
897 		ch->per_thread_cache_count--;
898 	} else {
899 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
900 		if (!bdev_io) {
901 			SPDK_ERRLOG("Unable to get spdk_bdev_io\n");
902 			return NULL;
903 		}
904 	}
905 
906 	return bdev_io;
907 }
908 
909 static void
910 spdk_bdev_put_io(struct spdk_bdev_io *bdev_io)
911 {
912 	struct spdk_bdev_mgmt_channel *ch = bdev_io->ch->shared_resource->mgmt_ch;
913 
914 	if (bdev_io->buf != NULL) {
915 		spdk_bdev_io_put_buf(bdev_io);
916 	}
917 
918 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
919 		ch->per_thread_cache_count++;
920 		STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link);
921 	} else {
922 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
923 	}
924 }
925 
926 static uint64_t
927 _spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
928 {
929 	struct spdk_bdev	*bdev = bdev_io->bdev;
930 
931 	switch (bdev_io->type) {
932 	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
933 	case SPDK_BDEV_IO_TYPE_NVME_IO:
934 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
935 		return bdev_io->u.nvme_passthru.nbytes;
936 	case SPDK_BDEV_IO_TYPE_READ:
937 	case SPDK_BDEV_IO_TYPE_WRITE:
938 	case SPDK_BDEV_IO_TYPE_UNMAP:
939 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
940 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
941 	default:
942 		return 0;
943 	}
944 }
945 
946 static void
947 _spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch)
948 {
949 	struct spdk_bdev_io		*bdev_io = NULL;
950 	struct spdk_bdev		*bdev = ch->bdev;
951 	struct spdk_bdev_qos		*qos = bdev->qos;
952 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
953 
954 	while (!TAILQ_EMPTY(&qos->queued)) {
955 		if (qos->max_ios_per_timeslice > 0 &&
956 		    qos->io_submitted_this_timeslice >= qos->max_ios_per_timeslice) {
957 			break;
958 		}
959 
960 		if (qos->max_byte_per_timeslice > 0 &&
961 		    qos->byte_submitted_this_timeslice >= qos->max_byte_per_timeslice) {
962 			break;
963 		}
964 
965 		bdev_io = TAILQ_FIRST(&qos->queued);
966 		TAILQ_REMOVE(&qos->queued, bdev_io, link);
967 		qos->io_submitted_this_timeslice++;
968 		qos->byte_submitted_this_timeslice += _spdk_bdev_get_io_size_in_byte(bdev_io);
969 		ch->io_outstanding++;
970 		shared_resource->io_outstanding++;
971 		bdev->fn_table->submit_request(ch->channel, bdev_io);
972 	}
973 }
974 
975 static void
976 _spdk_bdev_io_submit(void *ctx)
977 {
978 	struct spdk_bdev_io *bdev_io = ctx;
979 	struct spdk_bdev *bdev = bdev_io->bdev;
980 	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
981 	struct spdk_io_channel *ch = bdev_ch->channel;
982 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
983 
984 	bdev_io->submit_tsc = spdk_get_ticks();
985 	bdev_ch->io_outstanding++;
986 	shared_resource->io_outstanding++;
987 	bdev_io->in_submit_request = true;
988 	if (spdk_likely(bdev_ch->flags == 0)) {
989 		if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
990 			bdev->fn_table->submit_request(ch, bdev_io);
991 		} else {
992 			bdev_ch->io_outstanding--;
993 			shared_resource->io_outstanding--;
994 			TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, link);
995 		}
996 	} else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
997 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
998 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
999 		bdev_ch->io_outstanding--;
1000 		shared_resource->io_outstanding--;
1001 		TAILQ_INSERT_TAIL(&bdev->qos->queued, bdev_io, link);
1002 		_spdk_bdev_qos_io_submit(bdev_ch);
1003 	} else {
1004 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
1005 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1006 	}
1007 	bdev_io->in_submit_request = false;
1008 }
1009 
1010 static void
1011 spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io)
1012 {
1013 	struct spdk_bdev *bdev = bdev_io->bdev;
1014 	struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->ch->channel);
1015 
1016 	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
1017 
1018 	if (bdev_io->ch->flags & BDEV_CH_QOS_ENABLED) {
1019 		if (thread == bdev->qos->thread) {
1020 			_spdk_bdev_io_submit(bdev_io);
1021 		} else {
1022 			bdev_io->io_submit_ch = bdev_io->ch;
1023 			bdev_io->ch = bdev->qos->ch;
1024 			spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_io_submit, bdev_io);
1025 		}
1026 	} else {
1027 		_spdk_bdev_io_submit(bdev_io);
1028 	}
1029 }
1030 
1031 static void
1032 spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
1033 {
1034 	struct spdk_bdev *bdev = bdev_io->bdev;
1035 	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
1036 	struct spdk_io_channel *ch = bdev_ch->channel;
1037 
1038 	assert(bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING);
1039 
1040 	bdev_io->in_submit_request = true;
1041 	bdev->fn_table->submit_request(ch, bdev_io);
1042 	bdev_io->in_submit_request = false;
1043 }
1044 
1045 static void
1046 spdk_bdev_io_init(struct spdk_bdev_io *bdev_io,
1047 		  struct spdk_bdev *bdev, void *cb_arg,
1048 		  spdk_bdev_io_completion_cb cb)
1049 {
1050 	bdev_io->bdev = bdev;
1051 	bdev_io->caller_ctx = cb_arg;
1052 	bdev_io->cb = cb;
1053 	bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
1054 	bdev_io->in_submit_request = false;
1055 	bdev_io->buf = NULL;
1056 	bdev_io->io_submit_ch = NULL;
1057 }
1058 
1059 bool
1060 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
1061 {
1062 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
1063 }
1064 
1065 int
1066 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1067 {
1068 	if (bdev->fn_table->dump_info_json) {
1069 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
1070 	}
1071 
1072 	return 0;
1073 }
1074 
1075 void
1076 spdk_bdev_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1077 {
1078 	assert(bdev != NULL);
1079 	assert(w != NULL);
1080 
1081 	if (bdev->fn_table->write_config_json) {
1082 		bdev->fn_table->write_config_json(bdev, w);
1083 	} else {
1084 		spdk_json_write_object_begin(w);
1085 		spdk_json_write_named_string(w, "name", bdev->name);
1086 		spdk_json_write_object_end(w);
1087 	}
1088 }
1089 
1090 static void
1091 spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
1092 {
1093 	uint64_t max_ios_per_timeslice = 0, max_byte_per_timeslice = 0;
1094 
1095 	if (qos->iops_rate_limit > 0) {
1096 		max_ios_per_timeslice = qos->iops_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
1097 					SPDK_BDEV_SEC_TO_USEC;
1098 		qos->max_ios_per_timeslice = spdk_max(max_ios_per_timeslice,
1099 						      SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE);
1100 	}
1101 
1102 	if (qos->byte_rate_limit > 0) {
1103 		max_byte_per_timeslice = qos->byte_rate_limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC /
1104 					 SPDK_BDEV_SEC_TO_USEC;
1105 		qos->max_byte_per_timeslice = spdk_max(max_byte_per_timeslice,
1106 						       SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE);
1107 	}
1108 }
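
/*
 * Worked example (illustrative): with iops_rate_limit = 10000 and the 1000 usec
 * timeslice defined above, max_ios_per_timeslice = 10000 * 1000 / 1000000 = 10 I/O per
 * timeslice.  The spdk_max() calls keep very low rate limits from rounding the
 * per-timeslice quota down to zero.
 */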
1109 
1110 static int
1111 spdk_bdev_channel_poll_qos(void *arg)
1112 {
1113 	struct spdk_bdev_qos *qos = arg;
1114 
1115 	/* Reset for next round of rate limiting */
1116 	qos->io_submitted_this_timeslice = 0;
1117 	qos->byte_submitted_this_timeslice = 0;
1118 
1119 	_spdk_bdev_qos_io_submit(qos->ch);
1120 
1121 	return -1;
1122 }
1123 
1124 static void
1125 _spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
1126 {
1127 	struct spdk_bdev_shared_resource *shared_resource;
1128 
1129 	if (!ch) {
1130 		return;
1131 	}
1132 
1133 	if (ch->channel) {
1134 		spdk_put_io_channel(ch->channel);
1135 	}
1136 
1137 	assert(ch->io_outstanding == 0);
1138 
1139 	shared_resource = ch->shared_resource;
1140 	if (shared_resource) {
1141 		assert(ch->io_outstanding == 0);
1142 		assert(shared_resource->ref > 0);
1143 		shared_resource->ref--;
1144 		if (shared_resource->ref == 0) {
1145 			assert(shared_resource->io_outstanding == 0);
1146 			spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
1147 			TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
1148 			free(shared_resource);
1149 		}
1150 	}
1151 }
1152 
1153 /* Caller must hold bdev->mutex. */
1154 static int
1155 _spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
1156 {
1157 	struct spdk_bdev_qos *qos = bdev->qos;
1158 
1159 	/* Rate limiting on this bdev enabled */
1160 	if (qos) {
1161 		if (qos->ch == NULL) {
1162 			struct spdk_io_channel *io_ch;
1163 
1164 			SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
1165 				      bdev->name, spdk_get_thread());
1166 
1167 			/* No qos channel has been selected, so set one up */
1168 
1169 			/* Take another reference to ch */
1170 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
1171 			qos->ch = ch;
1172 
1173 			qos->thread = spdk_io_channel_get_thread(io_ch);
1174 
1175 			TAILQ_INIT(&qos->queued);
1176 			spdk_bdev_qos_update_max_quota_per_timeslice(qos);
1177 			qos->io_submitted_this_timeslice = 0;
1178 			qos->byte_submitted_this_timeslice = 0;
1179 
1180 			qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos,
1181 							   qos,
1182 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
1183 		}
1184 
1185 		ch->flags |= BDEV_CH_QOS_ENABLED;
1186 	}
1187 
1188 	return 0;
1189 }
1190 
1191 static int
1192 spdk_bdev_channel_create(void *io_device, void *ctx_buf)
1193 {
1194 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
1195 	struct spdk_bdev_channel	*ch = ctx_buf;
1196 	struct spdk_io_channel		*mgmt_io_ch;
1197 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
1198 	struct spdk_bdev_shared_resource *shared_resource;
1199 
1200 	ch->bdev = bdev;
1201 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
1202 	if (!ch->channel) {
1203 		return -1;
1204 	}
1205 
1206 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
1207 	if (!mgmt_io_ch) {
1208 		return -1;
1209 	}
1210 
1211 	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
1212 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
1213 		if (shared_resource->shared_ch == ch->channel) {
1214 			spdk_put_io_channel(mgmt_io_ch);
1215 			shared_resource->ref++;
1216 			break;
1217 		}
1218 	}
1219 
1220 	if (shared_resource == NULL) {
1221 		shared_resource = calloc(1, sizeof(*shared_resource));
1222 		if (shared_resource == NULL) {
1223 			spdk_put_io_channel(mgmt_io_ch);
1224 			return -1;
1225 		}
1226 
1227 		shared_resource->mgmt_ch = mgmt_ch;
1228 		shared_resource->io_outstanding = 0;
1229 		TAILQ_INIT(&shared_resource->nomem_io);
1230 		shared_resource->nomem_threshold = 0;
1231 		shared_resource->shared_ch = ch->channel;
1232 		shared_resource->ref = 1;
1233 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
1234 	}
1235 
1236 	memset(&ch->stat, 0, sizeof(ch->stat));
1237 	ch->stat.ticks_rate = spdk_get_ticks_hz();
1238 	ch->io_outstanding = 0;
1239 	TAILQ_INIT(&ch->queued_resets);
1240 	ch->flags = 0;
1241 	ch->shared_resource = shared_resource;
1242 
1243 #ifdef SPDK_CONFIG_VTUNE
1244 	{
1245 		char *name;
1246 		__itt_init_ittlib(NULL, 0);
1247 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
1248 		if (!name) {
1249 			_spdk_bdev_channel_destroy_resource(ch);
1250 			return -1;
1251 		}
1252 		ch->handle = __itt_string_handle_create(name);
1253 		free(name);
1254 		ch->start_tsc = spdk_get_ticks();
1255 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
1256 		memset(&ch->prev_stat, 0, sizeof(ch->prev_stat));
1257 	}
1258 #endif
1259 
1260 	pthread_mutex_lock(&bdev->mutex);
1261 
1262 	if (_spdk_bdev_enable_qos(bdev, ch)) {
1263 		_spdk_bdev_channel_destroy_resource(ch);
1264 		pthread_mutex_unlock(&bdev->mutex);
1265 		return -1;
1266 	}
1267 
1268 	pthread_mutex_unlock(&bdev->mutex);
1269 
1270 	return 0;
1271 }
1272 
1273 /*
1274  * Abort I/O that are waiting on a data buffer.  These types of I/O are
1275  *  linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY.
1276  */
1277 static void
1278 _spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
1279 {
1280 	bdev_io_stailq_t tmp;
1281 	struct spdk_bdev_io *bdev_io;
1282 
1283 	STAILQ_INIT(&tmp);
1284 
1285 	while (!STAILQ_EMPTY(queue)) {
1286 		bdev_io = STAILQ_FIRST(queue);
1287 		STAILQ_REMOVE_HEAD(queue, internal.buf_link);
1288 		if (bdev_io->ch == ch) {
1289 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1290 		} else {
1291 			STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link);
1292 		}
1293 	}
1294 
1295 	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
1296 }
1297 
1298 /*
1299  * Abort I/O that are queued waiting for submission.  These types of I/O are
1300  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
1301  */
1302 static void
1303 _spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
1304 {
1305 	struct spdk_bdev_io *bdev_io, *tmp;
1306 
1307 	TAILQ_FOREACH_SAFE(bdev_io, queue, link, tmp) {
1308 		if (bdev_io->ch == ch) {
1309 			TAILQ_REMOVE(queue, bdev_io, link);
1310 			/*
1311 			 * spdk_bdev_io_complete() assumes that the completed I/O had
1312 			 *  been submitted to the bdev module.  Since in this case it
1313 			 *  hadn't, bump io_outstanding to account for the decrement
1314 			 *  that spdk_bdev_io_complete() will do.
1315 			 */
1316 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
1317 				ch->io_outstanding++;
1318 				ch->shared_resource->io_outstanding++;
1319 			}
1320 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
1321 		}
1322 	}
1323 }
1324 
1325 static void
1326 spdk_bdev_qos_channel_destroy(void *cb_arg)
1327 {
1328 	struct spdk_bdev_qos *qos = cb_arg;
1329 
1330 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
1331 	spdk_poller_unregister(&qos->poller);
1332 
1333 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos);
1334 
1335 	free(qos);
1336 }
1337 
1338 static int
1339 spdk_bdev_qos_destroy(struct spdk_bdev *bdev)
1340 {
1341 	/*
1342 	 * Cleanly shutting down the QoS poller is tricky, because
1343 	 * during the asynchronous operation the user could open
1344 	 * a new descriptor and create a new channel, spawning
1345 	 * a new QoS poller.
1346 	 *
1347 	 * The strategy is to create a new QoS structure here and swap it
1348 	 * in. The shutdown path then continues to refer to the old one
1349 	 * until it completes and then releases it.
1350 	 */
1351 	struct spdk_bdev_qos *new_qos, *old_qos;
1352 
1353 	old_qos = bdev->qos;
1354 
1355 	new_qos = calloc(1, sizeof(*new_qos));
1356 	if (!new_qos) {
1357 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
1358 		return -ENOMEM;
1359 	}
1360 
1361 	/* Copy the old QoS data into the newly allocated structure */
1362 	memcpy(new_qos, old_qos, sizeof(*new_qos));
1363 
1364 	/* Zero out the key parts of the QoS structure */
1365 	new_qos->ch = NULL;
1366 	new_qos->thread = NULL;
1367 	new_qos->max_ios_per_timeslice = 0;
1368 	new_qos->max_byte_per_timeslice = 0;
1369 	new_qos->io_submitted_this_timeslice = 0;
1370 	new_qos->byte_submitted_this_timeslice = 0;
1371 	new_qos->poller = NULL;
1372 	TAILQ_INIT(&new_qos->queued);
1373 
1374 	bdev->qos = new_qos;
1375 
1376 	spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy,
1377 			     old_qos);
1378 
1379 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
1380 	 * been destroyed yet. The destruction path will end up waiting for the final
1381 	 * channel to be put before it releases resources. */
1382 
1383 	return 0;
1384 }
1385 
1386 static void
1387 spdk_bdev_channel_destroy(void *io_device, void *ctx_buf)
1388 {
1389 	struct spdk_bdev_channel	*ch = ctx_buf;
1390 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
1391 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
1392 
1393 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
1394 		      spdk_get_thread());
1395 
1396 	mgmt_ch = shared_resource->mgmt_ch;
1397 
1398 	_spdk_bdev_abort_queued_io(&ch->queued_resets, ch);
1399 	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch);
1400 	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch);
1401 	_spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch);
1402 
1403 	_spdk_bdev_channel_destroy_resource(ch);
1404 }
1405 
1406 int
1407 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
1408 {
1409 	struct spdk_bdev_alias *tmp;
1410 
1411 	if (alias == NULL) {
1412 		SPDK_ERRLOG("Empty alias passed\n");
1413 		return -EINVAL;
1414 	}
1415 
1416 	if (spdk_bdev_get_by_name(alias)) {
1417 		SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
1418 		return -EEXIST;
1419 	}
1420 
1421 	tmp = calloc(1, sizeof(*tmp));
1422 	if (tmp == NULL) {
1423 		SPDK_ERRLOG("Unable to allocate alias\n");
1424 		return -ENOMEM;
1425 	}
1426 
1427 	tmp->alias = strdup(alias);
1428 	if (tmp->alias == NULL) {
1429 		free(tmp);
1430 		SPDK_ERRLOG("Unable to allocate alias\n");
1431 		return -ENOMEM;
1432 	}
1433 
1434 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
1435 
1436 	return 0;
1437 }
1438 
1439 int
1440 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
1441 {
1442 	struct spdk_bdev_alias *tmp;
1443 
1444 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
1445 		if (strcmp(alias, tmp->alias) == 0) {
1446 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
1447 			free(tmp->alias);
1448 			free(tmp);
1449 			return 0;
1450 		}
1451 	}
1452 
1453 	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);
1454 
1455 	return -ENOENT;
1456 }
1457 
1458 struct spdk_io_channel *
1459 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
1460 {
1461 	return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev));
1462 }
1463 
1464 const char *
1465 spdk_bdev_get_name(const struct spdk_bdev *bdev)
1466 {
1467 	return bdev->name;
1468 }
1469 
1470 const char *
1471 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
1472 {
1473 	return bdev->product_name;
1474 }
1475 
1476 const struct spdk_bdev_aliases_list *
1477 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
1478 {
1479 	return &bdev->aliases;
1480 }
1481 
1482 uint32_t
1483 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
1484 {
1485 	return bdev->blocklen;
1486 }
1487 
1488 uint64_t
1489 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
1490 {
1491 	return bdev->blockcnt;
1492 }
1493 
1494 uint64_t
1495 spdk_bdev_get_qos_ios_per_sec(struct spdk_bdev *bdev)
1496 {
1497 	uint64_t iops_rate_limit = 0;
1498 
1499 	pthread_mutex_lock(&bdev->mutex);
1500 	if (bdev->qos) {
1501 		iops_rate_limit = bdev->qos->iops_rate_limit;
1502 	}
1503 	pthread_mutex_unlock(&bdev->mutex);
1504 
1505 	return iops_rate_limit;
1506 }
1507 
1508 size_t
1509 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
1510 {
1511 	/* TODO: push this logic down to the bdev modules */
1512 	if (bdev->need_aligned_buffer) {
1513 		return bdev->blocklen;
1514 	}
1515 
1516 	return 1;
1517 }
1518 
1519 uint32_t
1520 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
1521 {
1522 	return bdev->optimal_io_boundary;
1523 }
1524 
1525 bool
1526 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
1527 {
1528 	return bdev->write_cache;
1529 }
1530 
1531 const struct spdk_uuid *
1532 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
1533 {
1534 	return &bdev->uuid;
1535 }
1536 
1537 int
1538 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
1539 {
1540 	int ret;
1541 
1542 	pthread_mutex_lock(&bdev->mutex);
1543 
1544 	/* bdev has open descriptors */
1545 	if (!TAILQ_EMPTY(&bdev->open_descs) &&
1546 	    bdev->blockcnt > size) {
1547 		ret = -EBUSY;
1548 	} else {
1549 		bdev->blockcnt = size;
1550 		ret = 0;
1551 	}
1552 
1553 	pthread_mutex_unlock(&bdev->mutex);
1554 
1555 	return ret;
1556 }
1557 
1558 /*
1559  * Convert I/O offset and length from bytes to blocks.
1560  *
1561  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
1562  */
1563 static uint64_t
1564 spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
1565 			  uint64_t num_bytes, uint64_t *num_blocks)
1566 {
1567 	uint32_t block_size = bdev->blocklen;
1568 
1569 	*offset_blocks = offset_bytes / block_size;
1570 	*num_blocks = num_bytes / block_size;
1571 
1572 	return (offset_bytes % block_size) | (num_bytes % block_size);
1573 }
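
/*
 * Worked example (illustrative): with a 512-byte block size, offset_bytes = 4096 and
 * num_bytes = 1024 give *offset_blocks = 8, *num_blocks = 2 and a return value of 0.
 * With offset_bytes = 100 the return value is (100 % 512) = 100, i.e. non-zero, so the
 * caller rejects the request with -EINVAL.
 */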
1574 
1575 static bool
1576 spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
1577 {
1578 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; this indicates
1579 	 * an overflow, meaning the offset has wrapped around */
1580 	if (offset_blocks + num_blocks < offset_blocks) {
1581 		return false;
1582 	}
1583 
1584 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
1585 	if (offset_blocks + num_blocks > bdev->blockcnt) {
1586 		return false;
1587 	}
1588 
1589 	return true;
1590 }
1591 
1592 int
1593 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1594 	       void *buf, uint64_t offset, uint64_t nbytes,
1595 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
1596 {
1597 	uint64_t offset_blocks, num_blocks;
1598 
1599 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1600 		return -EINVAL;
1601 	}
1602 
1603 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
1604 }
1605 
1606 int
1607 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1608 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
1609 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
1610 {
1611 	struct spdk_bdev *bdev = desc->bdev;
1612 	struct spdk_bdev_io *bdev_io;
1613 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1614 
1615 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1616 		return -EINVAL;
1617 	}
1618 
1619 	bdev_io = spdk_bdev_get_io(channel);
1620 	if (!bdev_io) {
1621 		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
1622 		return -ENOMEM;
1623 	}
1624 
1625 	bdev_io->ch = channel;
1626 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
1627 	bdev_io->u.bdev.iov.iov_base = buf;
1628 	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
1629 	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
1630 	bdev_io->u.bdev.iovcnt = 1;
1631 	bdev_io->u.bdev.num_blocks = num_blocks;
1632 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1633 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1634 
1635 	spdk_bdev_io_submit(bdev_io);
1636 	return 0;
1637 }
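
/*
 * Illustrative sketch (added for clarity; names are hypothetical): issuing a read from
 * an application.  desc and ch are assumed to come from an earlier spdk_bdev_open() and
 * spdk_bdev_get_io_channel() pair, and buf must be at least one block long and aligned
 * per spdk_bdev_get_buf_align().
 */
static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	if (!success) {
		SPDK_ERRLOG("example read failed\n");
	}

	/* Release the spdk_bdev_io once the completion has been handled. */
	spdk_bdev_free_io(bdev_io);
}

static int
example_read_first_block(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf)
{
	/* Read block 0; example_read_done() is invoked on this thread when the I/O completes. */
	return spdk_bdev_read_blocks(desc, ch, buf, 0, 1, example_read_done, NULL);
}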
1638 
1639 int
1640 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1641 		struct iovec *iov, int iovcnt,
1642 		uint64_t offset, uint64_t nbytes,
1643 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1644 {
1645 	uint64_t offset_blocks, num_blocks;
1646 
1647 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1648 		return -EINVAL;
1649 	}
1650 
1651 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
1652 }
1653 
1654 int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1655 			   struct iovec *iov, int iovcnt,
1656 			   uint64_t offset_blocks, uint64_t num_blocks,
1657 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
1658 {
1659 	struct spdk_bdev *bdev = desc->bdev;
1660 	struct spdk_bdev_io *bdev_io;
1661 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1662 
1663 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1664 		return -EINVAL;
1665 	}
1666 
1667 	bdev_io = spdk_bdev_get_io(channel);
1668 	if (!bdev_io) {
1669 		SPDK_ERRLOG("spdk_bdev_io memory allocation failed during read\n");
1670 		return -ENOMEM;
1671 	}
1672 
1673 	bdev_io->ch = channel;
1674 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
1675 	bdev_io->u.bdev.iovs = iov;
1676 	bdev_io->u.bdev.iovcnt = iovcnt;
1677 	bdev_io->u.bdev.num_blocks = num_blocks;
1678 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1679 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1680 
1681 	spdk_bdev_io_submit(bdev_io);
1682 	return 0;
1683 }
1684 
1685 int
1686 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1687 		void *buf, uint64_t offset, uint64_t nbytes,
1688 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1689 {
1690 	uint64_t offset_blocks, num_blocks;
1691 
1692 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1693 		return -EINVAL;
1694 	}
1695 
1696 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
1697 }
1698 
1699 int
1700 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1701 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
1702 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1703 {
1704 	struct spdk_bdev *bdev = desc->bdev;
1705 	struct spdk_bdev_io *bdev_io;
1706 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1707 
1708 	if (!desc->write) {
1709 		return -EBADF;
1710 	}
1711 
1712 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1713 		return -EINVAL;
1714 	}
1715 
1716 	bdev_io = spdk_bdev_get_io(channel);
1717 	if (!bdev_io) {
1718 		SPDK_ERRLOG("bdev_io memory allocation failed during write\n");
1719 		return -ENOMEM;
1720 	}
1721 
1722 	bdev_io->ch = channel;
1723 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
1724 	bdev_io->u.bdev.iov.iov_base = buf;
1725 	bdev_io->u.bdev.iov.iov_len = num_blocks * bdev->blocklen;
1726 	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
1727 	bdev_io->u.bdev.iovcnt = 1;
1728 	bdev_io->u.bdev.num_blocks = num_blocks;
1729 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1730 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1731 
1732 	spdk_bdev_io_submit(bdev_io);
1733 	return 0;
1734 }
1735 
1736 int
1737 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1738 		 struct iovec *iov, int iovcnt,
1739 		 uint64_t offset, uint64_t len,
1740 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
1741 {
1742 	uint64_t offset_blocks, num_blocks;
1743 
1744 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
1745 		return -EINVAL;
1746 	}
1747 
1748 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
1749 }
1750 
1751 int
1752 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1753 			struct iovec *iov, int iovcnt,
1754 			uint64_t offset_blocks, uint64_t num_blocks,
1755 			spdk_bdev_io_completion_cb cb, void *cb_arg)
1756 {
1757 	struct spdk_bdev *bdev = desc->bdev;
1758 	struct spdk_bdev_io *bdev_io;
1759 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1760 
1761 	if (!desc->write) {
1762 		return -EBADF;
1763 	}
1764 
1765 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1766 		return -EINVAL;
1767 	}
1768 
1769 	bdev_io = spdk_bdev_get_io(channel);
1770 	if (!bdev_io) {
1771 		SPDK_ERRLOG("bdev_io memory allocation failed during writev\n");
1772 		return -ENOMEM;
1773 	}
1774 
1775 	bdev_io->ch = channel;
1776 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
1777 	bdev_io->u.bdev.iovs = iov;
1778 	bdev_io->u.bdev.iovcnt = iovcnt;
1779 	bdev_io->u.bdev.num_blocks = num_blocks;
1780 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1781 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1782 
1783 	spdk_bdev_io_submit(bdev_io);
1784 	return 0;
1785 }
1786 
1787 int
1788 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1789 		       uint64_t offset, uint64_t len,
1790 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1791 {
1792 	uint64_t offset_blocks, num_blocks;
1793 
1794 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) {
1795 		return -EINVAL;
1796 	}
1797 
1798 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
1799 }
1800 
1801 int
1802 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1803 			      uint64_t offset_blocks, uint64_t num_blocks,
1804 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
1805 {
1806 	struct spdk_bdev *bdev = desc->bdev;
1807 	struct spdk_bdev_io *bdev_io;
1808 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1809 	uint64_t len;
1810 	bool split_request = false;
1811 
1812 	if (num_blocks > UINT64_MAX / spdk_bdev_get_block_size(bdev)) {
1813 		SPDK_ERRLOG("length argument out of range in write_zeroes\n");
1814 		return -ERANGE;
1815 	}
1816 
1817 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1818 		return -EINVAL;
1819 	}
1820 
1821 	bdev_io = spdk_bdev_get_io(channel);
1822 
1823 	if (!bdev_io) {
1824 		SPDK_ERRLOG("bdev_io memory allocation failed during write_zeroes\n");
1825 		return -ENOMEM;
1826 	}
1827 
1828 	bdev_io->ch = channel;
1829 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1830 
1831 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1832 		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
1833 		bdev_io->u.bdev.num_blocks = num_blocks;
1834 		bdev_io->u.bdev.iovs = NULL;
1835 		bdev_io->u.bdev.iovcnt = 0;
1836 
1837 	} else {
1838 		assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE);
1839 
1840 		len = spdk_bdev_get_block_size(bdev) * num_blocks;
1841 
1842 		if (len > ZERO_BUFFER_SIZE) {
1843 			split_request = true;
1844 			len = ZERO_BUFFER_SIZE;
1845 		}
1846 
1847 		bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
1848 		bdev_io->u.bdev.iov.iov_base = g_bdev_mgr.zero_buffer;
1849 		bdev_io->u.bdev.iov.iov_len = len;
1850 		bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
1851 		bdev_io->u.bdev.iovcnt = 1;
1852 		bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev);
1853 		bdev_io->u.bdev.split_remaining_num_blocks = num_blocks - bdev_io->u.bdev.num_blocks;
1854 		bdev_io->u.bdev.split_current_offset_blocks = offset_blocks + bdev_io->u.bdev.num_blocks;
1855 	}
1856 
1857 	if (split_request) {
1858 		bdev_io->u.bdev.stored_user_cb = cb;
1859 		spdk_bdev_io_init(bdev_io, bdev, cb_arg, spdk_bdev_write_zeroes_split);
1860 	} else {
1861 		spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1862 	}
1863 	spdk_bdev_io_submit(bdev_io);
1864 	return 0;
1865 }
1866 
1867 int
1868 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1869 		uint64_t offset, uint64_t nbytes,
1870 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1871 {
1872 	uint64_t offset_blocks, num_blocks;
1873 
1874 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
1875 		return -EINVAL;
1876 	}
1877 
1878 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
1879 }
1880 
1881 int
1882 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1883 		       uint64_t offset_blocks, uint64_t num_blocks,
1884 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1885 {
1886 	struct spdk_bdev *bdev = desc->bdev;
1887 	struct spdk_bdev_io *bdev_io;
1888 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1889 
1890 	if (!desc->write) {
1891 		return -EBADF;
1892 	}
1893 
1894 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1895 		return -EINVAL;
1896 	}
1897 
1898 	if (num_blocks == 0) {
1899 		SPDK_ERRLOG("Can't unmap 0 blocks\n");
1900 		return -EINVAL;
1901 	}
1902 
1903 	bdev_io = spdk_bdev_get_io(channel);
1904 	if (!bdev_io) {
1905 		SPDK_ERRLOG("bdev_io memory allocation failed during unmap\n");
1906 		return -ENOMEM;
1907 	}
1908 
1909 	bdev_io->ch = channel;
1910 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
1911 	bdev_io->u.bdev.iov.iov_base = NULL;
1912 	bdev_io->u.bdev.iov.iov_len = 0;
1913 	bdev_io->u.bdev.iovs = &bdev_io->u.bdev.iov;
1914 	bdev_io->u.bdev.iovcnt = 1;
1915 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1916 	bdev_io->u.bdev.num_blocks = num_blocks;
1917 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1918 
1919 	spdk_bdev_io_submit(bdev_io);
1920 	return 0;
1921 }
1922 
1923 int
1924 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1925 		uint64_t offset, uint64_t length,
1926 		spdk_bdev_io_completion_cb cb, void *cb_arg)
1927 {
1928 	uint64_t offset_blocks, num_blocks;
1929 
1930 	if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) {
1931 		return -EINVAL;
1932 	}
1933 
1934 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
1935 }
1936 
1937 int
1938 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
1939 		       uint64_t offset_blocks, uint64_t num_blocks,
1940 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
1941 {
1942 	struct spdk_bdev *bdev = desc->bdev;
1943 	struct spdk_bdev_io *bdev_io;
1944 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
1945 
1946 	if (!desc->write) {
1947 		return -EBADF;
1948 	}
1949 
1950 	if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
1951 		return -EINVAL;
1952 	}
1953 
1954 	bdev_io = spdk_bdev_get_io(channel);
1955 	if (!bdev_io) {
1956 		SPDK_ERRLOG("bdev_io memory allocation failed during flush\n");
1957 		return -ENOMEM;
1958 	}
1959 
1960 	bdev_io->ch = channel;
1961 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
1962 	bdev_io->u.bdev.iovs = NULL;
1963 	bdev_io->u.bdev.iovcnt = 0;
1964 	bdev_io->u.bdev.offset_blocks = offset_blocks;
1965 	bdev_io->u.bdev.num_blocks = num_blocks;
1966 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
1967 
1968 	spdk_bdev_io_submit(bdev_io);
1969 	return 0;
1970 }
1971 
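/*
 * Completion callback for the channel-freeze iteration: every channel of the
 * bdev is now frozen, so pop the first queued reset on the originating
 * channel and send it down to the bdev module.
 */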
1972 static void
1973 _spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
1974 {
1975 	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
1976 	struct spdk_bdev_io *bdev_io;
1977 
1978 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
1979 	TAILQ_REMOVE(&ch->queued_resets, bdev_io, link);
1980 	spdk_bdev_io_submit_reset(bdev_io);
1981 }
1982 
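/*
 * Per-channel step of a reset: mark the channel as frozen, take over any I/O
 * queued on the bdev's QoS channel, and abort everything for this channel
 * that is waiting on shared resources (nomem queue) or on buffer allocation.
 */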
1983 static void
1984 _spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
1985 {
1986 	struct spdk_io_channel		*ch;
1987 	struct spdk_bdev_channel	*channel;
1988 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
1989 	struct spdk_bdev_shared_resource *shared_resource;
1990 	bdev_io_tailq_t			tmp_queued;
1991 
1992 	TAILQ_INIT(&tmp_queued);
1993 
1994 	ch = spdk_io_channel_iter_get_channel(i);
1995 	channel = spdk_io_channel_get_ctx(ch);
1996 	shared_resource = channel->shared_resource;
1997 	mgmt_channel = shared_resource->mgmt_ch;
1998 
1999 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
2000 
2001 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
2002 		/* The QoS object is always valid and readable while
2003 		 * the channel flag is set, so the lock here should not
2004 		 * be necessary. We're not in the fast path though, so
2005 		 * just take it anyway. */
2006 		pthread_mutex_lock(&channel->bdev->mutex);
2007 		if (channel->bdev->qos->ch == channel) {
2008 			TAILQ_SWAP(&channel->bdev->qos->queued, &tmp_queued, spdk_bdev_io, link);
2009 		}
2010 		pthread_mutex_unlock(&channel->bdev->mutex);
2011 	}
2012 
2013 	_spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel);
2014 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel);
2015 	_spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel);
2016 	_spdk_bdev_abort_queued_io(&tmp_queued, channel);
2017 
2018 	spdk_for_each_channel_continue(i, 0);
2019 }
2020 
2021 static void
2022 _spdk_bdev_start_reset(void *ctx)
2023 {
2024 	struct spdk_bdev_channel *ch = ctx;
2025 
2026 	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel,
2027 			      ch, _spdk_bdev_reset_dev);
2028 }
2029 
2030 static void
2031 _spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch)
2032 {
2033 	struct spdk_bdev *bdev = ch->bdev;
2034 
2035 	assert(!TAILQ_EMPTY(&ch->queued_resets));
2036 
2037 	pthread_mutex_lock(&bdev->mutex);
2038 	if (bdev->reset_in_progress == NULL) {
2039 		bdev->reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
2040 		/*
2041 		 * Take a channel reference for the target bdev for the life of this
2042 		 *  reset.  This guards against the channel getting destroyed while
2043 		 *  spdk_for_each_channel() calls related to this reset IO are in
2044 		 *  progress.  We will release the reference when this reset is
2045 		 *  completed.
2046 		 */
2047 		bdev->reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
2048 		_spdk_bdev_start_reset(ch);
2049 	}
2050 	pthread_mutex_unlock(&bdev->mutex);
2051 }
2052 
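/*
 * Submit a reset request on @ch.  Reset requests are queued per channel and
 * at most one reset per bdev is in flight at a time; later resets wait until
 * the in-progress one completes and the channels are unfrozen.
 */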
2053 int
2054 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2055 		spdk_bdev_io_completion_cb cb, void *cb_arg)
2056 {
2057 	struct spdk_bdev *bdev = desc->bdev;
2058 	struct spdk_bdev_io *bdev_io;
2059 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2060 
2061 	bdev_io = spdk_bdev_get_io(channel);
2062 	if (!bdev_io) {
2063 		SPDK_ERRLOG("bdev_io memory allocation failed during reset\n");
2064 		return -ENOMEM;
2065 	}
2066 
2067 	bdev_io->ch = channel;
2068 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
2069 	bdev_io->u.reset.ch_ref = NULL;
2070 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2071 
2072 	pthread_mutex_lock(&bdev->mutex);
2073 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, link);
2074 	pthread_mutex_unlock(&bdev->mutex);
2075 
2076 	_spdk_bdev_channel_start_reset(channel);
2077 
2078 	return 0;
2079 }
2080 
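/*
 * Copy the I/O statistics accumulated on a single channel.  Channels are
 * per-thread objects, so this is expected to be called from the thread that
 * owns @ch.
 */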
2081 void
2082 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
2083 		      struct spdk_bdev_io_stat *stat)
2084 {
2085 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2086 
2087 	*stat = channel->stat;
2088 }
2089 
2090 static void
2091 _spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status)
2092 {
2093 	void *io_device = spdk_io_channel_iter_get_io_device(i);
2094 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
2095 
2096 	bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat,
2097 			    bdev_iostat_ctx->cb_arg, 0);
2098 	free(bdev_iostat_ctx);
2099 }
2100 
2101 static void
2102 _spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i)
2103 {
2104 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
2105 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
2106 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2107 
2108 	bdev_iostat_ctx->stat->bytes_read += channel->stat.bytes_read;
2109 	bdev_iostat_ctx->stat->num_read_ops += channel->stat.num_read_ops;
2110 	bdev_iostat_ctx->stat->bytes_written += channel->stat.bytes_written;
2111 	bdev_iostat_ctx->stat->num_write_ops += channel->stat.num_write_ops;
2112 
2113 	spdk_for_each_channel_continue(i, 0);
2114 }
2115 
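/*
 * Aggregate I/O statistics for a bdev across all of its channels.  The sums
 * are gathered asynchronously with spdk_for_each_channel() and @cb is invoked
 * with the result, or with -ENOMEM if the iteration context cannot be
 * allocated.  @stat is caller-owned and must remain valid until @cb runs.
 * Illustrative callback shape (the name is a placeholder, not part of this
 * file):
 *
 *     static void stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
 *                           void *cb_arg, int rc) { ... }
 */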
2116 void
2117 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
2118 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
2119 {
2120 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
2121 
2122 	assert(bdev != NULL);
2123 	assert(stat != NULL);
2124 	assert(cb != NULL);
2125 
2126 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
2127 	if (bdev_iostat_ctx == NULL) {
2128 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
2129 		cb(bdev, stat, cb_arg, -ENOMEM);
2130 		return;
2131 	}
2132 
2133 	bdev_iostat_ctx->stat = stat;
2134 	bdev_iostat_ctx->cb = cb;
2135 	bdev_iostat_ctx->cb_arg = cb_arg;
2136 
2137 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
2138 			      _spdk_bdev_get_each_channel_stat,
2139 			      bdev_iostat_ctx,
2140 			      _spdk_bdev_get_device_stat_done);
2141 }
2142 
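/*
 * Pass an NVMe admin command through to the bdev module.  The descriptor
 * must have been opened for writing; the command structure is copied into
 * the bdev_io, so the caller may release @cmd once this returns.
 */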
2143 int
2144 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2145 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
2146 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
2147 {
2148 	struct spdk_bdev *bdev = desc->bdev;
2149 	struct spdk_bdev_io *bdev_io;
2150 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2151 
2152 	if (!desc->write) {
2153 		return -EBADF;
2154 	}
2155 
2156 	bdev_io = spdk_bdev_get_io(channel);
2157 	if (!bdev_io) {
2158 		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_admin_passthru\n");
2159 		return -ENOMEM;
2160 	}
2161 
2162 	bdev_io->ch = channel;
2163 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
2164 	bdev_io->u.nvme_passthru.cmd = *cmd;
2165 	bdev_io->u.nvme_passthru.buf = buf;
2166 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2167 	bdev_io->u.nvme_passthru.md_buf = NULL;
2168 	bdev_io->u.nvme_passthru.md_len = 0;
2169 
2170 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2171 
2172 	spdk_bdev_io_submit(bdev_io);
2173 	return 0;
2174 }
2175 
2176 int
2177 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2178 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
2179 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
2180 {
2181 	struct spdk_bdev *bdev = desc->bdev;
2182 	struct spdk_bdev_io *bdev_io;
2183 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2184 
2185 	if (!desc->write) {
2186 		/*
2187 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
2188 		 *  to easily determine if the command is a read or write, but for now just
2189 		 *  do not allow io_passthru with a read-only descriptor.
2190 		 */
2191 		return -EBADF;
2192 	}
2193 
2194 	bdev_io = spdk_bdev_get_io(channel);
2195 	if (!bdev_io) {
2196 		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru\n");
2197 		return -ENOMEM;
2198 	}
2199 
2200 	bdev_io->ch = channel;
2201 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
2202 	bdev_io->u.nvme_passthru.cmd = *cmd;
2203 	bdev_io->u.nvme_passthru.buf = buf;
2204 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2205 	bdev_io->u.nvme_passthru.md_buf = NULL;
2206 	bdev_io->u.nvme_passthru.md_len = 0;
2207 
2208 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2209 
2210 	spdk_bdev_io_submit(bdev_io);
2211 	return 0;
2212 }
2213 
2214 int
2215 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
2216 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
2217 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
2218 {
2219 	struct spdk_bdev *bdev = desc->bdev;
2220 	struct spdk_bdev_io *bdev_io;
2221 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
2222 
2223 	if (!desc->write) {
2224 		/*
2225 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
2226 		 *  to easily determine if the command is a read or write, but for now just
2227 		 *  do not allow io_passthru with a read-only descriptor.
2228 		 */
2229 		return -EBADF;
2230 	}
2231 
2232 	bdev_io = spdk_bdev_get_io(channel);
2233 	if (!bdev_io) {
2234 		SPDK_ERRLOG("bdev_io memory allocation failed during nvme_io_passthru_md\n");
2235 		return -ENOMEM;
2236 	}
2237 
2238 	bdev_io->ch = channel;
2239 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
2240 	bdev_io->u.nvme_passthru.cmd = *cmd;
2241 	bdev_io->u.nvme_passthru.buf = buf;
2242 	bdev_io->u.nvme_passthru.nbytes = nbytes;
2243 	bdev_io->u.nvme_passthru.md_buf = md_buf;
2244 	bdev_io->u.nvme_passthru.md_len = md_len;
2245 
2246 	spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb);
2247 
2248 	spdk_bdev_io_submit(bdev_io);
2249 	return 0;
2250 }
2251 
2252 int
2253 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2254 {
2255 	if (!bdev_io) {
2256 		SPDK_ERRLOG("bdev_io is NULL\n");
2257 		return -1;
2258 	}
2259 
2260 	if (bdev_io->status == SPDK_BDEV_IO_STATUS_PENDING) {
2261 		SPDK_ERRLOG("bdev_io is in pending state\n");
2262 		assert(false);
2263 		return -1;
2264 	}
2265 
2266 	spdk_bdev_put_io(bdev_io);
2267 
2268 	return 0;
2269 }
2270 
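/*
 * Resubmit I/O that previously completed with NOMEM.  Retrying is deferred
 * until the number of outstanding I/O on the shared resource has dropped to
 * nomem_threshold, and stops early if a resubmitted I/O hits NOMEM again.
 */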
2271 static void
2272 _spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
2273 {
2274 	struct spdk_bdev *bdev = bdev_ch->bdev;
2275 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2276 	struct spdk_bdev_io *bdev_io;
2277 
2278 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
2279 		/*
2280 		 * Allow some more I/O to complete before retrying the nomem_io queue.
2281 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
2282 		 *  the context of a completion, because the resources for the I/O are
2283 		 *  not released until control returns to the bdev poller.  Also, we
2284 		 *  may require several small I/O to complete before a larger I/O
2285 		 *  (that requires splitting) can be submitted.
2286 		 */
2287 		return;
2288 	}
2289 
2290 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
2291 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
2292 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, link);
2293 		bdev_io->ch->io_outstanding++;
2294 		shared_resource->io_outstanding++;
2295 		bdev_io->status = SPDK_BDEV_IO_STATUS_PENDING;
2296 		bdev->fn_table->submit_request(bdev_io->ch->channel, bdev_io);
2297 		if (bdev_io->status == SPDK_BDEV_IO_STATUS_NOMEM) {
2298 			break;
2299 		}
2300 	}
2301 }
2302 
2303 static inline void
2304 _spdk_bdev_io_complete(void *ctx)
2305 {
2306 	struct spdk_bdev_io *bdev_io = ctx;
2307 
2308 	if (spdk_unlikely(bdev_io->in_submit_request || bdev_io->io_submit_ch)) {
2309 		/*
2310 		 * Send the completion to the thread that originally submitted the I/O,
2311 		 * which may not be the current thread in the case of QoS.
2312 		 */
2313 		if (bdev_io->io_submit_ch) {
2314 			bdev_io->ch = bdev_io->io_submit_ch;
2315 			bdev_io->io_submit_ch = NULL;
2316 		}
2317 
2318 		/*
2319 		 * Defer completion to avoid potential infinite recursion if the
2320 		 * user's completion callback issues a new I/O.
2321 		 */
2322 		spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->ch->channel),
2323 				     _spdk_bdev_io_complete, bdev_io);
2324 		return;
2325 	}
2326 
2327 	if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
2328 		switch (bdev_io->type) {
2329 		case SPDK_BDEV_IO_TYPE_READ:
2330 			bdev_io->ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2331 			bdev_io->ch->stat.num_read_ops++;
2332 			bdev_io->ch->stat.read_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
2333 			break;
2334 		case SPDK_BDEV_IO_TYPE_WRITE:
2335 			bdev_io->ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2336 			bdev_io->ch->stat.num_write_ops++;
2337 			bdev_io->ch->stat.write_latency_ticks += (spdk_get_ticks() - bdev_io->submit_tsc);
2338 			break;
2339 		default:
2340 			break;
2341 		}
2342 	}
2343 
2344 #ifdef SPDK_CONFIG_VTUNE
2345 	uint64_t now_tsc = spdk_get_ticks();
2346 	if (now_tsc > (bdev_io->ch->start_tsc + bdev_io->ch->interval_tsc)) {
2347 		uint64_t data[5];
2348 
2349 		data[0] = bdev_io->ch->stat.num_read_ops - bdev_io->ch->prev_stat.num_read_ops;
2350 		data[1] = bdev_io->ch->stat.bytes_read - bdev_io->ch->prev_stat.bytes_read;
2351 		data[2] = bdev_io->ch->stat.num_write_ops - bdev_io->ch->prev_stat.num_write_ops;
2352 		data[3] = bdev_io->ch->stat.bytes_written - bdev_io->ch->prev_stat.bytes_written;
2353 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
2354 			  bdev_io->bdev->fn_table->get_spin_time(bdev_io->ch->channel) : 0;
2355 
2356 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->ch->handle,
2357 				   __itt_metadata_u64, 5, data);
2358 
2359 		bdev_io->ch->prev_stat = bdev_io->ch->stat;
2360 		bdev_io->ch->start_tsc = now_tsc;
2361 	}
2362 #endif
2363 
2364 	assert(bdev_io->cb != NULL);
2365 	assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->ch->channel));
2366 
2367 	bdev_io->cb(bdev_io, bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS,
2368 		    bdev_io->caller_ctx);
2369 }
2370 
2371 static void
2372 _spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
2373 {
2374 	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
2375 
2376 	if (bdev_io->u.reset.ch_ref != NULL) {
2377 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
2378 		bdev_io->u.reset.ch_ref = NULL;
2379 	}
2380 
2381 	_spdk_bdev_io_complete(bdev_io);
2382 }
2383 
2384 static void
2385 _spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
2386 {
2387 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2388 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
2389 
2390 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
2391 	if (!TAILQ_EMPTY(&ch->queued_resets)) {
2392 		_spdk_bdev_channel_start_reset(ch);
2393 	}
2394 
2395 	spdk_for_each_channel_continue(i, 0);
2396 }
2397 
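/*
 * Complete @bdev_io with @status.  A reset completion unfreezes all channels
 * of the bdev before the user callback runs.  For other I/O types, a NOMEM
 * status re-queues the I/O on the shared nomem_io list so it can be retried
 * once enough outstanding I/O have completed.
 */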
2398 void
2399 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
2400 {
2401 	struct spdk_bdev *bdev = bdev_io->bdev;
2402 	struct spdk_bdev_channel *bdev_ch = bdev_io->ch;
2403 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2404 
2405 	bdev_io->status = status;
2406 
2407 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
2408 		bool unlock_channels = false;
2409 
2410 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
2411 			SPDK_ERRLOG("NOMEM returned for reset\n");
2412 		}
2413 		pthread_mutex_lock(&bdev->mutex);
2414 		if (bdev_io == bdev->reset_in_progress) {
2415 			bdev->reset_in_progress = NULL;
2416 			unlock_channels = true;
2417 		}
2418 		pthread_mutex_unlock(&bdev->mutex);
2419 
2420 		if (unlock_channels) {
2421 			spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel,
2422 					      bdev_io, _spdk_bdev_reset_complete);
2423 			return;
2424 		}
2425 	} else {
2426 		assert(bdev_ch->io_outstanding > 0);
2427 		assert(shared_resource->io_outstanding > 0);
2428 		bdev_ch->io_outstanding--;
2429 		shared_resource->io_outstanding--;
2430 
2431 		if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) {
2432 			TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, link);
2433 			/*
2434 			 * Wait for some of the outstanding I/O to complete before we
2435 			 *  retry any of the nomem_io.  Normally we will wait for
2436 			 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
2437 			 *  depth channels we will instead wait for half to complete.
2438 			 */
2439 			shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
2440 							   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
2441 			return;
2442 		}
2443 
2444 		if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
2445 			_spdk_bdev_ch_retry_io(bdev_ch);
2446 		}
2447 	}
2448 
2449 	_spdk_bdev_io_complete(bdev_io);
2450 }
2451 
2452 void
2453 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
2454 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
2455 {
2456 	if (sc == SPDK_SCSI_STATUS_GOOD) {
2457 		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
2458 	} else {
2459 		bdev_io->status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
2460 		bdev_io->error.scsi.sc = sc;
2461 		bdev_io->error.scsi.sk = sk;
2462 		bdev_io->error.scsi.asc = asc;
2463 		bdev_io->error.scsi.ascq = ascq;
2464 	}
2465 
2466 	spdk_bdev_io_complete(bdev_io, bdev_io->status);
2467 }
2468 
2469 void
2470 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
2471 			     int *sc, int *sk, int *asc, int *ascq)
2472 {
2473 	assert(sc != NULL);
2474 	assert(sk != NULL);
2475 	assert(asc != NULL);
2476 	assert(ascq != NULL);
2477 
2478 	switch (bdev_io->status) {
2479 	case SPDK_BDEV_IO_STATUS_SUCCESS:
2480 		*sc = SPDK_SCSI_STATUS_GOOD;
2481 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
2482 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
2483 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
2484 		break;
2485 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
2486 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
2487 		break;
2488 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
2489 		*sc = bdev_io->error.scsi.sc;
2490 		*sk = bdev_io->error.scsi.sk;
2491 		*asc = bdev_io->error.scsi.asc;
2492 		*ascq = bdev_io->error.scsi.ascq;
2493 		break;
2494 	default:
2495 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
2496 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
2497 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
2498 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
2499 		break;
2500 	}
2501 }
2502 
2503 void
2504 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc)
2505 {
2506 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
2507 		bdev_io->status = SPDK_BDEV_IO_STATUS_SUCCESS;
2508 	} else {
2509 		bdev_io->error.nvme.sct = sct;
2510 		bdev_io->error.nvme.sc = sc;
2511 		bdev_io->status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
2512 	}
2513 
2514 	spdk_bdev_io_complete(bdev_io, bdev_io->status);
2515 }
2516 
2517 void
2518 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc)
2519 {
2520 	assert(sct != NULL);
2521 	assert(sc != NULL);
2522 
2523 	if (bdev_io->status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
2524 		*sct = bdev_io->error.nvme.sct;
2525 		*sc = bdev_io->error.nvme.sc;
2526 	} else if (bdev_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
2527 		*sct = SPDK_NVME_SCT_GENERIC;
2528 		*sc = SPDK_NVME_SC_SUCCESS;
2529 	} else {
2530 		*sct = SPDK_NVME_SCT_GENERIC;
2531 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
2532 	}
2533 }
2534 
2535 struct spdk_thread *
2536 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
2537 {
2538 	return spdk_io_channel_get_thread(bdev_io->ch->channel);
2539 }
2540 
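/*
 * Apply one QoS limit parsed from the configuration file.  The value must be
 * a multiple of the per-type minimum (SPDK_BDEV_QOS_MIN_IOS_PER_SEC or
 * SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC); bandwidth limits are given in MB/s
 * and converted to bytes per second here.
 */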
2541 static void
2542 _spdk_bdev_qos_config_type(struct spdk_bdev *bdev, uint64_t qos_set,
2543 			   enum spdk_bdev_qos_type qos_type)
2544 {
2545 	uint64_t	min_qos_set = 0;
2546 
2547 	switch (qos_type) {
2548 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2549 		min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
2550 		break;
2551 	case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT:
2552 		min_qos_set = SPDK_BDEV_QOS_MIN_BW_IN_MB_PER_SEC;
2553 		break;
2554 	default:
2555 		SPDK_ERRLOG("Unsupported QoS type.\n");
2556 		return;
2557 	}
2558 
2559 	if (qos_set % min_qos_set) {
2560 		SPDK_ERRLOG("Assigned QoS %" PRIu64 " on bdev %s is not a multiple of %" PRIu64 "\n",
2561 			    qos_set, bdev->name, min_qos_set);
2562 		SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name);
2563 		return;
2564 	}
2565 
2566 	if (!bdev->qos) {
2567 		bdev->qos = calloc(1, sizeof(*bdev->qos));
2568 		if (!bdev->qos) {
2569 			SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
2570 			return;
2571 		}
2572 	}
2573 
2574 	switch (qos_type) {
2575 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2576 		bdev->qos->iops_rate_limit = qos_set;
2577 		break;
2578 	case SPDK_BDEV_QOS_RW_BYTEPS_RATE_LIMIT:
2579 		bdev->qos->byte_rate_limit = qos_set * 1024 * 1024;
2580 		break;
2581 	default:
2582 		break;
2583 	}
2584 
2585 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%" PRIu64 "\n",
2586 		      bdev->name, qos_type, qos_set);
2587 
2588 	return;
2589 }
2590 
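/*
 * Parse the optional [QoS] section of the configuration file for this bdev.
 * Each rate-limit key lists "<bdev name> <limit>" pairs; for illustration
 * only (the bdev name is a placeholder), a section might look like:
 *
 *     [QoS]
 *       Limit_IOPS Malloc0 20000
 *       Limit_BWPS Malloc0 100
 *
 * Matching entries are validated and applied by _spdk_bdev_qos_config_type().
 */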
2591 static void
2592 _spdk_bdev_qos_config(struct spdk_bdev *bdev)
2593 {
2594 	struct spdk_conf_section	*sp = NULL;
2595 	const char			*val = NULL;
2596 	uint64_t			qos_set = 0;
2597 	int				i = 0, j = 0;
2598 
2599 	sp = spdk_conf_find_section(NULL, "QoS");
2600 	if (!sp) {
2601 		return;
2602 	}
2603 
2604 	while (j < SPDK_BDEV_QOS_NUM_TYPES) {
2605 		i = 0;
2606 		while (true) {
2607 			val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 0);
2608 			if (!val) {
2609 				break;
2610 			}
2611 
2612 			if (strcmp(bdev->name, val) != 0) {
2613 				i++;
2614 				continue;
2615 			}
2616 
2617 			val = spdk_conf_section_get_nmval(sp, qos_type_str[j], i, 1);
2618 			if (val) {
2619 				qos_set = strtoull(val, NULL, 10);
2620 				_spdk_bdev_qos_config_type(bdev, qos_set, j);
2621 			}
2622 
2623 			break;
2624 		}
2625 
2626 		j++;
2627 	}
2628 
2629 	return;
2630 }
2631 
2632 static int
2633 spdk_bdev_init(struct spdk_bdev *bdev)
2634 {
2635 	assert(bdev->module != NULL);
2636 
2637 	if (!bdev->name) {
2638 		SPDK_ERRLOG("Bdev name is NULL\n");
2639 		return -EINVAL;
2640 	}
2641 
2642 	if (spdk_bdev_get_by_name(bdev->name)) {
2643 		SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
2644 		return -EEXIST;
2645 	}
2646 
2647 	bdev->status = SPDK_BDEV_STATUS_READY;
2648 
2649 	TAILQ_INIT(&bdev->open_descs);
2650 
2651 	TAILQ_INIT(&bdev->aliases);
2652 
2653 	bdev->reset_in_progress = NULL;
2654 
2655 	_spdk_bdev_qos_config(bdev);
2656 
2657 	spdk_io_device_register(__bdev_to_io_dev(bdev),
2658 				spdk_bdev_channel_create, spdk_bdev_channel_destroy,
2659 				sizeof(struct spdk_bdev_channel));
2660 
2661 	pthread_mutex_init(&bdev->mutex, NULL);
2662 	return 0;
2663 }
2664 
2665 static void
2666 spdk_bdev_destroy_cb(void *io_device)
2667 {
2668 	int			rc;
2669 	struct spdk_bdev	*bdev;
2670 	spdk_bdev_unregister_cb	cb_fn;
2671 	void			*cb_arg;
2672 
2673 	bdev = __bdev_from_io_dev(io_device);
2674 	cb_fn = bdev->unregister_cb;
2675 	cb_arg = bdev->unregister_ctx;
2676 
2677 	rc = bdev->fn_table->destruct(bdev->ctxt);
2678 	if (rc < 0) {
2679 		SPDK_ERRLOG("destruct failed\n");
2680 	}
2681 	if (rc <= 0 && cb_fn != NULL) {
2682 		cb_fn(cb_arg, rc);
2683 	}
2684 }
2685 
2686 
2688 spdk_bdev_fini(struct spdk_bdev *bdev)
2689 {
2690 	pthread_mutex_destroy(&bdev->mutex);
2691 
2692 	free(bdev->qos);
2693 
2694 	spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb);
2695 }
2696 
2697 static void
2698 spdk_bdev_start(struct spdk_bdev *bdev)
2699 {
2700 	struct spdk_bdev_module *module;
2701 
2702 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
2703 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, link);
2704 
2705 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, tailq) {
2706 		if (module->examine) {
2707 			module->action_in_progress++;
2708 			module->examine(bdev);
2709 		}
2710 	}
2711 }
2712 
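/*
 * Register a bdev with the bdev layer.  On success the bdev is added to the
 * global list and every module's examine() callback is given a chance to
 * claim it or build virtual bdevs on top of it.
 */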
2713 int
2714 spdk_bdev_register(struct spdk_bdev *bdev)
2715 {
2716 	int rc = spdk_bdev_init(bdev);
2717 
2718 	if (rc == 0) {
2719 		spdk_bdev_start(bdev);
2720 	}
2721 
2722 	return rc;
2723 }
2724 
2725 static void
2726 spdk_vbdev_remove_base_bdevs(struct spdk_bdev *vbdev)
2727 {
2728 	struct spdk_bdev **bdevs;
2729 	struct spdk_bdev *base;
2730 	size_t i, j, k;
2731 	bool found;
2732 
2733 	/* Iterate over base bdevs to remove vbdev from them. */
2734 	for (i = 0; i < vbdev->base_bdevs_cnt; i++) {
2735 		found = false;
2736 		base = vbdev->base_bdevs[i];
2737 
2738 		for (j = 0; j < base->vbdevs_cnt; j++) {
2739 			if (base->vbdevs[j] != vbdev) {
2740 				continue;
2741 			}
2742 
2743 			for (k = j; k + 1 < base->vbdevs_cnt; k++) {
2744 				base->vbdevs[k] = base->vbdevs[k + 1];
2745 			}
2746 
2747 			base->vbdevs_cnt--;
2748 			if (base->vbdevs_cnt > 0) {
2749 				bdevs = realloc(base->vbdevs, base->vbdevs_cnt * sizeof(bdevs[0]));
2750 				/* It would be odd if shrinking a memory block failed. */
2751 				assert(bdevs);
2752 				base->vbdevs = bdevs;
2753 			} else {
2754 				free(base->vbdevs);
2755 				base->vbdevs = NULL;
2756 			}
2757 
2758 			found = true;
2759 			break;
2760 		}
2761 
2762 		if (!found) {
2763 			SPDK_WARNLOG("Bdev '%s' is not a base bdev of '%s'.\n", base->name, vbdev->name);
2764 		}
2765 	}
2766 
2767 	free(vbdev->base_bdevs);
2768 	vbdev->base_bdevs = NULL;
2769 	vbdev->base_bdevs_cnt = 0;
2770 }
2771 
2772 static int
2773 spdk_vbdev_set_base_bdevs(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, size_t cnt)
2774 {
2775 	struct spdk_bdev **vbdevs;
2776 	struct spdk_bdev *base;
2777 	size_t i;
2778 
2779 	/* Adding base bdevs isn't supported (yet?). */
2780 	assert(vbdev->base_bdevs_cnt == 0);
2781 
2782 	vbdev->base_bdevs = malloc(cnt * sizeof(vbdev->base_bdevs[0]));
2783 	if (!vbdev->base_bdevs) {
2784 		SPDK_ERRLOG("%s - malloc() failed\n", vbdev->name);
2785 		return -ENOMEM;
2786 	}
2787 
2788 	memcpy(vbdev->base_bdevs, base_bdevs, cnt * sizeof(vbdev->base_bdevs[0]));
2789 	vbdev->base_bdevs_cnt = cnt;
2790 
2791 	/* Iterate over base bdevs to add this vbdev to them. */
2792 	for (i = 0; i < cnt; i++) {
2793 		base = vbdev->base_bdevs[i];
2794 
2795 		assert(base != NULL);
2796 		assert(base->claim_module != NULL);
2797 
2798 		vbdevs = realloc(base->vbdevs, (base->vbdevs_cnt + 1) * sizeof(vbdevs[0]));
2799 		if (!vbdevs) {
2800 			SPDK_ERRLOG("%s - realloc() failed\n", base->name);
2801 			spdk_vbdev_remove_base_bdevs(vbdev);
2802 			return -ENOMEM;
2803 		}
2804 
2805 		vbdevs[base->vbdevs_cnt] = vbdev;
2806 		base->vbdevs = vbdevs;
2807 		base->vbdevs_cnt++;
2808 	}
2809 
2810 	return 0;
2811 }
2812 
2813 int
2814 spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
2815 {
2816 	int rc;
2817 
2818 	rc = spdk_bdev_init(vbdev);
2819 	if (rc) {
2820 		return rc;
2821 	}
2822 
2823 	if (base_bdev_count == 0) {
2824 		spdk_bdev_start(vbdev);
2825 		return 0;
2826 	}
2827 
2828 	rc = spdk_vbdev_set_base_bdevs(vbdev, base_bdevs, base_bdev_count);
2829 	if (rc) {
2830 		spdk_bdev_fini(vbdev);
2831 		return rc;
2832 	}
2833 
2834 	spdk_bdev_start(vbdev);
2835 	return 0;
2837 }
2838 
2839 void
2840 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
2841 {
2842 	if (bdev->unregister_cb != NULL) {
2843 		bdev->unregister_cb(bdev->unregister_ctx, bdeverrno);
2844 	}
2845 }
2846 
2847 static void
2848 _remove_notify(void *arg)
2849 {
2850 	struct spdk_bdev_desc *desc = arg;
2851 
2852 	desc->remove_cb(desc->remove_ctx);
2853 }
2854 
2855 void
2856 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
2857 {
2858 	struct spdk_bdev_desc	*desc, *tmp;
2859 	bool			do_destruct = true;
2860 	struct spdk_thread	*thread;
2861 
2862 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);
2863 
2864 	thread = spdk_get_thread();
2865 	if (!thread) {
2866 		/* The user called this from a non-SPDK thread. */
2867 		cb_fn(cb_arg, -ENOTSUP);
2868 		return;
2869 	}
2870 
2871 	pthread_mutex_lock(&bdev->mutex);
2872 
2873 	spdk_vbdev_remove_base_bdevs(bdev);
2874 
2875 	bdev->status = SPDK_BDEV_STATUS_REMOVING;
2876 	bdev->unregister_cb = cb_fn;
2877 	bdev->unregister_ctx = cb_arg;
2878 
2879 	TAILQ_FOREACH_SAFE(desc, &bdev->open_descs, link, tmp) {
2880 		if (desc->remove_cb) {
2881 			do_destruct = false;
2882 			/*
2883 			 * Defer invocation of the remove_cb to a separate message that will
2884 			 *  run later on this thread.  This ensures this context unwinds and
2885 			 *  we don't recursively unregister this bdev again if the remove_cb
2886 			 *  immediately closes its descriptor.
2887 			 */
2888 			spdk_thread_send_msg(thread, _remove_notify, desc);
2889 		}
2890 	}
2891 
2892 	if (!do_destruct) {
2893 		pthread_mutex_unlock(&bdev->mutex);
2894 		return;
2895 	}
2896 
2897 	TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, link);
2898 	pthread_mutex_unlock(&bdev->mutex);
2899 
2900 	spdk_bdev_fini(bdev);
2901 }
2902 
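/*
 * Open a descriptor on a bdev.  A write descriptor is refused while a module
 * holds a claim on the bdev.  Illustrative usage only (the bdev name and the
 * omitted error handling are placeholders, not part of this file):
 *
 *     struct spdk_bdev_desc *desc = NULL;
 *     struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *
 *     if (bdev && spdk_bdev_open(bdev, true, NULL, NULL, &desc) == 0) {
 *             ... get an I/O channel and submit I/O ...
 *             spdk_bdev_close(desc);
 *     }
 */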
2903 int
2904 spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
2905 	       void *remove_ctx, struct spdk_bdev_desc **_desc)
2906 {
2907 	struct spdk_bdev_desc *desc;
2908 
2909 	desc = calloc(1, sizeof(*desc));
2910 	if (desc == NULL) {
2911 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
2912 		return -ENOMEM;
2913 	}
2914 
2915 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
2916 		      spdk_get_thread());
2917 
2918 	pthread_mutex_lock(&bdev->mutex);
2919 
2920 	if (write && bdev->claim_module) {
2921 		SPDK_ERRLOG("Could not open %s - already claimed\n", bdev->name);
2922 		free(desc);
2923 		pthread_mutex_unlock(&bdev->mutex);
2924 		return -EPERM;
2925 	}
2926 
2927 	TAILQ_INSERT_TAIL(&bdev->open_descs, desc, link);
2928 
2929 	desc->bdev = bdev;
2930 	desc->remove_cb = remove_cb;
2931 	desc->remove_ctx = remove_ctx;
2932 	desc->write = write;
2933 	*_desc = desc;
2934 
2935 	pthread_mutex_unlock(&bdev->mutex);
2936 
2937 	return 0;
2938 }
2939 
2940 void
2941 spdk_bdev_close(struct spdk_bdev_desc *desc)
2942 {
2943 	struct spdk_bdev *bdev = desc->bdev;
2944 	bool do_unregister = false;
2945 
2946 	SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
2947 		      spdk_get_thread());
2948 
2949 	pthread_mutex_lock(&bdev->mutex);
2950 
2951 	TAILQ_REMOVE(&bdev->open_descs, desc, link);
2952 	free(desc);
2953 
2954 	/* If no more descriptors, kill QoS channel */
2955 	if (bdev->qos && TAILQ_EMPTY(&bdev->open_descs)) {
2956 		SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
2957 			      bdev->name, spdk_get_thread());
2958 
2959 		if (spdk_bdev_qos_destroy(bdev)) {
2960 			/* There isn't anything we can do to recover here. Just let the
2961 			 * old QoS poller keep running. The QoS handling won't change
2962 			 * cores when the user allocates a new channel, but it won't break. */
2963 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
2964 		}
2965 	}
2966 
2967 	if (bdev->status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->open_descs)) {
2968 		do_unregister = true;
2969 	}
2970 	pthread_mutex_unlock(&bdev->mutex);
2971 
2972 	if (do_unregister == true) {
2973 		spdk_bdev_unregister(bdev, bdev->unregister_cb, bdev->unregister_ctx);
2974 	}
2975 }
2976 
2977 int
2978 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
2979 			    struct spdk_bdev_module *module)
2980 {
2981 	if (bdev->claim_module != NULL) {
2982 		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
2983 			    bdev->claim_module->name);
2984 		return -EPERM;
2985 	}
2986 
2987 	if (desc && !desc->write) {
2988 		desc->write = true;
2989 	}
2990 
2991 	bdev->claim_module = module;
2992 	return 0;
2993 }
2994 
2995 void
2996 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
2997 {
2998 	assert(bdev->claim_module != NULL);
2999 	bdev->claim_module = NULL;
3000 }
3001 
3002 struct spdk_bdev *
3003 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
3004 {
3005 	return desc->bdev;
3006 }
3007 
3008 void
3009 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
3010 {
3011 	struct iovec *iovs;
3012 	int iovcnt;
3013 
3014 	if (bdev_io == NULL) {
3015 		return;
3016 	}
3017 
3018 	switch (bdev_io->type) {
3019 	case SPDK_BDEV_IO_TYPE_READ:
3020 		iovs = bdev_io->u.bdev.iovs;
3021 		iovcnt = bdev_io->u.bdev.iovcnt;
3022 		break;
3023 	case SPDK_BDEV_IO_TYPE_WRITE:
3024 		iovs = bdev_io->u.bdev.iovs;
3025 		iovcnt = bdev_io->u.bdev.iovcnt;
3026 		break;
3027 	default:
3028 		iovs = NULL;
3029 		iovcnt = 0;
3030 		break;
3031 	}
3032 
3033 	if (iovp) {
3034 		*iovp = iovs;
3035 	}
3036 	if (iovcntp) {
3037 		*iovcntp = iovcnt;
3038 	}
3039 }
3040 
3041 void
3042 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
3043 {
3045 	if (spdk_bdev_module_list_find(bdev_module->name)) {
3046 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
3047 		assert(false);
3048 	}
3049 
3050 	if (bdev_module->async_init) {
3051 		bdev_module->action_in_progress = 1;
3052 	}
3053 
3054 	/*
3055 	 * Modules with examine callbacks must be initialized first, so they are
3056 	 *  ready to handle examine callbacks from later modules that will
3057 	 *  register physical bdevs.
3058 	 */
3059 	if (bdev_module->examine != NULL) {
3060 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
3061 	} else {
3062 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, tailq);
3063 	}
3064 }
3065 
3066 struct spdk_bdev_module *
3067 spdk_bdev_module_list_find(const char *name)
3068 {
3069 	struct spdk_bdev_module *bdev_module;
3070 
3071 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, tailq) {
3072 		if (strcmp(name, bdev_module->name) == 0) {
3073 			break;
3074 		}
3075 	}
3076 
3077 	return bdev_module;
3078 }
3079 
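/*
 * Completion callback used while emulating write_zeroes with regular writes:
 * advance the offset, shrink the remaining block count and reissue the same
 * bdev_io for the next ZERO_BUFFER_SIZE-sized chunk.  The user's callback,
 * saved in stored_user_cb, is restored for the final chunk or on error.
 */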
3080 static void
3081 spdk_bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3082 {
3083 	uint64_t len;
3084 
3085 	if (!success) {
3086 		bdev_io->cb = bdev_io->u.bdev.stored_user_cb;
3087 		_spdk_bdev_io_complete(bdev_io);
3088 		return;
3089 	}
3090 
3091 	/* no need to perform the error checking from write_zeroes_blocks because this request already passed those checks. */
3092 	len = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * bdev_io->u.bdev.split_remaining_num_blocks,
3093 		       ZERO_BUFFER_SIZE);
3094 
3095 	bdev_io->u.bdev.offset_blocks = bdev_io->u.bdev.split_current_offset_blocks;
3096 	bdev_io->u.bdev.iov.iov_len = len;
3097 	bdev_io->u.bdev.num_blocks = len / spdk_bdev_get_block_size(bdev_io->bdev);
3098 	bdev_io->u.bdev.split_remaining_num_blocks -= bdev_io->u.bdev.num_blocks;
3099 	bdev_io->u.bdev.split_current_offset_blocks += bdev_io->u.bdev.num_blocks;
3100 
3101 	/* if this round completes the i/o, change the callback to be the original user callback */
3102 	if (bdev_io->u.bdev.split_remaining_num_blocks == 0) {
3103 		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, bdev_io->u.bdev.stored_user_cb);
3104 	} else {
3105 		spdk_bdev_io_init(bdev_io, bdev_io->bdev, cb_arg, spdk_bdev_write_zeroes_split);
3106 	}
3107 	spdk_bdev_io_submit(bdev_io);
3108 }
3109 
3110 struct set_qos_limit_ctx {
3111 	void (*cb_fn)(void *cb_arg, int status);
3112 	void *cb_arg;
3113 	struct spdk_bdev *bdev;
3114 };
3115 
3116 static void
3117 _spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
3118 {
3119 	pthread_mutex_lock(&ctx->bdev->mutex);
3120 	ctx->bdev->qos_mod_in_progress = false;
3121 	pthread_mutex_unlock(&ctx->bdev->mutex);
3122 
3123 	ctx->cb_fn(ctx->cb_arg, status);
3124 	free(ctx);
3125 }
3126 
3127 static void
3128 _spdk_bdev_disable_qos_done(void *cb_arg)
3129 {
3130 	struct set_qos_limit_ctx *ctx = cb_arg;
3131 	struct spdk_bdev *bdev = ctx->bdev;
3132 	struct spdk_bdev_qos *qos;
3133 
3134 	pthread_mutex_lock(&bdev->mutex);
3135 	qos = bdev->qos;
3136 	bdev->qos = NULL;
3137 	pthread_mutex_unlock(&bdev->mutex);
3138 
3139 	_spdk_bdev_abort_queued_io(&qos->queued, qos->ch);
3140 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
3141 	spdk_poller_unregister(&qos->poller);
3142 
3143 	free(qos);
3144 
3145 	_spdk_bdev_set_qos_limit_done(ctx, 0);
3146 }
3147 
3148 static void
3149 _spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
3150 {
3151 	void *io_device = spdk_io_channel_iter_get_io_device(i);
3152 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
3153 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
3154 	struct spdk_thread *thread;
3155 
3156 	pthread_mutex_lock(&bdev->mutex);
3157 	thread = bdev->qos->thread;
3158 	pthread_mutex_unlock(&bdev->mutex);
3159 
3160 	spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx);
3161 }
3162 
3163 static void
3164 _spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
3165 {
3166 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
3167 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
3168 
3169 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
3170 
3171 	spdk_for_each_channel_continue(i, 0);
3172 }
3173 
3174 static void
3175 _spdk_bdev_update_qos_limit_iops_msg(void *cb_arg)
3176 {
3177 	struct set_qos_limit_ctx *ctx = cb_arg;
3178 	struct spdk_bdev *bdev = ctx->bdev;
3179 
3180 	pthread_mutex_lock(&bdev->mutex);
3181 	spdk_bdev_qos_update_max_quota_per_timeslice(bdev->qos);
3182 	pthread_mutex_unlock(&bdev->mutex);
3183 
3184 	_spdk_bdev_set_qos_limit_done(ctx, 0);
3185 }
3186 
3187 static void
3188 _spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
3189 {
3190 	void *io_device = spdk_io_channel_iter_get_io_device(i);
3191 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
3192 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
3193 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
3194 	int rc;
3195 
3196 	pthread_mutex_lock(&bdev->mutex);
3197 	rc = _spdk_bdev_enable_qos(bdev, bdev_ch);
3198 	pthread_mutex_unlock(&bdev->mutex);
3199 	spdk_for_each_channel_continue(i, rc);
3200 }
3201 
3202 static void
3203 _spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
3204 {
3205 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
3206 
3207 	_spdk_bdev_set_qos_limit_done(ctx, status);
3208 }
3209 
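/*
 * Set, update or disable (ios_per_sec == 0) the IOPS rate limit for a bdev.
 * The limit must be a multiple of SPDK_BDEV_QOS_MIN_IOS_PER_SEC.  The change
 * is propagated to the QoS thread and every channel asynchronously; @cb_fn is
 * called once it has taken effect, or with a negative errno on failure.
 */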
3210 void
3211 spdk_bdev_set_qos_limit_iops(struct spdk_bdev *bdev, uint64_t ios_per_sec,
3212 			     void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
3213 {
3214 	struct set_qos_limit_ctx *ctx;
3215 
3216 	if (ios_per_sec > 0 && ios_per_sec % SPDK_BDEV_QOS_MIN_IOS_PER_SEC) {
3217 		SPDK_ERRLOG("Requested ios_per_sec limit %" PRIu64 " is not a multiple of %u\n",
3218 			    ios_per_sec, SPDK_BDEV_QOS_MIN_IOS_PER_SEC);
3219 		cb_fn(cb_arg, -EINVAL);
3220 		return;
3221 	}
3222 
3223 	ctx = calloc(1, sizeof(*ctx));
3224 	if (ctx == NULL) {
3225 		cb_fn(cb_arg, -ENOMEM);
3226 		return;
3227 	}
3228 
3229 	ctx->cb_fn = cb_fn;
3230 	ctx->cb_arg = cb_arg;
3231 	ctx->bdev = bdev;
3232 
3233 	pthread_mutex_lock(&bdev->mutex);
3234 	if (bdev->qos_mod_in_progress) {
3235 		pthread_mutex_unlock(&bdev->mutex);
3236 		free(ctx);
3237 		cb_fn(cb_arg, -EAGAIN);
3238 		return;
3239 	}
3240 	bdev->qos_mod_in_progress = true;
3241 
3242 	if (ios_per_sec > 0) {
3243 		if (bdev->qos == NULL) {
3244 			/* Enabling */
3245 			bdev->qos = calloc(1, sizeof(*bdev->qos));
3246 			if (!bdev->qos) {
3247 				pthread_mutex_unlock(&bdev->mutex);
3248 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
3249 				free(ctx);
3250 				cb_fn(cb_arg, -ENOMEM);
3251 				return;
3252 			}
3253 
3254 			bdev->qos->iops_rate_limit = ios_per_sec;
3255 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
3256 					      _spdk_bdev_enable_qos_msg, ctx,
3257 					      _spdk_bdev_enable_qos_done);
3258 		} else {
3259 			/* Updating */
3260 			bdev->qos->iops_rate_limit = ios_per_sec;
3261 			spdk_thread_send_msg(bdev->qos->thread, _spdk_bdev_update_qos_limit_iops_msg, ctx);
3262 		}
3263 	} else {
3264 		if (bdev->qos != NULL) {
3265 			/* Disabling */
3266 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
3267 					      _spdk_bdev_disable_qos_msg, ctx,
3268 					      _spdk_bdev_disable_qos_msg_done);
3269 		} else {
3270 			pthread_mutex_unlock(&bdev->mutex);
3271 			_spdk_bdev_set_qos_limit_done(ctx, 0);
3272 			return;
3273 		}
3274 	}
3275 
3276 	pthread_mutex_unlock(&bdev->mutex);
3277 }
3278 
3279 SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)
3280