xref: /spdk/lib/bdev/bdev.c (revision dfc989439662457d39bac524be72e8ea1c20e817)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 
11 #include "spdk/config.h"
12 #include "spdk/env.h"
13 #include "spdk/thread.h"
14 #include "spdk/likely.h"
15 #include "spdk/queue.h"
16 #include "spdk/nvme_spec.h"
17 #include "spdk/scsi_spec.h"
18 #include "spdk/notify.h"
19 #include "spdk/util.h"
20 #include "spdk/trace.h"
21 #include "spdk/dma.h"
22 
23 #include "spdk/bdev_module.h"
24 #include "spdk/log.h"
25 #include "spdk/string.h"
26 
27 #include "bdev_internal.h"
28 #include "spdk_internal/trace_defs.h"
29 
30 #ifdef SPDK_CONFIG_VTUNE
31 #include "ittnotify.h"
32 #include "ittnotify_types.h"
33 int __itt_init_ittlib(const char *, __itt_group_id);
34 #endif
35 
36 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
37 #define SPDK_BDEV_IO_CACHE_SIZE			256
38 #define SPDK_BDEV_AUTO_EXAMINE			true
39 #define BUF_SMALL_POOL_SIZE			8191
40 #define BUF_LARGE_POOL_SIZE			1023
41 #define NOMEM_THRESHOLD_COUNT			8
42 
43 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
44 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
45 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
46 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
47 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
48 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
49 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
50 
51 #define SPDK_BDEV_POOL_ALIGNMENT 512
52 
53 /* The maximum number of child requests that a UNMAP or WRITE ZEROES command
54  * is split into and submitted at a time.
55  */
56 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
57 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
58 
59 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
60 				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
61 				    };
62 
63 TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
64 
65 RB_HEAD(bdev_name_tree, spdk_bdev_name);
66 
67 static int
68 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
69 {
70 	return strcmp(name1->name, name2->name);
71 }
72 
73 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
74 
75 struct spdk_bdev_mgr {
76 	struct spdk_mempool *bdev_io_pool;
77 
78 	struct spdk_mempool *buf_small_pool;
79 	struct spdk_mempool *buf_large_pool;
80 
81 	void *zero_buffer;
82 
83 	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
84 
85 	struct spdk_bdev_list bdevs;
86 	struct bdev_name_tree bdev_names;
87 
88 	bool init_complete;
89 	bool module_init_complete;
90 
91 	pthread_mutex_t mutex;
92 
93 #ifdef SPDK_CONFIG_VTUNE
94 	__itt_domain	*domain;
95 #endif
96 };
97 
98 static struct spdk_bdev_mgr g_bdev_mgr = {
99 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
100 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
101 	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
102 	.init_complete = false,
103 	.module_init_complete = false,
104 	.mutex = PTHREAD_MUTEX_INITIALIZER,
105 };
106 
107 typedef void (*lock_range_cb)(void *ctx, int status);
108 
109 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
110 
111 struct lba_range {
112 	uint64_t			offset;
113 	uint64_t			length;
114 	void				*locked_ctx;
115 	struct spdk_bdev_channel	*owner_ch;
116 	TAILQ_ENTRY(lba_range)		tailq;
117 };
118 
119 static struct spdk_bdev_opts	g_bdev_opts = {
120 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
121 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
122 	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
123 	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
124 	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
125 };
126 
127 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
128 static void			*g_init_cb_arg = NULL;
129 
130 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
131 static void			*g_fini_cb_arg = NULL;
132 static struct spdk_thread	*g_fini_thread = NULL;
133 
134 struct spdk_bdev_qos_limit {
135 	/** IOs or bytes allowed per second (i.e., 1s). */
136 	uint64_t limit;
137 
138 	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
139 	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
140 	 *  some bytes are remaining, but the I/O is bigger than that amount. The
141 	 *  excess will be deducted from the next timeslice.
142 	 */
143 	int64_t remaining_this_timeslice;
144 
145 	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
146 	uint32_t min_per_timeslice;
147 
148 	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
149 	uint32_t max_per_timeslice;
150 
151 	/** Function to check whether to queue the IO. */
152 	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
153 
154 	/** Function to update the quota for the submitted IO. */
155 	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
156 };
157 
158 struct spdk_bdev_qos {
159 	/** Rate limits, one per rate limit type. */
160 	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
161 
162 	/** The channel that all I/O are funneled through. */
163 	struct spdk_bdev_channel *ch;
164 
165 	/** The thread on which the poller is running. */
166 	struct spdk_thread *thread;
167 
168 	/** Queue of I/O waiting to be issued. */
169 	bdev_io_tailq_t queued;
170 
171 	/** Size of a timeslice in tsc ticks. */
172 	uint64_t timeslice_size;
173 
174 	/** Timestamp of start of last timeslice. */
175 	uint64_t last_timeslice;
176 
177 	/** Poller that processes queued I/O commands each time slice. */
178 	struct spdk_poller *poller;
179 };
180 
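/*
 * Illustrative arithmetic only (a sketch, not code used here): each per-second
 * limit is apportioned across SPDK_BDEV_QOS_TIMESLICE_IN_USEC-long timeslices.
 * With the 1000 usec timeslice defined above, an rw_ios_per_sec limit of 10000
 * works out to roughly 10000 * 1000 / 1000000 = 10 I/O per timeslice, and an
 * rw_mbytes_per_sec limit of 100 to roughly 100 * 1024 * 1024 * 1000 / 1000000
 * ~= 104858 bytes per timeslice, subject to the min_per_timeslice floors above.
 */
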
181 struct spdk_bdev_mgmt_channel {
182 	bdev_io_stailq_t need_buf_small;
183 	bdev_io_stailq_t need_buf_large;
184 
185 	/*
186 	 * Each thread keeps a cache of bdev_io - this allows
187 	 *  bdev threads which are *not* DPDK threads to still
188 	 *  benefit from a per-thread bdev_io cache.  Without
189 	 *  this, non-DPDK threads fetching from the mempool
190 	 *  incur a cmpxchg on get and put.
191 	 */
192 	bdev_io_stailq_t per_thread_cache;
193 	uint32_t	per_thread_cache_count;
194 	uint32_t	bdev_io_cache_size;
195 
196 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
197 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
198 };
199 
200 /*
201  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
202  * queue their I/O awaiting retry here. This makes it possible to retry sending
203  * I/O to one bdev after I/O from another bdev completes.
204  */
205 struct spdk_bdev_shared_resource {
206 	/* The bdev management channel */
207 	struct spdk_bdev_mgmt_channel *mgmt_ch;
208 
209 	/*
210 	 * Count of I/O submitted to bdev module and waiting for completion.
211 	 * Incremented before submit_request() is called on an spdk_bdev_io.
212 	 */
213 	uint64_t		io_outstanding;
214 
215 	/*
216 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
217 	 *  on this channel.
218 	 */
219 	bdev_io_tailq_t		nomem_io;
220 
221 	/*
222 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
223 	 */
224 	uint64_t		nomem_threshold;
225 
226 	/* I/O channel allocated by a bdev module */
227 	struct spdk_io_channel	*shared_ch;
228 
229 	/* Refcount of bdev channels using this resource */
230 	uint32_t		ref;
231 
232 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
233 };
234 
235 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
236 #define BDEV_CH_QOS_ENABLED		(1 << 1)
237 
238 struct spdk_bdev_channel {
239 	struct spdk_bdev	*bdev;
240 
241 	/* The channel for the underlying device */
242 	struct spdk_io_channel	*channel;
243 
244 	/* Per io_device per thread data */
245 	struct spdk_bdev_shared_resource *shared_resource;
246 
247 	struct spdk_bdev_io_stat stat;
248 
249 	/*
250 	 * Count of I/O submitted to the underlying dev module through this channel
251 	 * and waiting for completion.
252 	 */
253 	uint64_t		io_outstanding;
254 
255 	/*
256 	 * List of all submitted I/Os including I/O that are generated via splitting.
257 	 */
258 	bdev_io_tailq_t		io_submitted;
259 
260 	/*
261 	 * List of spdk_bdev_io that are currently queued because they write to a locked
262 	 * LBA range.
263 	 */
264 	bdev_io_tailq_t		io_locked;
265 
266 	uint32_t		flags;
267 
268 	struct spdk_histogram_data *histogram;
269 
270 #ifdef SPDK_CONFIG_VTUNE
271 	uint64_t		start_tsc;
272 	uint64_t		interval_tsc;
273 	__itt_string_handle	*handle;
274 	struct spdk_bdev_io_stat prev_stat;
275 #endif
276 
277 	bdev_io_tailq_t		queued_resets;
278 
279 	lba_range_tailq_t	locked_ranges;
280 };
281 
282 struct media_event_entry {
283 	struct spdk_bdev_media_event	event;
284 	TAILQ_ENTRY(media_event_entry)	tailq;
285 };
286 
287 #define MEDIA_EVENT_POOL_SIZE 64
288 
289 struct spdk_bdev_desc {
290 	struct spdk_bdev		*bdev;
291 	struct spdk_thread		*thread;
292 	struct {
293 		spdk_bdev_event_cb_t event_fn;
294 		void *ctx;
295 	}				callback;
296 	bool				closed;
297 	bool				write;
298 	bool				memory_domains_supported;
299 	pthread_mutex_t			mutex;
300 	uint32_t			refs;
301 	TAILQ_HEAD(, media_event_entry)	pending_media_events;
302 	TAILQ_HEAD(, media_event_entry)	free_media_events;
303 	struct media_event_entry	*media_events_buffer;
304 	TAILQ_ENTRY(spdk_bdev_desc)	link;
305 
306 	uint64_t		timeout_in_sec;
307 	spdk_bdev_io_timeout_cb	cb_fn;
308 	void			*cb_arg;
309 	struct spdk_poller	*io_timeout_poller;
310 };
311 
312 struct spdk_bdev_iostat_ctx {
313 	struct spdk_bdev_io_stat *stat;
314 	spdk_bdev_get_device_stat_cb cb;
315 	void *cb_arg;
316 };
317 
318 struct set_qos_limit_ctx {
319 	void (*cb_fn)(void *cb_arg, int status);
320 	void *cb_arg;
321 	struct spdk_bdev *bdev;
322 };
323 
324 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
325 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
326 
327 static inline void bdev_io_complete(void *ctx);
328 
329 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
330 static void bdev_write_zero_buffer_next(void *_bdev_io);
331 
332 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i);
333 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status);
334 
335 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
336 				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
337 				     uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
338 				     struct spdk_bdev_ext_io_opts *opts, bool copy_opts);
339 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
340 				      struct iovec *iov, int iovcnt, void *md_buf,
341 				      uint64_t offset_blocks, uint64_t num_blocks,
342 				      spdk_bdev_io_completion_cb cb, void *cb_arg,
343 				      struct spdk_bdev_ext_io_opts *opts, bool copy_opts);
344 
345 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
346 			       uint64_t offset, uint64_t length,
347 			       lock_range_cb cb_fn, void *cb_arg);
348 
349 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
350 				 uint64_t offset, uint64_t length,
351 				 lock_range_cb cb_fn, void *cb_arg);
352 
355 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
356 static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);
357 
358 void
359 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
360 {
361 	if (!opts) {
362 		SPDK_ERRLOG("opts should not be NULL\n");
363 		return;
364 	}
365 
366 	if (!opts_size) {
367 		SPDK_ERRLOG("opts_size should not be zero\n");
368 		return;
369 	}
370 
371 	opts->opts_size = opts_size;
372 
373 #define SET_FIELD(field) \
374 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
375 		opts->field = g_bdev_opts.field; \
376 	} \
377 
378 	SET_FIELD(bdev_io_pool_size);
379 	SET_FIELD(bdev_io_cache_size);
380 	SET_FIELD(bdev_auto_examine);
381 	SET_FIELD(small_buf_pool_size);
382 	SET_FIELD(large_buf_pool_size);
383 
384 	/* Do not remove this statement. You should always update this statement when adding a new field,
385 	 * and do not forget to add a SET_FIELD statement for your added field. */
386 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
387 
388 #undef SET_FIELD
389 }
390 
391 int
392 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
393 {
394 	uint32_t min_pool_size;
395 
396 	if (!opts) {
397 		SPDK_ERRLOG("opts cannot be NULL\n");
398 		return -1;
399 	}
400 
401 	if (!opts->opts_size) {
402 		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
403 		return -1;
404 	}
405 
406 	/*
407 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
408 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
409 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
410 	 */
411 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
412 	if (opts->bdev_io_pool_size < min_pool_size) {
413 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
414 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
415 			    spdk_thread_get_count());
416 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
417 		return -1;
418 	}
419 
420 	if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) {
421 		SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE);
422 		return -1;
423 	}
424 
425 	if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) {
426 		SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE);
427 		return -1;
428 	}
429 
430 #define SET_FIELD(field) \
431         if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
432                 g_bdev_opts.field = opts->field; \
433         } \
434 
435 	SET_FIELD(bdev_io_pool_size);
436 	SET_FIELD(bdev_io_cache_size);
437 	SET_FIELD(bdev_auto_examine);
438 	SET_FIELD(small_buf_pool_size);
439 	SET_FIELD(large_buf_pool_size);
440 
441 	g_bdev_opts.opts_size = opts->opts_size;
442 
443 #undef SET_FIELD
444 
445 	return 0;
446 }
447 
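/*
 * Illustrative usage sketch of the two functions above (example only, not part of
 * this file's logic; the chosen values are hypothetical): callers fetch the current
 * defaults with their compiled-in opts_size so that differing struct versions
 * interoperate, adjust fields, and write them back, typically before the bdev
 * subsystem is initialized.
 */
#if 0	/* example only */
static int
example_tune_bdev_opts(void)
{
	struct spdk_bdev_opts opts = {};

	/* opts_size lets the library copy only the fields this binary knows about. */
	spdk_bdev_get_opts(&opts, sizeof(opts));

	opts.bdev_io_pool_size = 128 * 1024 - 1;
	opts.bdev_io_cache_size = 512;

	return spdk_bdev_set_opts(&opts);
}
#endif
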
448 static struct spdk_bdev *
449 bdev_get_by_name(const char *bdev_name)
450 {
451 	struct spdk_bdev_name find;
452 	struct spdk_bdev_name *res;
453 
454 	find.name = (char *)bdev_name;
455 	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
456 	if (res != NULL) {
457 		return res->bdev;
458 	}
459 
460 	return NULL;
461 }
462 
463 struct spdk_bdev *
464 spdk_bdev_get_by_name(const char *bdev_name)
465 {
466 	struct spdk_bdev *bdev;
467 
468 	pthread_mutex_lock(&g_bdev_mgr.mutex);
469 	bdev = bdev_get_by_name(bdev_name);
470 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
471 
472 	return bdev;
473 }
474 
475 struct spdk_bdev_wait_for_examine_ctx {
476 	struct spdk_poller              *poller;
477 	spdk_bdev_wait_for_examine_cb	cb_fn;
478 	void				*cb_arg;
479 };
480 
481 static bool bdev_module_all_actions_completed(void);
482 
483 static int
484 bdev_wait_for_examine_cb(void *arg)
485 {
486 	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
487 
488 	if (!bdev_module_all_actions_completed()) {
489 		return SPDK_POLLER_IDLE;
490 	}
491 
492 	spdk_poller_unregister(&ctx->poller);
493 	ctx->cb_fn(ctx->cb_arg);
494 	free(ctx);
495 
496 	return SPDK_POLLER_BUSY;
497 }
498 
499 int
500 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
501 {
502 	struct spdk_bdev_wait_for_examine_ctx *ctx;
503 
504 	ctx = calloc(1, sizeof(*ctx));
505 	if (ctx == NULL) {
506 		return -ENOMEM;
507 	}
508 	ctx->cb_fn = cb_fn;
509 	ctx->cb_arg = cb_arg;
510 	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
511 
512 	return 0;
513 }
514 
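/*
 * Illustrative usage sketch of spdk_bdev_wait_for_examine() (example only; the
 * helper names are hypothetical): the callback runs from a poller on the calling
 * thread once no module has an examine action in progress.
 */
#if 0	/* example only */
static void
example_examine_done(void *ctx)
{
	SPDK_NOTICELOG("all bdev examine actions have completed\n");
}

static void
example_wait_for_examine(void)
{
	if (spdk_bdev_wait_for_examine(example_examine_done, NULL) != 0) {
		SPDK_ERRLOG("failed to register wait_for_examine callback\n");
	}
}
#endif
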
515 struct spdk_bdev_examine_item {
516 	char *name;
517 	TAILQ_ENTRY(spdk_bdev_examine_item) link;
518 };
519 
520 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
521 
522 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
523 			g_bdev_examine_allowlist);
524 
525 static inline bool
526 bdev_examine_allowlist_check(const char *name)
527 {
528 	struct spdk_bdev_examine_item *item;
529 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
530 		if (strcmp(name, item->name) == 0) {
531 			return true;
532 		}
533 	}
534 	return false;
535 }
536 
537 static inline void
538 bdev_examine_allowlist_free(void)
539 {
540 	struct spdk_bdev_examine_item *item;
541 	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
542 		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
543 		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
544 		free(item->name);
545 		free(item);
546 	}
547 }
548 
549 static inline bool
550 bdev_in_examine_allowlist(struct spdk_bdev *bdev)
551 {
552 	struct spdk_bdev_alias *tmp;
553 	if (bdev_examine_allowlist_check(bdev->name)) {
554 		return true;
555 	}
556 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
557 		if (bdev_examine_allowlist_check(tmp->alias.name)) {
558 			return true;
559 		}
560 	}
561 	return false;
562 }
563 
564 static inline bool
565 bdev_ok_to_examine(struct spdk_bdev *bdev)
566 {
567 	if (g_bdev_opts.bdev_auto_examine) {
568 		return true;
569 	} else {
570 		return bdev_in_examine_allowlist(bdev);
571 	}
572 }
573 
574 static void
575 bdev_examine(struct spdk_bdev *bdev)
576 {
577 	struct spdk_bdev_module *module;
578 	uint32_t action;
579 
580 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
581 		if (module->examine_config && bdev_ok_to_examine(bdev)) {
582 			action = module->internal.action_in_progress;
583 			module->internal.action_in_progress++;
584 			module->examine_config(bdev);
585 			if (action != module->internal.action_in_progress) {
586 				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
587 					    module->name);
588 			}
589 		}
590 	}
591 
592 	if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
593 		if (bdev->internal.claim_module->examine_disk) {
594 			bdev->internal.claim_module->internal.action_in_progress++;
595 			bdev->internal.claim_module->examine_disk(bdev);
596 		}
597 		return;
598 	}
599 
600 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
601 		if (module->examine_disk && bdev_ok_to_examine(bdev)) {
602 			module->internal.action_in_progress++;
603 			module->examine_disk(bdev);
604 		}
605 	}
606 }
607 
608 int
609 spdk_bdev_examine(const char *name)
610 {
611 	struct spdk_bdev *bdev;
612 	struct spdk_bdev_examine_item *item;
613 
614 	if (g_bdev_opts.bdev_auto_examine) {
615 		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
616 		return -EINVAL;
617 	}
618 
619 	if (bdev_examine_allowlist_check(name)) {
620 		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
621 		return -EEXIST;
622 	}
623 
624 	item = calloc(1, sizeof(*item));
625 	if (!item) {
626 		return -ENOMEM;
627 	}
628 	item->name = strdup(name);
629 	if (!item->name) {
630 		free(item);
631 		return -ENOMEM;
632 	}
633 	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
634 
635 	bdev = spdk_bdev_get_by_name(name);
636 	if (bdev) {
637 		bdev_examine(bdev);
638 	}
639 	return 0;
640 }
641 
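/*
 * Illustrative flow for manual examine (example only; the helper name is
 * hypothetical): spdk_bdev_examine() is rejected while auto examine is enabled,
 * and otherwise remembers the name on the allowlist so the bdev is examined even
 * if it is registered later.
 */
#if 0	/* example only */
static int
example_manual_examine(const char *name)
{
	struct spdk_bdev_opts opts = {};
	int rc;

	spdk_bdev_get_opts(&opts, sizeof(opts));
	if (opts.bdev_auto_examine) {
		/* spdk_bdev_examine() returns -EINVAL while auto examine is enabled. */
		return -EINVAL;
	}

	rc = spdk_bdev_examine(name);
	if (rc == -EEXIST) {
		rc = 0;	/* the name was already on the allowlist */
	}

	return rc;
}
#endif
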
642 static inline void
643 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
644 {
645 	struct spdk_bdev_examine_item *item;
646 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
647 		spdk_json_write_object_begin(w);
648 		spdk_json_write_named_string(w, "method", "bdev_examine");
649 		spdk_json_write_named_object_begin(w, "params");
650 		spdk_json_write_named_string(w, "name", item->name);
651 		spdk_json_write_object_end(w);
652 		spdk_json_write_object_end(w);
653 	}
654 }
655 
656 struct spdk_bdev *
657 spdk_bdev_first(void)
658 {
659 	struct spdk_bdev *bdev;
660 
661 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
662 	if (bdev) {
663 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
664 	}
665 
666 	return bdev;
667 }
668 
669 struct spdk_bdev *
670 spdk_bdev_next(struct spdk_bdev *prev)
671 {
672 	struct spdk_bdev *bdev;
673 
674 	bdev = TAILQ_NEXT(prev, internal.link);
675 	if (bdev) {
676 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
677 	}
678 
679 	return bdev;
680 }
681 
682 static struct spdk_bdev *
683 _bdev_next_leaf(struct spdk_bdev *bdev)
684 {
685 	while (bdev != NULL) {
686 		if (bdev->internal.claim_module == NULL) {
687 			return bdev;
688 		} else {
689 			bdev = TAILQ_NEXT(bdev, internal.link);
690 		}
691 	}
692 
693 	return bdev;
694 }
695 
696 struct spdk_bdev *
697 spdk_bdev_first_leaf(void)
698 {
699 	struct spdk_bdev *bdev;
700 
701 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
702 
703 	if (bdev) {
704 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
705 	}
706 
707 	return bdev;
708 }
709 
710 struct spdk_bdev *
711 spdk_bdev_next_leaf(struct spdk_bdev *prev)
712 {
713 	struct spdk_bdev *bdev;
714 
715 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
716 
717 	if (bdev) {
718 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
719 	}
720 
721 	return bdev;
722 }
723 
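/*
 * Illustrative iteration sketch (example only): walk every registered bdev, or
 * only the "leaf" bdevs that are not claimed by another module, using the
 * accessors defined above.
 */
#if 0	/* example only */
static void
example_list_bdevs(void)
{
	struct spdk_bdev *bdev;

	for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) {
		SPDK_NOTICELOG("bdev: %s\n", spdk_bdev_get_name(bdev));
	}

	for (bdev = spdk_bdev_first_leaf(); bdev != NULL; bdev = spdk_bdev_next_leaf(bdev)) {
		SPDK_NOTICELOG("unclaimed bdev: %s\n", spdk_bdev_get_name(bdev));
	}
}
#endif
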
724 static inline bool
725 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
726 {
727 	return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain;
728 }
729 
730 void
731 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
732 {
733 	struct iovec *iovs;
734 
735 	if (bdev_io->u.bdev.iovs == NULL) {
736 		bdev_io->u.bdev.iovs = &bdev_io->iov;
737 		bdev_io->u.bdev.iovcnt = 1;
738 	}
739 
740 	iovs = bdev_io->u.bdev.iovs;
741 
742 	assert(iovs != NULL);
743 	assert(bdev_io->u.bdev.iovcnt >= 1);
744 
745 	iovs[0].iov_base = buf;
746 	iovs[0].iov_len = len;
747 }
748 
749 void
750 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
751 {
752 	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
753 	bdev_io->u.bdev.md_buf = md_buf;
754 }
755 
756 static bool
757 _is_buf_allocated(const struct iovec *iovs)
758 {
759 	if (iovs == NULL) {
760 		return false;
761 	}
762 
763 	return iovs[0].iov_base != NULL;
764 }
765 
766 static bool
767 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
768 {
769 	int i;
770 	uintptr_t iov_base;
771 
772 	if (spdk_likely(alignment == 1)) {
773 		return true;
774 	}
775 
776 	for (i = 0; i < iovcnt; i++) {
777 		iov_base = (uintptr_t)iovs[i].iov_base;
778 		if ((iov_base & (alignment - 1)) != 0) {
779 			return false;
780 		}
781 	}
782 
783 	return true;
784 }
785 
786 static void
787 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
788 {
789 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
790 	void *buf;
791 
792 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
793 		buf = bdev_io->internal.buf;
794 		bdev_io->internal.buf = NULL;
795 		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
796 		bdev_io->internal.get_aux_buf_cb = NULL;
797 	} else {
798 		assert(bdev_io->internal.get_buf_cb != NULL);
799 		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
800 		bdev_io->internal.get_buf_cb = NULL;
801 	}
802 }
803 
804 static void
805 _bdev_io_pull_buffer_cpl(void *ctx, int rc)
806 {
807 	struct spdk_bdev_io *bdev_io = ctx;
808 
809 	if (rc) {
810 		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
811 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
812 	}
813 	bdev_io_get_buf_complete(bdev_io, !rc);
814 }
815 
816 static void
817 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
818 {
819 	int rc = 0;
820 
821 	/* save original md_buf */
822 	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
823 	bdev_io->internal.orig_md_iov.iov_len = len;
824 	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
825 	bdev_io->internal.bounce_md_iov.iov_len = len;
826 	/* set bounce md_buf */
827 	bdev_io->u.bdev.md_buf = md_buf;
828 
829 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
830 		if (bdev_io_use_memory_domain(bdev_io)) {
831 			rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain,
832 							  bdev_io->internal.ext_opts->memory_domain_ctx,
833 							  &bdev_io->internal.orig_md_iov, 1,
834 							  &bdev_io->internal.bounce_md_iov, 1,
835 							  bdev_io->internal.data_transfer_cpl,
836 							  bdev_io);
837 			if (rc == 0) {
838 				/* Continue to submit IO in completion callback */
839 				return;
840 			}
841 			SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
842 				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc);
843 		} else {
844 			memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len);
845 		}
846 	}
847 
848 	assert(bdev_io->internal.data_transfer_cpl);
849 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
850 }
851 
852 static void
853 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
854 {
855 	struct spdk_bdev *bdev = bdev_io->bdev;
856 	uint64_t md_len;
857 	void *buf;
858 
859 	if (spdk_bdev_is_md_separate(bdev)) {
860 		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
861 		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
862 
863 		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
864 
865 		if (bdev_io->u.bdev.md_buf != NULL) {
866 			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
867 			return;
868 		} else {
869 			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
870 		}
871 	}
872 
873 	bdev_io_get_buf_complete(bdev_io, true);
874 }
875 
876 static void
877 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc)
878 {
879 	struct spdk_bdev_io *bdev_io = ctx;
880 
881 	if (rc) {
882 		SPDK_ERRLOG("Failed to get data buffer\n");
883 		assert(bdev_io->internal.data_transfer_cpl);
884 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
885 		return;
886 	}
887 
888 	_bdev_io_set_md_buf(bdev_io);
889 }
890 
891 static void
892 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
893 			      bdev_copy_bounce_buffer_cpl cpl_cb)
894 {
895 	int rc = 0;
896 
897 	bdev_io->internal.data_transfer_cpl = cpl_cb;
898 	/* save original iovec */
899 	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
900 	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
901 	/* set bounce iov */
902 	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
903 	bdev_io->u.bdev.iovcnt = 1;
904 	/* set bounce buffer for this operation */
905 	bdev_io->u.bdev.iovs[0].iov_base = buf;
906 	bdev_io->u.bdev.iovs[0].iov_len = len;
907 	/* if this is write path, copy data from original buffer to bounce buffer */
908 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
909 		if (bdev_io_use_memory_domain(bdev_io)) {
910 			rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain,
911 							  bdev_io->internal.ext_opts->memory_domain_ctx,
912 							  bdev_io->internal.orig_iovs,
913 							  (uint32_t) bdev_io->internal.orig_iovcnt,
914 							  bdev_io->u.bdev.iovs, 1,
915 							  _bdev_io_pull_bounce_data_buf_done,
916 							  bdev_io);
917 			if (rc == 0) {
918 				/* Continue to submit IO in completion callback */
919 				return;
920 			}
921 			SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
922 				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain));
923 		} else {
924 			spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
925 		}
926 	}
927 
928 	_bdev_io_pull_bounce_data_buf_done(bdev_io, rc);
929 }
930 
931 static void
932 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
933 {
934 	struct spdk_bdev *bdev = bdev_io->bdev;
935 	bool buf_allocated;
936 	uint64_t alignment;
937 	void *aligned_buf;
938 
939 	bdev_io->internal.buf = buf;
940 
941 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
942 		bdev_io_get_buf_complete(bdev_io, true);
943 		return;
944 	}
945 
946 	alignment = spdk_bdev_get_buf_align(bdev);
947 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
948 	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
949 
950 	if (buf_allocated) {
951 		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
952 		/* Continue in completion callback */
953 		return;
954 	} else {
955 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
956 	}
957 
958 	_bdev_io_set_md_buf(bdev_io);
959 }
960 
961 static void
962 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
963 {
964 	struct spdk_bdev *bdev = bdev_io->bdev;
965 	struct spdk_mempool *pool;
966 	struct spdk_bdev_io *tmp;
967 	bdev_io_stailq_t *stailq;
968 	struct spdk_bdev_mgmt_channel *ch;
969 	uint64_t md_len, alignment;
970 
971 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
972 	alignment = spdk_bdev_get_buf_align(bdev);
973 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
974 
975 	if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
976 	    SPDK_BDEV_POOL_ALIGNMENT) {
977 		pool = g_bdev_mgr.buf_small_pool;
978 		stailq = &ch->need_buf_small;
979 	} else {
980 		pool = g_bdev_mgr.buf_large_pool;
981 		stailq = &ch->need_buf_large;
982 	}
983 
984 	if (STAILQ_EMPTY(stailq)) {
985 		spdk_mempool_put(pool, buf);
986 	} else {
987 		tmp = STAILQ_FIRST(stailq);
988 		STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
989 		_bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
990 	}
991 }
992 
993 static void
994 bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
995 {
996 	assert(bdev_io->internal.buf != NULL);
997 	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
998 	bdev_io->internal.buf = NULL;
999 }
1000 
1001 void
1002 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1003 {
1004 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1005 
1006 	assert(buf != NULL);
1007 	_bdev_io_put_buf(bdev_io, buf, len);
1008 }
1009 
1010 static void
1011 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1012 {
1013 	struct spdk_bdev *bdev = bdev_ch->bdev;
1014 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1015 	struct spdk_bdev_io *bdev_io;
1016 
1017 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1018 		/*
1019 		 * Allow some more I/O to complete before retrying the nomem_io queue.
1020 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
1021 		 *  the context of a completion, because the resources for the I/O are
1022 		 *  not released until control returns to the bdev poller.  Also, we
1023 		 *  may require several small I/O to complete before a larger I/O
1024 		 *  (that requires splitting) can be submitted.
1025 		 */
1026 		return;
1027 	}
1028 
1029 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1030 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1031 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1032 		bdev_io->internal.ch->io_outstanding++;
1033 		shared_resource->io_outstanding++;
1034 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1035 		bdev_io->internal.error.nvme.cdw0 = 0;
1036 		bdev_io->num_retries++;
1037 		bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1038 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
1039 			break;
1040 		}
1041 	}
1042 }
1043 
1044 static inline void
1045 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1046 			       struct spdk_bdev_shared_resource *shared_resource)
1047 {
1048 	assert(bdev_ch->io_outstanding > 0);
1049 	assert(shared_resource->io_outstanding > 0);
1050 	bdev_ch->io_outstanding--;
1051 	shared_resource->io_outstanding--;
1052 }
1053 
1054 static inline bool
1055 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
1056 {
1057 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1058 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1059 
1060 	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1061 		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
1062 		/*
1063 		 * Wait for some of the outstanding I/O to complete before we
1064 		 *  retry any of the nomem_io.  Normally we will wait for
1065 		 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
1066 		 *  depth channels we will instead wait for half to complete.
1067 		 */
1068 		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
1069 						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
1070 		return true;
1071 	}
1072 
1073 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1074 		bdev_ch_retry_io(bdev_ch);
1075 	}
1076 
1077 	return false;
1078 }
1079 
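/*
 * Worked example for the threshold above (illustrative numbers only): with 100 I/O
 * outstanding when NOMEM is hit, nomem_threshold = max(100 / 2, 100 - 8) = 92, so
 * the nomem_io queue is retried once 8 I/O have completed.  With only 10 I/O
 * outstanding, nomem_threshold = max(5, 2) = 5, i.e. half must complete first.
 */
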
1080 static void
1081 _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1082 {
1083 	struct spdk_bdev_io *bdev_io = ctx;
1084 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1085 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1086 
1087 	if (rc) {
1088 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1089 	}
1090 	/* We want to free the bounce buffer here since we know we're done with it (as opposed
1091 	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
1092 	 */
1093 	bdev_io_put_buf(bdev_io);
1094 
1095 	/* Continue with IO completion flow */
1096 	_bdev_io_decrement_outstanding(bdev_ch, shared_resource);
1097 	if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) {
1098 		return;
1099 	}
1100 
1101 	bdev_io_complete(bdev_io);
1102 }
1103 
1104 static inline void
1105 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io)
1106 {
1107 	int rc = 0;
1108 
1109 	/* do the same for metadata buffer */
1110 	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
1111 		assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1112 
1113 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
1114 		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
1115 			if (bdev_io_use_memory_domain(bdev_io)) {
1116 				/* If memory domain is used then we need to call async push function */
1117 				rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain,
1118 								  bdev_io->internal.ext_opts->memory_domain_ctx,
1119 								  &bdev_io->internal.orig_md_iov,
1120 								  1,
1121 								  &bdev_io->internal.bounce_md_iov, 1,
1122 								  bdev_io->internal.data_transfer_cpl,
1123 								  bdev_io);
1124 				if (rc == 0) {
1125 					/* Continue IO completion in async callback */
1126 					return;
1127 				}
1128 				SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1129 					    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain));
1130 			} else {
1131 				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1132 				       bdev_io->internal.orig_md_iov.iov_len);
1133 			}
1134 		}
1135 	}
1136 
1137 	assert(bdev_io->internal.data_transfer_cpl);
1138 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1139 }
1140 
1141 static void
1142 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc)
1143 {
1144 	struct spdk_bdev_io *bdev_io = ctx;
1145 
1146 	assert(bdev_io->internal.data_transfer_cpl);
1147 
1148 	if (rc) {
1149 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1150 		return;
1151 	}
1152 
1153 	/* set original buffer for this io */
1154 	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
1155 	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
1156 	/* disable bouncing buffer for this io */
1157 	bdev_io->internal.orig_iovcnt = 0;
1158 	bdev_io->internal.orig_iovs = NULL;
1159 
1160 	_bdev_io_push_bounce_md_buffer(bdev_io);
1161 }
1162 
1163 static inline void
1164 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1165 {
1166 	int rc = 0;
1167 
1168 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1169 
1170 	/* if this is read path, copy data from bounce buffer to original buffer */
1171 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
1172 	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
1173 		if (bdev_io_use_memory_domain(bdev_io)) {
1174 			/* If memory domain is used then we need to call async push function */
1175 			rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain,
1176 							  bdev_io->internal.ext_opts->memory_domain_ctx,
1177 							  bdev_io->internal.orig_iovs,
1178 							  (uint32_t)bdev_io->internal.orig_iovcnt,
1179 							  &bdev_io->internal.bounce_iov, 1,
1180 							  _bdev_io_push_bounce_data_buffer_done,
1181 							  bdev_io);
1182 			if (rc == 0) {
1183 				/* Continue IO completion in async callback */
1184 				return;
1185 			}
1186 			SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1187 				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain));
1188 		} else {
1189 			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
1190 					      bdev_io->internal.orig_iovcnt,
1191 					      bdev_io->internal.bounce_iov.iov_base,
1192 					      bdev_io->internal.bounce_iov.iov_len);
1193 		}
1194 	}
1195 
1196 	_bdev_io_push_bounce_data_buffer_done(bdev_io, rc);
1197 }
1198 
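/*
 * Summary of the bounce-buffer flow implemented above (descriptive only): on the
 * write path the caller's data is pulled into the bounce buffer before submission,
 * and on a successful read the bounce data is pushed back into the caller's
 * original buffers at completion.  When ext_opts->memory_domain is set, the copies
 * go through spdk_memory_domain_pull_data()/spdk_memory_domain_push_data()
 * asynchronously; otherwise plain memcpy()/spdk_copy_*() helpers are used.
 */
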
1199 static void
1200 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1201 {
1202 	struct spdk_bdev *bdev = bdev_io->bdev;
1203 	struct spdk_mempool *pool;
1204 	bdev_io_stailq_t *stailq;
1205 	struct spdk_bdev_mgmt_channel *mgmt_ch;
1206 	uint64_t alignment, md_len;
1207 	void *buf;
1208 
1209 	alignment = spdk_bdev_get_buf_align(bdev);
1210 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1211 
1212 	if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
1213 	    SPDK_BDEV_POOL_ALIGNMENT) {
1214 		SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
1215 			    len + alignment);
1216 		bdev_io_get_buf_complete(bdev_io, false);
1217 		return;
1218 	}
1219 
1220 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1221 
1222 	bdev_io->internal.buf_len = len;
1223 
1224 	if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
1225 	    SPDK_BDEV_POOL_ALIGNMENT) {
1226 		pool = g_bdev_mgr.buf_small_pool;
1227 		stailq = &mgmt_ch->need_buf_small;
1228 	} else {
1229 		pool = g_bdev_mgr.buf_large_pool;
1230 		stailq = &mgmt_ch->need_buf_large;
1231 	}
1232 
1233 	buf = spdk_mempool_get(pool);
1234 	if (!buf) {
1235 		STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
1236 	} else {
1237 		_bdev_io_set_buf(bdev_io, buf, len);
1238 	}
1239 }
1240 
1241 void
1242 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1243 {
1244 	struct spdk_bdev *bdev = bdev_io->bdev;
1245 	uint64_t alignment;
1246 
1247 	assert(cb != NULL);
1248 	bdev_io->internal.get_buf_cb = cb;
1249 
1250 	alignment = spdk_bdev_get_buf_align(bdev);
1251 
1252 	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1253 	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1254 		/* Buffer already present and aligned */
1255 		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1256 		return;
1257 	}
1258 
1259 	bdev_io_get_buf(bdev_io, len);
1260 }
1261 
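/*
 * Illustrative sketch of how a bdev module typically drives spdk_bdev_io_get_buf()
 * from its submit path (example only; function names are hypothetical): the
 * callback runs once a data buffer is available, either immediately or after one
 * is returned to the pool.
 */
#if 0	/* example only */
static void
example_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			bool success)
{
	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	/* bdev_io->u.bdev.iovs now points at a buffer of at least the requested
	 * length; a real module would start its read into that buffer here. */
	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
}

static void
example_submit_read(struct spdk_bdev_io *bdev_io)
{
	spdk_bdev_io_get_buf(bdev_io, example_read_get_buf_cb,
			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
}
#endif
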
1262 static void
1263 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1264 			      bool success)
1265 {
1266 	if (!success) {
1267 		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
1268 		bdev_io_complete(bdev_io);
1269 	} else {
1270 		bdev_io_submit(bdev_io);
1271 	}
1272 }
1273 
1274 static void
1275 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1276 			       uint64_t len)
1277 {
1278 	assert(cb != NULL);
1279 	bdev_io->internal.get_buf_cb = cb;
1280 
1281 	bdev_io_get_buf(bdev_io, len);
1282 }
1283 
1284 void
1285 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
1286 {
1287 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1288 
1289 	assert(cb != NULL);
1290 	assert(bdev_io->internal.get_aux_buf_cb == NULL);
1291 	bdev_io->internal.get_aux_buf_cb = cb;
1292 	bdev_io_get_buf(bdev_io, len);
1293 }
1294 
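/*
 * Illustrative sketch of the aux buffer helpers above (example only; function
 * names are hypothetical): a module that needs a scratch buffer the size of the
 * I/O payload requests one, uses it, and returns it when done.
 */
#if 0	/* example only */
static void
example_aux_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		   void *aux_buf)
{
	if (aux_buf == NULL) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		return;
	}

	/* ... use aux_buf as scratch space for bdev_io->u.bdev.iovs here ... */

	spdk_bdev_io_put_aux_buf(bdev_io, aux_buf);
	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
}

static void
example_start_transform(struct spdk_bdev_io *bdev_io)
{
	spdk_bdev_io_get_aux_buf(bdev_io, example_aux_buf_cb);
}
#endif
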
1295 static int
1296 bdev_module_get_max_ctx_size(void)
1297 {
1298 	struct spdk_bdev_module *bdev_module;
1299 	int max_bdev_module_size = 0;
1300 
1301 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1302 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
1303 			max_bdev_module_size = bdev_module->get_ctx_size();
1304 		}
1305 	}
1306 
1307 	return max_bdev_module_size;
1308 }
1309 
1310 static void
1311 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1312 {
1313 	int i;
1314 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1315 	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
1316 
1317 	if (!qos) {
1318 		return;
1319 	}
1320 
1321 	spdk_bdev_get_qos_rate_limits(bdev, limits);
1322 
1323 	spdk_json_write_object_begin(w);
1324 	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
1325 
1326 	spdk_json_write_named_object_begin(w, "params");
1327 	spdk_json_write_named_string(w, "name", bdev->name);
1328 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
1329 		if (limits[i] > 0) {
1330 			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
1331 		}
1332 	}
1333 	spdk_json_write_object_end(w);
1334 
1335 	spdk_json_write_object_end(w);
1336 }
1337 
1338 void
1339 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
1340 {
1341 	struct spdk_bdev_module *bdev_module;
1342 	struct spdk_bdev *bdev;
1343 
1344 	assert(w != NULL);
1345 
1346 	spdk_json_write_array_begin(w);
1347 
1348 	spdk_json_write_object_begin(w);
1349 	spdk_json_write_named_string(w, "method", "bdev_set_options");
1350 	spdk_json_write_named_object_begin(w, "params");
1351 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
1352 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
1353 	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
1354 	spdk_json_write_object_end(w);
1355 	spdk_json_write_object_end(w);
1356 
1357 	bdev_examine_allowlist_config_json(w);
1358 
1359 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1360 		if (bdev_module->config_json) {
1361 			bdev_module->config_json(w);
1362 		}
1363 	}
1364 
1365 	pthread_mutex_lock(&g_bdev_mgr.mutex);
1366 
1367 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
1368 		if (bdev->fn_table->write_config_json) {
1369 			bdev->fn_table->write_config_json(bdev, w);
1370 		}
1371 
1372 		bdev_qos_config_json(bdev, w);
1373 	}
1374 
1375 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
1376 
1377 	/* This has to be the last RPC in the array to make sure all bdevs finish being examined */
1378 	spdk_json_write_object_begin(w);
1379 	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
1380 	spdk_json_write_object_end(w);
1381 
1382 	spdk_json_write_array_end(w);
1383 }
1384 
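/*
 * Abbreviated, illustrative shape of the JSON emitted by the function above
 * (values reflect the defaults in this file; the bdev name and QoS limit are
 * hypothetical, and per-module/per-bdev entries are omitted):
 *
 * [
 *   { "method": "bdev_set_options",
 *     "params": { "bdev_io_pool_size": 65535, "bdev_io_cache_size": 256,
 *                 "bdev_auto_examine": true } },
 *   { "method": "bdev_set_qos_limit",
 *     "params": { "name": "Malloc0", "rw_ios_per_sec": 10000 } },
 *   { "method": "bdev_wait_for_examine" }
 * ]
 */
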
1385 static int
1386 bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
1387 {
1388 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1389 	struct spdk_bdev_io *bdev_io;
1390 	uint32_t i;
1391 
1392 	STAILQ_INIT(&ch->need_buf_small);
1393 	STAILQ_INIT(&ch->need_buf_large);
1394 
1395 	STAILQ_INIT(&ch->per_thread_cache);
1396 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
1397 
1398 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
1399 	ch->per_thread_cache_count = 0;
1400 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
1401 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
1402 		assert(bdev_io != NULL);
1403 		ch->per_thread_cache_count++;
1404 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
1405 	}
1406 
1407 	TAILQ_INIT(&ch->shared_resources);
1408 	TAILQ_INIT(&ch->io_wait_queue);
1409 
1410 	return 0;
1411 }
1412 
1413 static void
1414 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
1415 {
1416 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1417 	struct spdk_bdev_io *bdev_io;
1418 
1419 	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
1420 		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
1421 	}
1422 
1423 	if (!TAILQ_EMPTY(&ch->shared_resources)) {
1424 		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
1425 	}
1426 
1427 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
1428 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
1429 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
1430 		ch->per_thread_cache_count--;
1431 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1432 	}
1433 
1434 	assert(ch->per_thread_cache_count == 0);
1435 }
1436 
1437 static void
1438 bdev_init_complete(int rc)
1439 {
1440 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
1441 	void *cb_arg = g_init_cb_arg;
1442 	struct spdk_bdev_module *m;
1443 
1444 	g_bdev_mgr.init_complete = true;
1445 	g_init_cb_fn = NULL;
1446 	g_init_cb_arg = NULL;
1447 
1448 	/*
1449 	 * For modules that need to know when subsystem init is complete,
1450 	 * inform them now.
1451 	 */
1452 	if (rc == 0) {
1453 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1454 			if (m->init_complete) {
1455 				m->init_complete();
1456 			}
1457 		}
1458 	}
1459 
1460 	cb_fn(cb_arg, rc);
1461 }
1462 
1463 static bool
1464 bdev_module_all_actions_completed(void)
1465 {
1466 	struct spdk_bdev_module *m;
1467 
1468 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1469 		if (m->internal.action_in_progress > 0) {
1470 			return false;
1471 		}
1472 	}
1473 	return true;
1474 }
1475 
1476 static void
1477 bdev_module_action_complete(void)
1478 {
1479 	/*
1480 	 * Don't finish bdev subsystem initialization if
1481 	 * module pre-initialization is still in progress, or
1482 	 * the subsystem been already initialized.
1483 	 */
1484 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
1485 		return;
1486 	}
1487 
1488 	/*
1489 	 * Check all bdev modules for inits/examinations in progress. If any
1490 	 * exist, return immediately since we cannot finish bdev subsystem
1491 	 * initialization until all are completed.
1492 	 */
1493 	if (!bdev_module_all_actions_completed()) {
1494 		return;
1495 	}
1496 
1497 	/*
1498 	 * Modules already finished initialization - now that all
1499 	 * the bdev modules have finished their asynchronous I/O
1500 	 * processing, the entire bdev layer can be marked as complete.
1501 	 */
1502 	bdev_init_complete(0);
1503 }
1504 
1505 static void
1506 bdev_module_action_done(struct spdk_bdev_module *module)
1507 {
1508 	assert(module->internal.action_in_progress > 0);
1509 	module->internal.action_in_progress--;
1510 	bdev_module_action_complete();
1511 }
1512 
1513 void
1514 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
1515 {
1516 	bdev_module_action_done(module);
1517 }
1518 
1519 void
1520 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
1521 {
1522 	bdev_module_action_done(module);
1523 }
1524 
1525 /** The last initialized bdev module */
1526 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
1527 
1528 static void
1529 bdev_init_failed(void *cb_arg)
1530 {
1531 	struct spdk_bdev_module *module = cb_arg;
1532 
1533 	module->internal.action_in_progress--;
1534 	bdev_init_complete(-1);
1535 }
1536 
1537 static int
1538 bdev_modules_init(void)
1539 {
1540 	struct spdk_bdev_module *module;
1541 	int rc = 0;
1542 
1543 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1544 		g_resume_bdev_module = module;
1545 		if (module->async_init) {
1546 			module->internal.action_in_progress = 1;
1547 		}
1548 		rc = module->module_init();
1549 		if (rc != 0) {
1550 			/* Bump action_in_progress to prevent other modules from completing modules_init.
1551 			 * Send a message to defer application shutdown until resources are cleaned up. */
1552 			module->internal.action_in_progress = 1;
1553 			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
1554 			return rc;
1555 		}
1556 	}
1557 
1558 	g_resume_bdev_module = NULL;
1559 	return 0;
1560 }
1561 
1562 void
1563 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
1564 {
1565 	int cache_size;
1566 	int rc = 0;
1567 	char mempool_name[32];
1568 
1569 	assert(cb_fn != NULL);
1570 
1571 	g_init_cb_fn = cb_fn;
1572 	g_init_cb_arg = cb_arg;
1573 
1574 	spdk_notify_type_register("bdev_register");
1575 	spdk_notify_type_register("bdev_unregister");
1576 
1577 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
1578 
1579 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
1580 				  g_bdev_opts.bdev_io_pool_size,
1581 				  sizeof(struct spdk_bdev_io) +
1582 				  bdev_module_get_max_ctx_size(),
1583 				  0,
1584 				  SPDK_ENV_SOCKET_ID_ANY);
1585 
1586 	if (g_bdev_mgr.bdev_io_pool == NULL) {
1587 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
1588 		bdev_init_complete(-1);
1589 		return;
1590 	}
1591 
1592 	/**
1593 	 * Ensure no more than half of the total buffers end up in local caches, by
1594 	 *   using spdk_env_get_core_count() to determine how many local caches we need
1595 	 *   to account for.
1596 	 */
1597 	cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
1598 	snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());
1599 
1600 	g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
1601 				    g_bdev_opts.small_buf_pool_size,
1602 				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
1603 				    SPDK_BDEV_POOL_ALIGNMENT,
1604 				    cache_size,
1605 				    SPDK_ENV_SOCKET_ID_ANY);
1606 	if (!g_bdev_mgr.buf_small_pool) {
1607 		SPDK_ERRLOG("create rbuf small pool failed\n");
1608 		bdev_init_complete(-1);
1609 		return;
1610 	}
1611 
1612 	cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
1613 	snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());
1614 
1615 	g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
1616 				    g_bdev_opts.large_buf_pool_size,
1617 				    SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
1618 				    SPDK_BDEV_POOL_ALIGNMENT,
1619 				    cache_size,
1620 				    SPDK_ENV_SOCKET_ID_ANY);
1621 	if (!g_bdev_mgr.buf_large_pool) {
1622 		SPDK_ERRLOG("create rbuf large pool failed\n");
1623 		bdev_init_complete(-1);
1624 		return;
1625 	}
1626 
1627 	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
1628 					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1629 	if (!g_bdev_mgr.zero_buffer) {
1630 		SPDK_ERRLOG("create bdev zero buffer failed\n");
1631 		bdev_init_complete(-1);
1632 		return;
1633 	}
1634 
1635 #ifdef SPDK_CONFIG_VTUNE
1636 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
1637 #endif
1638 
1639 	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
1640 				bdev_mgmt_channel_destroy,
1641 				sizeof(struct spdk_bdev_mgmt_channel),
1642 				"bdev_mgr");
1643 
1644 	rc = bdev_modules_init();
1645 	g_bdev_mgr.module_init_complete = true;
1646 	if (rc != 0) {
1647 		SPDK_ERRLOG("bdev modules init failed\n");
1648 		return;
1649 	}
1650 
1651 	bdev_module_action_complete();
1652 }
1653 
1654 static void
1655 bdev_mgr_unregister_cb(void *io_device)
1656 {
1657 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
1658 
1659 	if (g_bdev_mgr.bdev_io_pool) {
1660 		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
1661 			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
1662 				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
1663 				    g_bdev_opts.bdev_io_pool_size);
1664 		}
1665 
1666 		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
1667 	}
1668 
1669 	if (g_bdev_mgr.buf_small_pool) {
1670 		if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) {
1671 			SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
1672 				    spdk_mempool_count(g_bdev_mgr.buf_small_pool),
1673 				    g_bdev_opts.small_buf_pool_size);
1674 			assert(false);
1675 		}
1676 
1677 		spdk_mempool_free(g_bdev_mgr.buf_small_pool);
1678 	}
1679 
1680 	if (g_bdev_mgr.buf_large_pool) {
1681 		if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) {
1682 			SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
1683 				    spdk_mempool_count(g_bdev_mgr.buf_large_pool),
1684 				    g_bdev_opts.large_buf_pool_size);
1685 			assert(false);
1686 		}
1687 
1688 		spdk_mempool_free(g_bdev_mgr.buf_large_pool);
1689 	}
1690 
1691 	spdk_free(g_bdev_mgr.zero_buffer);
1692 
1693 	bdev_examine_allowlist_free();
1694 
1695 	cb_fn(g_fini_cb_arg);
1696 	g_fini_cb_fn = NULL;
1697 	g_fini_cb_arg = NULL;
1698 	g_bdev_mgr.init_complete = false;
1699 	g_bdev_mgr.module_init_complete = false;
1700 }
1701 
1702 static void
1703 bdev_module_fini_iter(void *arg)
1704 {
1705 	struct spdk_bdev_module *bdev_module;
1706 
1707 	/* FIXME: Handling initialization failures is broken now,
1708 	 * so we won't even try cleaning up after successfully
1709 	 * initialized modules. If module_init_complete is false,
1710 	 * just call bdev_mgr_unregister_cb().
1711 	 */
1712 	if (!g_bdev_mgr.module_init_complete) {
1713 		bdev_mgr_unregister_cb(NULL);
1714 		return;
1715 	}
1716 
1717 	/* Start iterating from the last touched module */
1718 	if (!g_resume_bdev_module) {
1719 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
1720 	} else {
1721 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
1722 					 internal.tailq);
1723 	}
1724 
1725 	while (bdev_module) {
1726 		if (bdev_module->async_fini) {
1727 			/* Save our place so we can resume later. We must
1728 			 * save the variable here, before calling module_fini()
1729 			 * below, because in some cases the module may immediately
1730 			 * call spdk_bdev_module_fini_done() and re-enter
1731 			 * this function to continue iterating. */
1732 			g_resume_bdev_module = bdev_module;
1733 		}
1734 
1735 		if (bdev_module->module_fini) {
1736 			bdev_module->module_fini();
1737 		}
1738 
1739 		if (bdev_module->async_fini) {
1740 			return;
1741 		}
1742 
1743 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
1744 					 internal.tailq);
1745 	}
1746 
1747 	g_resume_bdev_module = NULL;
1748 	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
1749 }
1750 
1751 void
1752 spdk_bdev_module_fini_done(void)
1753 {
1754 	if (spdk_get_thread() != g_fini_thread) {
1755 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
1756 	} else {
1757 		bdev_module_fini_iter(NULL);
1758 	}
1759 }
1760 
1761 static void
1762 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
1763 {
1764 	struct spdk_bdev *bdev = cb_arg;
1765 
1766 	if (bdeverrno && bdev) {
1767 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
1768 			     bdev->name);
1769 
1770 		/*
1771 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
1772 		 *  bdev; try to continue by manually removing this bdev from the list and continue
1773 		 *  with the next bdev in the list.
1774 		 */
1775 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
1776 	}
1777 
1778 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
1779 		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
1780 		/*
1781 		 * Bdev module finish needs to be deferred as we might be in the middle of some context
1782 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
1783 		 * after returning.
1784 		 */
1785 		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
1786 		return;
1787 	}
1788 
1789 	/*
1790 	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
1791 	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
1792 	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
1793 	 * base bdevs.
1794 	 *
1795 	 * Also, walk the list in the reverse order.
1796 	 */
1797 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1798 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1799 		if (bdev->internal.claim_module != NULL) {
1800 			SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n",
1801 				      bdev->name, bdev->internal.claim_module->name);
1802 			continue;
1803 		}
1804 
1805 		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
1806 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1807 		return;
1808 	}
1809 
1810 	/*
1811 	 * If any bdev fails to unclaim its underlying bdev properly, we may end up with
1812 	 * a bdev list consisting only of claimed bdevs (if claims are managed correctly,
1813 	 * this would mean there is a loop in the claims graph, which is clearly
1814 	 * impossible). In that case, warn and unregister the last bdev on the list.
1815 	 */
1816 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1817 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1818 		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
1819 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1820 		return;
1821 	}
1822 }
1823 
1824 static void
1825 bdev_module_fini_start_iter(void *arg)
1826 {
1827 	struct spdk_bdev_module *bdev_module;
1828 
1829 	if (!g_resume_bdev_module) {
1830 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
1831 	} else {
1832 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
1833 	}
1834 
1835 	while (bdev_module) {
1836 		if (bdev_module->async_fini_start) {
1837 			/* Save our place so we can resume later. We must
1838 			 * save the variable here, before calling fini_start()
1839 			 * below, because in some cases the module may immediately
1840 			 * call spdk_bdev_module_fini_start_done() and re-enter
1841 			 * this function to continue iterating. */
1842 			g_resume_bdev_module = bdev_module;
1843 		}
1844 
1845 		if (bdev_module->fini_start) {
1846 			bdev_module->fini_start();
1847 		}
1848 
1849 		if (bdev_module->async_fini_start) {
1850 			return;
1851 		}
1852 
1853 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
1854 	}
1855 
1856 	g_resume_bdev_module = NULL;
1857 
1858 	bdev_finish_unregister_bdevs_iter(NULL, 0);
1859 }
1860 
1861 void
1862 spdk_bdev_module_fini_start_done(void)
1863 {
1864 	if (spdk_get_thread() != g_fini_thread) {
1865 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
1866 	} else {
1867 		bdev_module_fini_start_iter(NULL);
1868 	}
1869 }
1870 
1871 static void
1872 bdev_finish_wait_for_examine_done(void *cb_arg)
1873 {
1874 	bdev_module_fini_start_iter(NULL);
1875 }
1876 
1877 void
1878 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
1879 {
1880 	int rc;
1881 
1882 	assert(cb_fn != NULL);
1883 
1884 	g_fini_thread = spdk_get_thread();
1885 
1886 	g_fini_cb_fn = cb_fn;
1887 	g_fini_cb_arg = cb_arg;
1888 
1889 	rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
1890 	if (rc != 0) {
1891 		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
1892 		bdev_finish_wait_for_examine_done(NULL);
1893 	}
1894 }
1895 
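/* Get a bdev_io for use on the given channel. The per-thread cache is tried
 * first; if it is empty and nobody is already waiting for a bdev_io, fall back
 * to the global bdev_io pool. A NULL return means the caller should register
 * with spdk_bdev_queue_io_wait() and retry once an I/O is freed.
 */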
1896 struct spdk_bdev_io *
1897 bdev_channel_get_io(struct spdk_bdev_channel *channel)
1898 {
1899 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
1900 	struct spdk_bdev_io *bdev_io;
1901 
1902 	if (ch->per_thread_cache_count > 0) {
1903 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
1904 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
1905 		ch->per_thread_cache_count--;
1906 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
1907 		/*
1908 		 * Don't try to look for bdev_ios in the global pool if there are
1909 		 * waiters on bdev_ios - we don't want this caller to jump the line.
1910 		 */
1911 		bdev_io = NULL;
1912 	} else {
1913 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
1914 	}
1915 
1916 	return bdev_io;
1917 }
1918 
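/* Return a completed bdev_io. It goes back into the per-thread cache when there
 * is room, waking any spdk_bdev_queue_io_wait() waiters; otherwise it is
 * returned to the global bdev_io pool.
 */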
1919 void
1920 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
1921 {
1922 	struct spdk_bdev_mgmt_channel *ch;
1923 
1924 	assert(bdev_io != NULL);
1925 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
1926 
1927 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1928 
1929 	if (bdev_io->internal.buf != NULL) {
1930 		bdev_io_put_buf(bdev_io);
1931 	}
1932 
1933 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
1934 		ch->per_thread_cache_count++;
1935 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
1936 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
1937 			struct spdk_bdev_io_wait_entry *entry;
1938 
1939 			entry = TAILQ_FIRST(&ch->io_wait_queue);
1940 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
1941 			entry->cb_fn(entry->cb_arg);
1942 		}
1943 	} else {
1944 		/* We should never have a full cache with entries on the io wait queue. */
1945 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
1946 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1947 	}
1948 }
1949 
1950 static bool
1951 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
1952 {
1953 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
1954 
1955 	switch (limit) {
1956 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
1957 		return true;
1958 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
1959 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
1960 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
1961 		return false;
1962 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
1963 	default:
1964 		return false;
1965 	}
1966 }
1967 
1968 static bool
1969 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
1970 {
1971 	switch (bdev_io->type) {
1972 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1973 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1974 	case SPDK_BDEV_IO_TYPE_READ:
1975 	case SPDK_BDEV_IO_TYPE_WRITE:
1976 		return true;
1977 	case SPDK_BDEV_IO_TYPE_ZCOPY:
1978 		if (bdev_io->u.bdev.zcopy.start) {
1979 			return true;
1980 		} else {
1981 			return false;
1982 		}
1983 	default:
1984 		return false;
1985 	}
1986 }
1987 
1988 static bool
1989 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
1990 {
1991 	switch (bdev_io->type) {
1992 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1993 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1994 		/* Bit 1 (0x2) of the NVMe opcode is set for read operations */
1995 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
1996 			return true;
1997 		} else {
1998 			return false;
1999 		}
2000 	case SPDK_BDEV_IO_TYPE_READ:
2001 		return true;
2002 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2003 		/* Populate to read from disk */
2004 		if (bdev_io->u.bdev.zcopy.populate) {
2005 			return true;
2006 		} else {
2007 			return false;
2008 		}
2009 	default:
2010 		return false;
2011 	}
2012 }
2013 
2014 static uint64_t
2015 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2016 {
2017 	struct spdk_bdev	*bdev = bdev_io->bdev;
2018 
2019 	switch (bdev_io->type) {
2020 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2021 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2022 		return bdev_io->u.nvme_passthru.nbytes;
2023 	case SPDK_BDEV_IO_TYPE_READ:
2024 	case SPDK_BDEV_IO_TYPE_WRITE:
2025 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2026 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2027 		/* Track the data in the start phase only */
2028 		if (bdev_io->u.bdev.zcopy.start) {
2029 			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2030 		} else {
2031 			return 0;
2032 		}
2033 	default:
2034 		return 0;
2035 	}
2036 }
2037 
2038 static bool
2039 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2040 {
2041 	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
2042 		return true;
2043 	} else {
2044 		return false;
2045 	}
2046 }
2047 
2048 static bool
2049 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2050 {
2051 	if (bdev_is_read_io(io) == false) {
2052 		return false;
2053 	}
2054 
2055 	return bdev_qos_rw_queue_io(limit, io);
2056 }
2057 
2058 static bool
2059 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2060 {
2061 	if (bdev_is_read_io(io) == true) {
2062 		return false;
2063 	}
2064 
2065 	return bdev_qos_rw_queue_io(limit, io);
2066 }
2067 
2068 static void
2069 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2070 {
2071 	limit->remaining_this_timeslice--;
2072 }
2073 
2074 static void
2075 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2076 {
2077 	limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
2078 }
2079 
2080 static void
2081 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2082 {
2083 	if (bdev_is_read_io(io) == false) {
2084 		return;
2085 	}
2086 
2087 	bdev_qos_rw_bps_update_quota(limit, io);
2088 }
2089 
2090 static void
2091 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2092 {
2093 	if (bdev_is_read_io(io) == true) {
2094 		return;
2095 	}
2096 
2097 	bdev_qos_rw_bps_update_quota(limit, io);
2098 }
2099 
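/* Install the queue_io/update_quota callbacks matching each configured rate
 * limit. Limits left at SPDK_BDEV_QOS_LIMIT_NOT_DEFINED get NULL callbacks and
 * are skipped during submission.
 */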
2100 static void
2101 bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2102 {
2103 	int i;
2104 
2105 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2106 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2107 			qos->rate_limits[i].queue_io = NULL;
2108 			qos->rate_limits[i].update_quota = NULL;
2109 			continue;
2110 		}
2111 
2112 		switch (i) {
2113 		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2114 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2115 			qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
2116 			break;
2117 		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2118 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2119 			qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
2120 			break;
2121 		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2122 			qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
2123 			qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
2124 			break;
2125 		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2126 			qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
2127 			qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
2128 			break;
2129 		default:
2130 			break;
2131 		}
2132 	}
2133 }
2134 
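/* Complete an I/O directly from the submission path. The outstanding counters
 * are incremented first because spdk_bdev_io_complete() will decrement them.
 */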
2135 static void
2136 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2137 			    struct spdk_bdev_io *bdev_io,
2138 			    enum spdk_bdev_io_status status)
2139 {
2140 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2141 
2142 	bdev_io->internal.in_submit_request = true;
2143 	bdev_ch->io_outstanding++;
2144 	shared_resource->io_outstanding++;
2145 	spdk_bdev_io_complete(bdev_io, status);
2146 	bdev_io->internal.in_submit_request = false;
2147 }
2148 
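/* Hand an I/O to the bdev module, or queue it on the shared nomem_io list if
 * earlier submissions already hit -ENOMEM. An abort whose target is still
 * sitting on an internal queue is completed here without reaching the module.
 */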
2149 static inline void
2150 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2151 {
2152 	struct spdk_bdev *bdev = bdev_io->bdev;
2153 	struct spdk_io_channel *ch = bdev_ch->channel;
2154 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2155 
2156 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2157 		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2158 		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2159 
2160 		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2161 		    bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) ||
2162 		    bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) {
2163 			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
2164 						    SPDK_BDEV_IO_STATUS_SUCCESS);
2165 			return;
2166 		}
2167 	}
2168 
2169 	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2170 		bdev_ch->io_outstanding++;
2171 		shared_resource->io_outstanding++;
2172 		bdev_io->internal.in_submit_request = true;
2173 		bdev->fn_table->submit_request(ch, bdev_io);
2174 		bdev_io->internal.in_submit_request = false;
2175 	} else {
2176 		TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
2177 	}
2178 }
2179 
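/* Return true if this I/O must stay queued because one of the active rate
 * limits is exhausted for the current timeslice. Otherwise charge the I/O
 * against every active limit and return false so the caller can submit it.
 * I/O types that are not rate limited are never queued or charged.
 */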
2180 static bool
2181 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2182 {
2183 	int i;
2184 
2185 	if (bdev_qos_io_to_limit(bdev_io) == true) {
2186 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2187 			if (!qos->rate_limits[i].queue_io) {
2188 				continue;
2189 			}
2190 
2191 			if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2192 							 bdev_io) == true) {
2193 				return true;
2194 			}
2195 		}
2196 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2197 			if (!qos->rate_limits[i].update_quota) {
2198 				continue;
2199 			}
2200 
2201 			qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
2202 		}
2203 	}
2204 
2205 	return false;
2206 }
2207 
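/* Walk the QoS queue and submit every I/O that still fits within the current
 * timeslice's rate limits; anything that would exceed a limit stays queued.
 * Returns the number of I/O submitted.
 */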
2208 static int
2209 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
2210 {
2211 	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
2212 	int				submitted_ios = 0;
2213 
2214 	TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
2215 		if (!bdev_qos_queue_io(qos, bdev_io)) {
2216 			TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
2217 			bdev_io_do_submit(ch, bdev_io);
2218 			submitted_ios++;
2219 		}
2220 	}
2221 
2222 	return submitted_ios;
2223 }
2224 
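/* Park this bdev_io until a free bdev_io becomes available on its channel, at
 * which point 'cb_fn' is invoked to retry. If even the wait entry cannot be
 * queued, the I/O is failed immediately.
 */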
2225 static void
2226 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
2227 {
2228 	int rc;
2229 
2230 	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
2231 	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
2232 	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
2233 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
2234 				     &bdev_io->internal.waitq_entry);
2235 	if (rc != 0) {
2236 		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
2237 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2238 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2239 	}
2240 }
2241 
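/* A read or write must be split if it crosses an optimal_io_boundary stripe,
 * carries more iovecs than max_num_segments, or contains an iovec longer than
 * max_segment_size. For example, with a 128-block optimal_io_boundary, an
 * 8-block I/O starting at block 124 spans two stripes and will be split.
 */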
2242 static bool
2243 bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
2244 {
2245 	uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;
2246 	uint32_t max_size = bdev_io->bdev->max_segment_size;
2247 	int max_segs = bdev_io->bdev->max_num_segments;
2248 
2249 	io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0;
2250 
2251 	if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
2252 		return false;
2253 	}
2254 
2255 	if (io_boundary) {
2256 		uint64_t start_stripe, end_stripe;
2257 
2258 		start_stripe = bdev_io->u.bdev.offset_blocks;
2259 		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
2260 		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
2261 		if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
2262 			start_stripe >>= spdk_u32log2(io_boundary);
2263 			end_stripe >>= spdk_u32log2(io_boundary);
2264 		} else {
2265 			start_stripe /= io_boundary;
2266 			end_stripe /= io_boundary;
2267 		}
2268 
2269 		if (start_stripe != end_stripe) {
2270 			return true;
2271 		}
2272 	}
2273 
2274 	if (max_segs) {
2275 		if (bdev_io->u.bdev.iovcnt > max_segs) {
2276 			return true;
2277 		}
2278 	}
2279 
2280 	if (max_size) {
2281 		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
2282 			if (bdev_io->u.bdev.iovs[i].iov_len > max_size) {
2283 				return true;
2284 			}
2285 		}
2286 	}
2287 
2288 	return false;
2289 }
2290 
2291 static bool
2292 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
2293 {
2294 	uint32_t num_unmap_segments;
2295 
2296 	if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
2297 		return false;
2298 	}
2299 	num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
2300 	if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
2301 		return true;
2302 	}
2303 
2304 	return false;
2305 }
2306 
2307 static bool
2308 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
2309 {
2310 	if (!bdev_io->bdev->max_write_zeroes) {
2311 		return false;
2312 	}
2313 
2314 	if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
2315 		return true;
2316 	}
2317 
2318 	return false;
2319 }
2320 
2321 static bool
2322 bdev_io_should_split(struct spdk_bdev_io *bdev_io)
2323 {
2324 	switch (bdev_io->type) {
2325 	case SPDK_BDEV_IO_TYPE_READ:
2326 	case SPDK_BDEV_IO_TYPE_WRITE:
2327 		return bdev_rw_should_split(bdev_io);
2328 	case SPDK_BDEV_IO_TYPE_UNMAP:
2329 		return bdev_unmap_should_split(bdev_io);
2330 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2331 		return bdev_write_zeroes_should_split(bdev_io);
2332 	default:
2333 		return false;
2334 	}
2335 }
2336 
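/* Number of blocks from 'offset' up to the next multiple of 'boundary'.
 * For example, offset 10 with boundary 8 yields 8 - (10 % 8) = 6 blocks.
 */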
2337 static uint32_t
2338 _to_next_boundary(uint64_t offset, uint32_t boundary)
2339 {
2340 	return (boundary - (offset % boundary));
2341 }
2342 
2343 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
2344 
2345 static void _bdev_rw_split(void *_bdev_io);
2346 
2347 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
2348 
2349 static void
2350 _bdev_unmap_split(void *_bdev_io)
2351 {
2352 	bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
2353 }
2354 
2355 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
2356 
2357 static void
2358 _bdev_write_zeroes_split(void *_bdev_io)
2359 {
2360 	bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
2361 }
2362 
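/* Submit one child I/O covering 'num_blocks' blocks at '*offset'. On success
 * the caller's offset/remaining counters are advanced. On -ENOMEM with no other
 * children outstanding, the parent is queued to resume splitting once a bdev_io
 * is freed; any other error fails the parent.
 */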
2363 static int
2364 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
2365 		     uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
2366 {
2367 	int rc;
2368 	uint64_t current_offset, current_remaining;
2369 	spdk_bdev_io_wait_cb io_wait_fn;
2370 
2371 	current_offset = *offset;
2372 	current_remaining = *remaining;
2373 
2374 	bdev_io->u.bdev.split_outstanding++;
2375 
2376 	io_wait_fn = _bdev_rw_split;
2377 	switch (bdev_io->type) {
2378 	case SPDK_BDEV_IO_TYPE_READ:
2379 		rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
2380 					       spdk_io_channel_from_ctx(bdev_io->internal.ch),
2381 					       iov, iovcnt, md_buf, current_offset,
2382 					       num_blocks,
2383 					       bdev_io_split_done, bdev_io,
2384 					       bdev_io->internal.ext_opts, true);
2385 		break;
2386 	case SPDK_BDEV_IO_TYPE_WRITE:
2387 		rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
2388 						spdk_io_channel_from_ctx(bdev_io->internal.ch),
2389 						iov, iovcnt, md_buf, current_offset,
2390 						num_blocks,
2391 						bdev_io_split_done, bdev_io,
2392 						bdev_io->internal.ext_opts, true);
2393 		break;
2394 	case SPDK_BDEV_IO_TYPE_UNMAP:
2395 		io_wait_fn = _bdev_unmap_split;
2396 		rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
2397 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
2398 					    current_offset, num_blocks,
2399 					    bdev_io_split_done, bdev_io);
2400 		break;
2401 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2402 		io_wait_fn = _bdev_write_zeroes_split;
2403 		rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
2404 						   spdk_io_channel_from_ctx(bdev_io->internal.ch),
2405 						   current_offset, num_blocks,
2406 						   bdev_io_split_done, bdev_io);
2407 		break;
2408 	default:
2409 		assert(false);
2410 		rc = -EINVAL;
2411 		break;
2412 	}
2413 
2414 	if (rc == 0) {
2415 		current_offset += num_blocks;
2416 		current_remaining -= num_blocks;
2417 		bdev_io->u.bdev.split_current_offset_blocks = current_offset;
2418 		bdev_io->u.bdev.split_remaining_num_blocks = current_remaining;
2419 		*offset = current_offset;
2420 		*remaining = current_remaining;
2421 	} else {
2422 		bdev_io->u.bdev.split_outstanding--;
2423 		if (rc == -ENOMEM) {
2424 			if (bdev_io->u.bdev.split_outstanding == 0) {
2425 				/* No I/O is outstanding. Hence we should wait here. */
2426 				bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
2427 			}
2428 		} else {
2429 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2430 			if (bdev_io->u.bdev.split_outstanding == 0) {
2431 				spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2432 				TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2433 				bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2434 			}
2435 		}
2436 	}
2437 
2438 	return rc;
2439 }
2440 
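/* Carve the next batch of child I/O out of the parent's iovecs. Each child is
 * clipped to the optimal_io_boundary, max_segment_size and max_num_segments
 * constraints, and trimmed back to a block-size multiple when child_iov space
 * runs out.
 */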
2441 static void
2442 _bdev_rw_split(void *_bdev_io)
2443 {
2444 	struct iovec *parent_iov, *iov;
2445 	struct spdk_bdev_io *bdev_io = _bdev_io;
2446 	struct spdk_bdev *bdev = bdev_io->bdev;
2447 	uint64_t parent_offset, current_offset, remaining;
2448 	uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
2449 	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
2450 	uint32_t iovcnt, iov_len, child_iovsize;
2451 	uint32_t blocklen = bdev->blocklen;
2452 	uint32_t io_boundary = bdev->optimal_io_boundary;
2453 	uint32_t max_segment_size = bdev->max_segment_size;
2454 	uint32_t max_child_iovcnt = bdev->max_num_segments;
2455 	void *md_buf = NULL;
2456 	int rc;
2457 
2458 	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
2459 	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) :
2460 			   BDEV_IO_NUM_CHILD_IOV;
2461 	io_boundary = bdev->split_on_optimal_io_boundary ? io_boundary : UINT32_MAX;
2462 
2463 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2464 	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
2465 	parent_offset = bdev_io->u.bdev.offset_blocks;
2466 	parent_iov_offset = (current_offset - parent_offset) * blocklen;
2467 	parent_iovcnt = bdev_io->u.bdev.iovcnt;
2468 
2469 	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
2470 		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
2471 		if (parent_iov_offset < parent_iov->iov_len) {
2472 			break;
2473 		}
2474 		parent_iov_offset -= parent_iov->iov_len;
2475 	}
2476 
2477 	child_iovcnt = 0;
2478 	while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
2479 		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
2480 		to_next_boundary = spdk_min(remaining, to_next_boundary);
2481 		to_next_boundary_bytes = to_next_boundary * blocklen;
2482 
2483 		iov = &bdev_io->child_iov[child_iovcnt];
2484 		iovcnt = 0;
2485 
2486 		if (bdev_io->u.bdev.md_buf) {
2487 			md_buf = (char *)bdev_io->u.bdev.md_buf +
2488 				 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
2489 		}
2490 
2491 		child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
2492 		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
2493 		       iovcnt < child_iovsize) {
2494 			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
2495 			iov_len = parent_iov->iov_len - parent_iov_offset;
2496 
2497 			iov_len = spdk_min(iov_len, max_segment_size);
2498 			iov_len = spdk_min(iov_len, to_next_boundary_bytes);
2499 			to_next_boundary_bytes -= iov_len;
2500 
2501 			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
2502 			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
2503 
2504 			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
2505 				parent_iov_offset += iov_len;
2506 			} else {
2507 				parent_iovpos++;
2508 				parent_iov_offset = 0;
2509 			}
2510 			child_iovcnt++;
2511 			iovcnt++;
2512 		}
2513 
2514 		if (to_next_boundary_bytes > 0) {
2515 			/* We had to stop this child I/O early because we ran out of
2516 			 * child_iov space or were limited by max_num_segments.
2517 			 * Ensure the iovs are aligned to the block size and
2518 			 * then adjust to_next_boundary before starting the
2519 			 * child I/O.
2520 			 */
2521 			assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV ||
2522 			       iovcnt == child_iovsize);
2523 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
2524 			if (to_last_block_bytes != 0) {
2525 				uint32_t child_iovpos = child_iovcnt - 1;
2526 				/* Don't decrease child_iovcnt when it equals BDEV_IO_NUM_CHILD_IOV,
2527 				 * so the loop will naturally end.
2528 				 */
2529 
2530 				to_last_block_bytes = blocklen - to_last_block_bytes;
2531 				to_next_boundary_bytes += to_last_block_bytes;
2532 				while (to_last_block_bytes > 0 && iovcnt > 0) {
2533 					iov_len = spdk_min(to_last_block_bytes,
2534 							   bdev_io->child_iov[child_iovpos].iov_len);
2535 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
2536 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
2537 						child_iovpos--;
2538 						if (--iovcnt == 0) {
2539 							/* If the child IO is less than a block size, just return.
2540 							 * If the first child IO of any split round is less than
2541 							 * a block size, fail the parent IO and exit.
2542 							 */
2543 							if (bdev_io->u.bdev.split_outstanding == 0) {
2544 								SPDK_ERRLOG("The first child io was less than a block size\n");
2545 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2546 								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2547 								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2548 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2549 							}
2550 
2551 							return;
2552 						}
2553 					}
2554 
2555 					to_last_block_bytes -= iov_len;
2556 
2557 					if (parent_iov_offset == 0) {
2558 						parent_iovpos--;
2559 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
2560 					}
2561 					parent_iov_offset -= iov_len;
2562 				}
2563 
2564 				assert(to_last_block_bytes == 0);
2565 			}
2566 			to_next_boundary -= to_next_boundary_bytes / blocklen;
2567 		}
2568 
2569 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
2570 					  &current_offset, &remaining);
2571 		if (spdk_unlikely(rc)) {
2572 			return;
2573 		}
2574 	}
2575 }
2576 
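/* Issue up to SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS child unmaps at a
 * time, each covering at most max_unmap * max_unmap_segments blocks;
 * bdev_io_split_done() re-enters this function until the range is consumed.
 */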
2577 static void
2578 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
2579 {
2580 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
2581 	uint32_t num_children_reqs = 0;
2582 	int rc;
2583 
2584 	offset = bdev_io->u.bdev.split_current_offset_blocks;
2585 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2586 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
2587 
2588 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2589 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
2590 
2591 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
2592 					  &offset, &remaining);
2593 		if (spdk_likely(rc == 0)) {
2594 			num_children_reqs++;
2595 		} else {
2596 			return;
2597 		}
2598 	}
2599 }
2600 
2601 static void
2602 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
2603 {
2604 	uint64_t offset, write_zeroes_blocks, remaining;
2605 	uint32_t num_children_reqs = 0;
2606 	int rc;
2607 
2608 	offset = bdev_io->u.bdev.split_current_offset_blocks;
2609 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2610 
2611 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2612 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
2613 
2614 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
2615 					  &offset, &remaining);
2616 		if (spdk_likely(rc == 0)) {
2617 			num_children_reqs++;
2618 		} else {
2619 			return;
2620 		}
2621 	}
2622 }
2623 
2624 static void
2625 parent_bdev_io_complete(void *ctx, int rc)
2626 {
2627 	struct spdk_bdev_io *parent_io = ctx;
2628 
2629 	if (rc) {
2630 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2631 	}
2632 
2633 	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
2634 			       parent_io->internal.caller_ctx);
2635 }
2636 
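/* Completion callback for every child I/O of a split. The parent completes
 * once all outstanding children have finished and either all blocks were
 * consumed or a child failed; otherwise splitting resumes where it left off.
 */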
2637 static void
2638 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
2639 {
2640 	struct spdk_bdev_io *parent_io = cb_arg;
2641 
2642 	spdk_bdev_free_io(bdev_io);
2643 
2644 	if (!success) {
2645 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2646 		/* If any child I/O failed, stop the splitting process. */
2647 		parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
2648 		parent_io->u.bdev.split_remaining_num_blocks = 0;
2649 	}
2650 	parent_io->u.bdev.split_outstanding--;
2651 	if (parent_io->u.bdev.split_outstanding != 0) {
2652 		return;
2653 	}
2654 
2655 	/*
2656 	 * Parent I/O finishes when all blocks are consumed.
2657 	 */
2658 	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
2659 		assert(parent_io->internal.cb != bdev_io_split_done);
2660 		spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, parent_io->internal.caller_ctx);
2661 		TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link);
2662 
2663 		if (parent_io->internal.orig_iovcnt != 0) {
2664 			_bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
2665 			/* bdev IO will be completed in the callback */
2666 		} else {
2667 			parent_bdev_io_complete(parent_io, 0);
2668 		}
2669 		return;
2670 	}
2671 
2672 	/*
2673 	 * Continue with the splitting process.  This function will complete the parent I/O if the
2674 	 * splitting is done.
2675 	 */
2676 	switch (parent_io->type) {
2677 	case SPDK_BDEV_IO_TYPE_READ:
2678 	case SPDK_BDEV_IO_TYPE_WRITE:
2679 		_bdev_rw_split(parent_io);
2680 		break;
2681 	case SPDK_BDEV_IO_TYPE_UNMAP:
2682 		bdev_unmap_split(parent_io);
2683 		break;
2684 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2685 		bdev_write_zeroes_split(parent_io);
2686 		break;
2687 	default:
2688 		assert(false);
2689 		break;
2690 	}
2691 }
2692 
2693 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
2694 				     bool success);
2695 
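/* Entry point for splitting a parent I/O into child I/O. Reads that don't have
 * a data buffer yet get one via spdk_bdev_io_get_buf() first; everything else
 * starts splitting immediately.
 */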
2696 static void
2697 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
2698 {
2699 	bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
2700 	bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
2701 	bdev_io->u.bdev.split_outstanding = 0;
2702 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
2703 
2704 	switch (bdev_io->type) {
2705 	case SPDK_BDEV_IO_TYPE_READ:
2706 	case SPDK_BDEV_IO_TYPE_WRITE:
2707 		if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
2708 			_bdev_rw_split(bdev_io);
2709 		} else {
2710 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
2711 			spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
2712 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
2713 		}
2714 		break;
2715 	case SPDK_BDEV_IO_TYPE_UNMAP:
2716 		bdev_unmap_split(bdev_io);
2717 		break;
2718 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2719 		bdev_write_zeroes_split(bdev_io);
2720 		break;
2721 	default:
2722 		assert(false);
2723 		break;
2724 	}
2725 }
2726 
2727 static void
2728 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
2729 {
2730 	if (!success) {
2731 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2732 		return;
2733 	}
2734 
2735 	_bdev_rw_split(bdev_io);
2736 }
2737 
2738 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't
2739  *  be inlined, at least on some compilers.
2740  */
2741 static inline void
2742 _bdev_io_submit(void *ctx)
2743 {
2744 	struct spdk_bdev_io *bdev_io = ctx;
2745 	struct spdk_bdev *bdev = bdev_io->bdev;
2746 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
2747 	uint64_t tsc;
2748 
2749 	tsc = spdk_get_ticks();
2750 	bdev_io->internal.submit_tsc = tsc;
2751 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io,
2752 			      (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
2753 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
2754 
2755 	if (spdk_likely(bdev_ch->flags == 0)) {
2756 		bdev_io_do_submit(bdev_ch, bdev_io);
2757 		return;
2758 	}
2759 
2760 	if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
2761 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
2762 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
2763 		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
2764 		    bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) {
2765 			_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2766 		} else {
2767 			TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
2768 			bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
2769 		}
2770 	} else {
2771 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
2772 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2773 	}
2774 }
2775 
2776 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
2777 
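/* Two block ranges [offset, offset + length) overlap unless one ends at or
 * before the start of the other; zero-length ranges never overlap anything.
 */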
2778 bool
2779 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
2780 {
2781 	if (range1->length == 0 || range2->length == 0) {
2782 		return false;
2783 	}
2784 
2785 	if (range1->offset + range1->length <= range2->offset) {
2786 		return false;
2787 	}
2788 
2789 	if (range2->offset + range2->length <= range1->offset) {
2790 		return false;
2791 	}
2792 
2793 	return true;
2794 }
2795 
2796 static bool
2797 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
2798 {
2799 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2800 	struct lba_range r;
2801 
2802 	switch (bdev_io->type) {
2803 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2804 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2805 		/* Don't try to decode the NVMe command - just assume worst-case and that
2806 		 * it overlaps a locked range.
2807 		 */
2808 		return true;
2809 	case SPDK_BDEV_IO_TYPE_WRITE:
2810 	case SPDK_BDEV_IO_TYPE_UNMAP:
2811 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2812 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2813 		r.offset = bdev_io->u.bdev.offset_blocks;
2814 		r.length = bdev_io->u.bdev.num_blocks;
2815 		if (!bdev_lba_range_overlapped(range, &r)) {
2816 			/* This I/O doesn't overlap the specified LBA range. */
2817 			return false;
2818 		} else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
2819 			/* This I/O overlaps, but the I/O is on the same channel that locked this
2820 			 * range, and the caller_ctx is the same as the locked_ctx.  This means
2821 			 * that this I/O is associated with the lock, and is allowed to execute.
2822 			 */
2823 			return false;
2824 		} else {
2825 			return true;
2826 		}
2827 	default:
2828 		return false;
2829 	}
2830 }
2831 
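/* Main submission path. I/O overlapping a locked LBA range is parked on
 * io_locked until the range is unlocked, oversized I/O is split, and on
 * QoS-enabled channels the I/O is forwarded to the QoS thread if we are not
 * already on it; everything else goes straight to _bdev_io_submit().
 */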
2832 void
2833 bdev_io_submit(struct spdk_bdev_io *bdev_io)
2834 {
2835 	struct spdk_bdev *bdev = bdev_io->bdev;
2836 	struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io);
2837 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2838 
2839 	assert(thread != NULL);
2840 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
2841 
2842 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
2843 		struct lba_range *range;
2844 
2845 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
2846 			if (bdev_io_range_is_locked(bdev_io, range)) {
2847 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
2848 				return;
2849 			}
2850 		}
2851 	}
2852 
2853 	TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link);
2854 
2855 	if (bdev_io_should_split(bdev_io)) {
2856 		bdev_io->internal.submit_tsc = spdk_get_ticks();
2857 		spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0,
2858 				      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
2859 				      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
2860 		bdev_io_split(NULL, bdev_io);
2861 		return;
2862 	}
2863 
2864 	if (ch->flags & BDEV_CH_QOS_ENABLED) {
2865 		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
2866 			_bdev_io_submit(bdev_io);
2867 		} else {
2868 			bdev_io->internal.io_submit_ch = ch;
2869 			bdev_io->internal.ch = bdev->internal.qos->ch;
2870 			spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io);
2871 		}
2872 	} else {
2873 		_bdev_io_submit(bdev_io);
2874 	}
2875 }
2876 
2877 static inline void
2878 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
2879 {
2880 	struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy;
2881 
2882 	/* Zero the part of the copy that the caller's opts don't cover */
2883 	memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size);
2884 	memcpy(opts_copy, opts, opts->size);
2885 	opts_copy->size = sizeof(*opts_copy);
2886 	opts_copy->metadata = bdev_io->u.bdev.md_buf;
2887 	/* Save pointer to the copied ext_opts which will be used by bdev modules */
2888 	bdev_io->u.bdev.ext_opts = opts_copy;
2889 }
2890 
2891 static inline void
2892 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
2893 {
2894 	/* The bdev doesn't support memory domains, so the buffers in this IO request can't
2895 	 * be accessed directly. We need to allocate bounce buffers before issuing the IO.
2896 	 * For a write operation, the data is pulled from the memory domain before the IO is
2897 	 * submitted. Once a read operation completes, the memory_domain push functionality is
2898 	 * used to update the data in the original memory domain IO buffer.
2899 	 * This IO request will go through a regular IO flow, so clear the memory domain
2900 	 * pointers in the copied ext_opts. */
2901 	bdev_io->internal.ext_opts_copy.memory_domain = NULL;
2902 	bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL;
2903 	_bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb,
2904 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
2905 }
2906 
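/* Submit an I/O that carries ext_opts. The opts are copied into the bdev_io
 * when the caller requests it, when the caller's struct is smaller (older) than
 * ours, or when bounce buffers are needed because the bdev doesn't support the
 * caller's memory domain; in the bounce-buffer case submission continues from
 * the pull/push path instead.
 */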
2907 static inline void
2908 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io,
2909 		    struct spdk_bdev_ext_io_opts *opts, bool copy_opts)
2910 {
2911 	if (opts) {
2912 		bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported;
2913 		assert(opts->size <= sizeof(*opts));
2914 		/*
2915 		 * Copy if the caller's size is smaller than our opts struct, to avoid having
2916 		 * to check the size on every access to bdev_io->u.bdev.ext_opts.
2917 		 */
2918 		if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) {
2919 			_bdev_io_copy_ext_opts(bdev_io, opts);
2920 			if (use_pull_push) {
2921 				_bdev_io_ext_use_bounce_buffer(bdev_io);
2922 				return;
2923 			}
2924 		}
2925 	}
2926 	bdev_io_submit(bdev_io);
2927 }
2928 
2929 static void
2930 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
2931 {
2932 	struct spdk_bdev *bdev = bdev_io->bdev;
2933 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
2934 	struct spdk_io_channel *ch = bdev_ch->channel;
2935 
2936 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
2937 
2938 	bdev_io->internal.in_submit_request = true;
2939 	bdev->fn_table->submit_request(ch, bdev_io);
2940 	bdev_io->internal.in_submit_request = false;
2941 }
2942 
2943 void
2944 bdev_io_init(struct spdk_bdev_io *bdev_io,
2945 	     struct spdk_bdev *bdev, void *cb_arg,
2946 	     spdk_bdev_io_completion_cb cb)
2947 {
2948 	bdev_io->bdev = bdev;
2949 	bdev_io->internal.caller_ctx = cb_arg;
2950 	bdev_io->internal.cb = cb;
2951 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
2952 	bdev_io->internal.in_submit_request = false;
2953 	bdev_io->internal.buf = NULL;
2954 	bdev_io->internal.io_submit_ch = NULL;
2955 	bdev_io->internal.orig_iovs = NULL;
2956 	bdev_io->internal.orig_iovcnt = 0;
2957 	bdev_io->internal.orig_md_iov.iov_base = NULL;
2958 	bdev_io->internal.error.nvme.cdw0 = 0;
2959 	bdev_io->num_retries = 0;
2960 	bdev_io->internal.get_buf_cb = NULL;
2961 	bdev_io->internal.get_aux_buf_cb = NULL;
2962 	bdev_io->internal.ext_opts = NULL;
2963 	bdev_io->internal.data_transfer_cpl = NULL;
2964 }
2965 
2966 static bool
2967 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
2968 {
2969 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
2970 }
2971 
2972 bool
2973 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
2974 {
2975 	bool supported;
2976 
2977 	supported = bdev_io_type_supported(bdev, io_type);
2978 
2979 	if (!supported) {
2980 		switch (io_type) {
2981 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2982 			/* The bdev layer will emulate write zeroes as long as write is supported. */
2983 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
2984 			break;
2985 		default:
2986 			break;
2987 		}
2988 	}
2989 
2990 	return supported;
2991 }
2992 
2993 int
2994 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2995 {
2996 	if (bdev->fn_table->dump_info_json) {
2997 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
2998 	}
2999 
3000 	return 0;
3001 }
3002 
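/* Convert each per-second limit into a per-timeslice quota. With the default
 * 1000 usec timeslice, a 10000 IOPS limit becomes 10000 * 1000 / 1000000 = 10
 * I/O per timeslice (never less than min_per_timeslice).
 */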
3003 static void
3004 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3005 {
3006 	uint32_t max_per_timeslice = 0;
3007 	int i;
3008 
3009 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3010 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3011 			qos->rate_limits[i].max_per_timeslice = 0;
3012 			continue;
3013 		}
3014 
3015 		max_per_timeslice = qos->rate_limits[i].limit *
3016 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3017 
3018 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3019 							qos->rate_limits[i].min_per_timeslice);
3020 
3021 		qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice;
3022 	}
3023 
3024 	bdev_qos_set_ops(qos);
3025 }
3026 
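/* QoS poller, registered on the QoS thread. Each expired timeslice replenishes
 * every limit by max_per_timeslice; a negative remaining_this_timeslice (an
 * overrun from the previous timeslice) carries forward so the next timeslice is
 * reduced accordingly. Finishes by resubmitting queued I/O.
 */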
3027 static int
3028 bdev_channel_poll_qos(void *arg)
3029 {
3030 	struct spdk_bdev_qos *qos = arg;
3031 	uint64_t now = spdk_get_ticks();
3032 	int i;
3033 
3034 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3035 		/* We received our callback earlier than expected - return
3036 		 *  immediately and wait to do accounting until at least one
3037 		 *  timeslice has actually expired.  This should never happen
3038 		 *  with a well-behaved timer implementation.
3039 		 */
3040 		return SPDK_POLLER_IDLE;
3041 	}
3042 
3043 	/* Reset for next round of rate limiting */
3044 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3045 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3046 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3047 		 * here, we'll account for the overrun so that the next timeslice will
3048 		 * be appropriately reduced.
3049 		 */
3050 		if (qos->rate_limits[i].remaining_this_timeslice > 0) {
3051 			qos->rate_limits[i].remaining_this_timeslice = 0;
3052 		}
3053 	}
3054 
3055 	while (now >= (qos->last_timeslice + qos->timeslice_size)) {
3056 		qos->last_timeslice += qos->timeslice_size;
3057 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3058 			qos->rate_limits[i].remaining_this_timeslice +=
3059 				qos->rate_limits[i].max_per_timeslice;
3060 		}
3061 	}
3062 
3063 	return bdev_qos_io_submit(qos->ch, qos);
3064 }
3065 
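/* Release everything bdev_channel_create() set up: the per-channel locked-range
 * copies, the module I/O channel, and the shared_resource reference (the
 * shared_resource itself is freed when its last user goes away).
 */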
3066 static void
3067 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
3068 {
3069 	struct spdk_bdev_shared_resource *shared_resource;
3070 	struct lba_range *range;
3071 
3072 	while (!TAILQ_EMPTY(&ch->locked_ranges)) {
3073 		range = TAILQ_FIRST(&ch->locked_ranges);
3074 		TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
3075 		free(range);
3076 	}
3077 
3078 	spdk_put_io_channel(ch->channel);
3079 
3080 	shared_resource = ch->shared_resource;
3081 
3082 	assert(TAILQ_EMPTY(&ch->io_locked));
3083 	assert(TAILQ_EMPTY(&ch->io_submitted));
3084 	assert(ch->io_outstanding == 0);
3085 	assert(shared_resource->ref > 0);
3086 	shared_resource->ref--;
3087 	if (shared_resource->ref == 0) {
3088 		assert(shared_resource->io_outstanding == 0);
3089 		TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
3090 		spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
3091 		free(shared_resource);
3092 	}
3093 }
3094 
3095 /* Caller must hold bdev->internal.mutex. */
3096 static void
3097 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
3098 {
3099 	struct spdk_bdev_qos	*qos = bdev->internal.qos;
3100 	int			i;
3101 
3102 	/* Rate limiting is enabled on this bdev */
3103 	if (qos) {
3104 		if (qos->ch == NULL) {
3105 			struct spdk_io_channel *io_ch;
3106 
3107 			SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
3108 				      bdev->name, spdk_get_thread());
3109 
3110 			/* No qos channel has been selected, so set one up */
3111 
3112 			/* Take another reference to ch */
3113 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
3114 			assert(io_ch != NULL);
3115 			qos->ch = ch;
3116 
3117 			qos->thread = spdk_io_channel_get_thread(io_ch);
3118 
3119 			TAILQ_INIT(&qos->queued);
3120 
3121 			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3122 				if (bdev_qos_is_iops_rate_limit(i) == true) {
3123 					qos->rate_limits[i].min_per_timeslice =
3124 						SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
3125 				} else {
3126 					qos->rate_limits[i].min_per_timeslice =
3127 						SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
3128 				}
3129 
3130 				if (qos->rate_limits[i].limit == 0) {
3131 					qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
3132 				}
3133 			}
3134 			bdev_qos_update_max_quota_per_timeslice(qos);
3135 			qos->timeslice_size =
3136 				SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
3137 			qos->last_timeslice = spdk_get_ticks();
3138 			qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
3139 							   qos,
3140 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
3141 		}
3142 
3143 		ch->flags |= BDEV_CH_QOS_ENABLED;
3144 	}
3145 }
3146 
3147 struct poll_timeout_ctx {
3148 	struct spdk_bdev_desc	*desc;
3149 	uint64_t		timeout_in_sec;
3150 	spdk_bdev_io_timeout_cb	cb_fn;
3151 	void			*cb_arg;
3152 };
3153 
3154 static void
3155 bdev_desc_free(struct spdk_bdev_desc *desc)
3156 {
3157 	pthread_mutex_destroy(&desc->mutex);
3158 	free(desc->media_events_buffer);
3159 	free(desc);
3160 }
3161 
3162 static void
3163 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status)
3164 {
3165 	struct poll_timeout_ctx *ctx  = spdk_io_channel_iter_get_ctx(i);
3166 	struct spdk_bdev_desc *desc = ctx->desc;
3167 
3168 	free(ctx);
3169 
3170 	pthread_mutex_lock(&desc->mutex);
3171 	desc->refs--;
3172 	if (desc->closed == true && desc->refs == 0) {
3173 		pthread_mutex_unlock(&desc->mutex);
3174 		bdev_desc_free(desc);
3175 		return;
3176 	}
3177 	pthread_mutex_unlock(&desc->mutex);
3178 }
3179 
3180 static void
3181 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i)
3182 {
3183 	struct poll_timeout_ctx *ctx  = spdk_io_channel_iter_get_ctx(i);
3184 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
3185 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
3186 	struct spdk_bdev_desc *desc = ctx->desc;
3187 	struct spdk_bdev_io *bdev_io;
3188 	uint64_t now;
3189 
3190 	pthread_mutex_lock(&desc->mutex);
3191 	if (desc->closed == true) {
3192 		pthread_mutex_unlock(&desc->mutex);
3193 		spdk_for_each_channel_continue(i, -1);
3194 		return;
3195 	}
3196 	pthread_mutex_unlock(&desc->mutex);
3197 
3198 	now = spdk_get_ticks();
3199 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
3200 		/* Exclude any I/O that are generated via splitting. */
3201 		if (bdev_io->internal.cb == bdev_io_split_done) {
3202 			continue;
3203 		}
3204 
3205 		/* Once we find an I/O that has not timed out, we can immediately
3206 		 * exit the loop.
3207 		 */
3208 		if (now < (bdev_io->internal.submit_tsc +
3209 			   ctx->timeout_in_sec * spdk_get_ticks_hz())) {
3210 			goto end;
3211 		}
3212 
3213 		if (bdev_io->internal.desc == desc) {
3214 			ctx->cb_fn(ctx->cb_arg, bdev_io);
3215 		}
3216 	}
3217 
3218 end:
3219 	spdk_for_each_channel_continue(i, 0);
3220 }
3221 
3222 static int
3223 bdev_poll_timeout_io(void *arg)
3224 {
3225 	struct spdk_bdev_desc *desc = arg;
3226 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
3227 	struct poll_timeout_ctx *ctx;
3228 
3229 	ctx = calloc(1, sizeof(struct poll_timeout_ctx));
3230 	if (!ctx) {
3231 		SPDK_ERRLOG("failed to allocate memory\n");
3232 		return SPDK_POLLER_BUSY;
3233 	}
3234 	ctx->desc = desc;
3235 	ctx->cb_arg = desc->cb_arg;
3236 	ctx->cb_fn = desc->cb_fn;
3237 	ctx->timeout_in_sec = desc->timeout_in_sec;
3238 
3239 	/* Take a ref on the descriptor in case it gets closed while we are checking
3240 	 * all of the channels.
3241 	 */
3242 	pthread_mutex_lock(&desc->mutex);
3243 	desc->refs++;
3244 	pthread_mutex_unlock(&desc->mutex);
3245 
3246 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
3247 			      bdev_channel_poll_timeout_io,
3248 			      ctx,
3249 			      bdev_channel_poll_timeout_io_done);
3250 
3251 	return SPDK_POLLER_BUSY;
3252 }
3253 
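/* Register (or, with timeout_in_sec == 0, unregister) a per-descriptor I/O
 * timeout poller. Every SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC milliseconds the
 * poller walks all channels and invokes cb_fn for each I/O submitted through
 * this descriptor that has been outstanding longer than timeout_in_sec seconds
 * (split children are excluded).
 *
 * A minimal usage sketch (the callback name and its reaction are illustrative
 * only; a real application might abort the I/O instead of just logging it):
 *
 *	static void
 *	my_io_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
 *	{
 *		SPDK_ERRLOG("bdev I/O timed out\n");
 *	}
 *
 *	spdk_bdev_set_timeout(desc, 30, my_io_timeout_cb, NULL);
 */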
3254 int
3255 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
3256 		      spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
3257 {
3258 	assert(desc->thread == spdk_get_thread());
3259 
3260 	spdk_poller_unregister(&desc->io_timeout_poller);
3261 
3262 	if (timeout_in_sec) {
3263 		assert(cb_fn != NULL);
3264 		desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
3265 					  desc,
3266 					  SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
3267 					  1000);
3268 		if (desc->io_timeout_poller == NULL) {
3269 			SPDK_ERRLOG("cannot register the desc timeout IO poller\n");
3270 			return -1;
3271 		}
3272 	}
3273 
3274 	desc->cb_fn = cb_fn;
3275 	desc->cb_arg = cb_arg;
3276 	desc->timeout_in_sec = timeout_in_sec;
3277 
3278 	return 0;
3279 }
3280 
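/* io_channel create callback for the bdev io_device. Sets up the per-thread
 * bdev channel: the module's I/O channel, the shared_resource (shared among
 * bdev channels built on the same module channel), optional QoS wiring, and
 * per-channel copies of any LBA ranges currently locked on the bdev.
 */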
3281 static int
3282 bdev_channel_create(void *io_device, void *ctx_buf)
3283 {
3284 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
3285 	struct spdk_bdev_channel	*ch = ctx_buf;
3286 	struct spdk_io_channel		*mgmt_io_ch;
3287 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
3288 	struct spdk_bdev_shared_resource *shared_resource;
3289 	struct lba_range		*range;
3290 
3291 	ch->bdev = bdev;
3292 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
3293 	if (!ch->channel) {
3294 		return -1;
3295 	}
3296 
3297 	spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name,
3298 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
3299 
3300 	assert(ch->histogram == NULL);
3301 	if (bdev->internal.histogram_enabled) {
3302 		ch->histogram = spdk_histogram_data_alloc();
3303 		if (ch->histogram == NULL) {
3304 			SPDK_ERRLOG("Could not allocate histogram\n");
3305 		}
3306 	}
3307 
3308 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
3309 	if (!mgmt_io_ch) {
3310 		spdk_put_io_channel(ch->channel);
3311 		return -1;
3312 	}
3313 
3314 	mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
3315 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
3316 		if (shared_resource->shared_ch == ch->channel) {
3317 			spdk_put_io_channel(mgmt_io_ch);
3318 			shared_resource->ref++;
3319 			break;
3320 		}
3321 	}
3322 
3323 	if (shared_resource == NULL) {
3324 		shared_resource = calloc(1, sizeof(*shared_resource));
3325 		if (shared_resource == NULL) {
3326 			spdk_put_io_channel(ch->channel);
3327 			spdk_put_io_channel(mgmt_io_ch);
3328 			return -1;
3329 		}
3330 
3331 		shared_resource->mgmt_ch = mgmt_ch;
3332 		shared_resource->io_outstanding = 0;
3333 		TAILQ_INIT(&shared_resource->nomem_io);
3334 		shared_resource->nomem_threshold = 0;
3335 		shared_resource->shared_ch = ch->channel;
3336 		shared_resource->ref = 1;
3337 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
3338 	}
3339 
3340 	memset(&ch->stat, 0, sizeof(ch->stat));
3341 	ch->stat.ticks_rate = spdk_get_ticks_hz();
3342 	ch->io_outstanding = 0;
3343 	TAILQ_INIT(&ch->queued_resets);
3344 	TAILQ_INIT(&ch->locked_ranges);
3345 	ch->flags = 0;
3346 	ch->shared_resource = shared_resource;
3347 
3348 	TAILQ_INIT(&ch->io_submitted);
3349 	TAILQ_INIT(&ch->io_locked);
3350 
3351 #ifdef SPDK_CONFIG_VTUNE
3352 	{
3353 		char *name;
3354 		__itt_init_ittlib(NULL, 0);
3355 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
3356 		if (!name) {
3357 			bdev_channel_destroy_resource(ch);
3358 			return -1;
3359 		}
3360 		ch->handle = __itt_string_handle_create(name);
3361 		free(name);
3362 		ch->start_tsc = spdk_get_ticks();
3363 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
3364 		memset(&ch->prev_stat, 0, sizeof(ch->prev_stat));
3365 	}
3366 #endif
3367 
3368 	pthread_mutex_lock(&bdev->internal.mutex);
3369 	bdev_enable_qos(bdev, ch);
3370 
3371 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
3372 		struct lba_range *new_range;
3373 
3374 		new_range = calloc(1, sizeof(*new_range));
3375 		if (new_range == NULL) {
3376 			pthread_mutex_unlock(&bdev->internal.mutex);
3377 			bdev_channel_destroy_resource(ch);
3378 			return -1;
3379 		}
3380 		new_range->length = range->length;
3381 		new_range->offset = range->offset;
3382 		new_range->locked_ctx = range->locked_ctx;
3383 		TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
3384 	}
3385 
3386 	pthread_mutex_unlock(&bdev->internal.mutex);
3387 
3388 	return 0;
3389 }
3390 
3391 /*
3392  * Abort I/O that are waiting on a data buffer.  These types of I/O are
3393  *  linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY.
3394  */
3395 static void
3396 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
3397 {
3398 	bdev_io_stailq_t tmp;
3399 	struct spdk_bdev_io *bdev_io;
3400 
3401 	STAILQ_INIT(&tmp);
3402 
3403 	while (!STAILQ_EMPTY(queue)) {
3404 		bdev_io = STAILQ_FIRST(queue);
3405 		STAILQ_REMOVE_HEAD(queue, internal.buf_link);
3406 		if (bdev_io->internal.ch == ch) {
3407 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3408 		} else {
3409 			STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link);
3410 		}
3411 	}
3412 
3413 	STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
3414 }
3415 
3416 /*
3417  * Abort I/O that are queued waiting for submission.  These types of I/O are
3418  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
3419  */
3420 static void
3421 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
3422 {
3423 	struct spdk_bdev_io *bdev_io, *tmp;
3424 
3425 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
3426 		if (bdev_io->internal.ch == ch) {
3427 			TAILQ_REMOVE(queue, bdev_io, internal.link);
3428 			/*
3429 			 * spdk_bdev_io_complete() assumes that the completed I/O had
3430 			 *  been submitted to the bdev module.  Since in this case it
3431 			 *  hadn't, bump io_outstanding to account for the decrement
3432 			 *  that spdk_bdev_io_complete() will do.
3433 			 */
3434 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
3435 				ch->io_outstanding++;
3436 				ch->shared_resource->io_outstanding++;
3437 			}
3438 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3439 		}
3440 	}
3441 }
3442 
3443 static bool
3444 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
3445 {
3446 	struct spdk_bdev_io *bdev_io;
3447 
3448 	TAILQ_FOREACH(bdev_io, queue, internal.link) {
3449 		if (bdev_io == bio_to_abort) {
3450 			TAILQ_REMOVE(queue, bio_to_abort, internal.link);
3451 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
3452 			return true;
3453 		}
3454 	}
3455 
3456 	return false;
3457 }
3458 
3459 static bool
3460 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort)
3461 {
3462 	struct spdk_bdev_io *bdev_io;
3463 
3464 	STAILQ_FOREACH(bdev_io, queue, internal.buf_link) {
3465 		if (bdev_io == bio_to_abort) {
3466 			STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link);
3467 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
3468 			return true;
3469 		}
3470 	}
3471 
3472 	return false;
3473 }
3474 
3475 static void
3476 bdev_qos_channel_destroy(void *cb_arg)
3477 {
3478 	struct spdk_bdev_qos *qos = cb_arg;
3479 
3480 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
3481 	spdk_poller_unregister(&qos->poller);
3482 
3483 	SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
3484 
3485 	free(qos);
3486 }
3487 
3488 static int
3489 bdev_qos_destroy(struct spdk_bdev *bdev)
3490 {
3491 	int i;
3492 
3493 	/*
3494 	 * Cleanly shutting down the QoS poller is tricky, because
3495 	 * during the asynchronous operation the user could open
3496 	 * a new descriptor and create a new channel, spawning
3497 	 * a new QoS poller.
3498 	 *
3499 	 * The strategy is to create a new QoS structure here and swap it
3500 	 * in. The shutdown path then continues to refer to the old one
3501 	 * until it completes and then releases it.
3502 	 */
3503 	struct spdk_bdev_qos *new_qos, *old_qos;
3504 
3505 	old_qos = bdev->internal.qos;
3506 
3507 	new_qos = calloc(1, sizeof(*new_qos));
3508 	if (!new_qos) {
3509 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
3510 		return -ENOMEM;
3511 	}
3512 
3513 	/* Copy the old QoS data into the newly allocated structure */
3514 	memcpy(new_qos, old_qos, sizeof(*new_qos));
3515 
3516 	/* Zero out the key parts of the QoS structure */
3517 	new_qos->ch = NULL;
3518 	new_qos->thread = NULL;
3519 	new_qos->poller = NULL;
3520 	TAILQ_INIT(&new_qos->queued);
3521 	/*
3522 	 * The limit member of the spdk_bdev_qos_limit structure is not zeroed.
3523 	 * It will be used later for the new QoS structure.
3524 	 */
3525 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3526 		new_qos->rate_limits[i].remaining_this_timeslice = 0;
3527 		new_qos->rate_limits[i].min_per_timeslice = 0;
3528 		new_qos->rate_limits[i].max_per_timeslice = 0;
3529 	}
3530 
3531 	bdev->internal.qos = new_qos;
3532 
3533 	if (old_qos->thread == NULL) {
3534 		free(old_qos);
3535 	} else {
3536 		spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
3537 	}
3538 
3539 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
3540 	 * been destroyed yet. The destruction path will end up waiting for the final
3541 	 * channel to be put before it releases resources. */
3542 
3543 	return 0;
3544 }
3545 
3546 static void
3547 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
3548 {
3549 	total->bytes_read += add->bytes_read;
3550 	total->num_read_ops += add->num_read_ops;
3551 	total->bytes_written += add->bytes_written;
3552 	total->num_write_ops += add->num_write_ops;
3553 	total->bytes_unmapped += add->bytes_unmapped;
3554 	total->num_unmap_ops += add->num_unmap_ops;
3555 	total->read_latency_ticks += add->read_latency_ticks;
3556 	total->write_latency_ticks += add->write_latency_ticks;
3557 	total->unmap_latency_ticks += add->unmap_latency_ticks;
3558 }
3559 
3560 static void
3561 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
3562 {
3563 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
3564 	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
3565 
3566 	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
3567 	bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch);
3568 	bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch);
3569 }
3570 
3571 static void
3572 bdev_channel_destroy(void *io_device, void *ctx_buf)
3573 {
3574 	struct spdk_bdev_channel *ch = ctx_buf;
3575 
3576 	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
3577 		      spdk_get_thread());
3578 
3579 	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
3580 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
3581 
3582 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
3583 	pthread_mutex_lock(&ch->bdev->internal.mutex);
3584 	bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat);
3585 	pthread_mutex_unlock(&ch->bdev->internal.mutex);
3586 
3587 	bdev_abort_all_queued_io(&ch->queued_resets, ch);
3588 
3589 	bdev_channel_abort_queued_ios(ch);
3590 
3591 	if (ch->histogram) {
3592 		spdk_histogram_data_free(ch->histogram);
3593 	}
3594 
3595 	bdev_channel_destroy_resource(ch);
3596 }
3597 
3598 /*
3599  * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
3600  * to it. Hence we do not have to call bdev_get_by_name() when using this function.
3601  */
3602 static int
3603 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
3604 {
3605 	struct spdk_bdev_name *tmp;
3606 
3607 	bdev_name->name = strdup(name);
3608 	if (bdev_name->name == NULL) {
3609 		SPDK_ERRLOG("Unable to allocate bdev name\n");
3610 		return -ENOMEM;
3611 	}
3612 
3613 	bdev_name->bdev = bdev;
3614 
3615 	pthread_mutex_lock(&g_bdev_mgr.mutex);
3616 	tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
3617 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
3618 
3619 	if (tmp != NULL) {
3620 		SPDK_ERRLOG("Bdev name %s already exists\n", name);
3621 		free(bdev_name->name);
3622 		return -EEXIST;
3623 	}
3624 
3625 	return 0;
3626 }
3627 
3628 static void
3629 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
3630 {
3631 	RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
3632 	free(bdev_name->name);
3633 }
3634 
3635 static void
3636 bdev_name_del(struct spdk_bdev_name *bdev_name)
3637 {
3638 	pthread_mutex_lock(&g_bdev_mgr.mutex);
3639 	bdev_name_del_unsafe(bdev_name);
3640 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
3641 }
3642 
3643 int
3644 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
3645 {
3646 	struct spdk_bdev_alias *tmp;
3647 	int ret;
3648 
3649 	if (alias == NULL) {
3650 		SPDK_ERRLOG("Empty alias passed\n");
3651 		return -EINVAL;
3652 	}
3653 
3654 	tmp = calloc(1, sizeof(*tmp));
3655 	if (tmp == NULL) {
3656 		SPDK_ERRLOG("Unable to allocate alias\n");
3657 		return -ENOMEM;
3658 	}
3659 
3660 	ret = bdev_name_add(&tmp->alias, bdev, alias);
3661 	if (ret != 0) {
3662 		free(tmp);
3663 		return ret;
3664 	}
3665 
3666 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
3667 
3668 	return 0;
3669 }
3670 
3671 static int
3672 bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
3673 	       void (*alias_del_fn)(struct spdk_bdev_name *n))
3674 {
3675 	struct spdk_bdev_alias *tmp;
3676 
3677 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
3678 		if (strcmp(alias, tmp->alias.name) == 0) {
3679 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
3680 			alias_del_fn(&tmp->alias);
3681 			free(tmp);
3682 			return 0;
3683 		}
3684 	}
3685 
3686 	return -ENOENT;
3687 }
3688 
3689 int
3690 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
3691 {
3692 	int rc;
3693 
3694 	rc = bdev_alias_del(bdev, alias, bdev_name_del);
3695 	if (rc == -ENOENT) {
3696 		SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
3697 	}
3698 
3699 	return rc;
3700 }
3701 
3702 void
3703 spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
3704 {
3705 	struct spdk_bdev_alias *p, *tmp;
3706 
3707 	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
3708 		TAILQ_REMOVE(&bdev->aliases, p, tailq);
3709 		bdev_name_del(&p->alias);
3710 		free(p);
3711 	}
3712 }
3713 
3714 struct spdk_io_channel *
3715 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
3716 {
3717 	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
3718 }
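
/*
 * Usage sketch (illustrative, not part of this file): a caller that has already
 * opened a descriptor obtains a per-thread I/O channel as shown below.  The bdev
 * name "Malloc0" and event_cb are placeholders.
 *
 *	struct spdk_bdev_desc *desc;
 *	struct spdk_io_channel *ch;
 *
 *	if (spdk_bdev_open_ext("Malloc0", true, event_cb, NULL, &desc) == 0) {
 *		ch = spdk_bdev_get_io_channel(desc);
 *		// ... submit I/O from this thread using ch ...
 *		spdk_put_io_channel(ch);
 *		spdk_bdev_close(desc);
 *	}
 */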
3719 
3720 void *
3721 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
3722 {
3723 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
3724 	void *ctx = NULL;
3725 
3726 	if (bdev->fn_table->get_module_ctx) {
3727 		ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
3728 	}
3729 
3730 	return ctx;
3731 }
3732 
3733 const char *
3734 spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
3735 {
3736 	return bdev->module->name;
3737 }
3738 
3739 const char *
3740 spdk_bdev_get_name(const struct spdk_bdev *bdev)
3741 {
3742 	return bdev->name;
3743 }
3744 
3745 const char *
3746 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
3747 {
3748 	return bdev->product_name;
3749 }
3750 
3751 const struct spdk_bdev_aliases_list *
3752 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
3753 {
3754 	return &bdev->aliases;
3755 }
3756 
3757 uint32_t
3758 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
3759 {
3760 	return bdev->blocklen;
3761 }
3762 
3763 uint32_t
3764 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
3765 {
3766 	return bdev->write_unit_size;
3767 }
3768 
3769 uint64_t
3770 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
3771 {
3772 	return bdev->blockcnt;
3773 }
3774 
3775 const char *
3776 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
3777 {
3778 	return qos_rpc_type[type];
3779 }
3780 
3781 void
3782 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
3783 {
3784 	int i;
3785 
3786 	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
3787 
3788 	pthread_mutex_lock(&bdev->internal.mutex);
3789 	if (bdev->internal.qos) {
3790 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3791 			if (bdev->internal.qos->rate_limits[i].limit !=
3792 			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3793 				limits[i] = bdev->internal.qos->rate_limits[i].limit;
3794 				if (bdev_qos_is_iops_rate_limit(i) == false) {
3795 					/* Convert from bytes to megabytes, which is the user-visible unit. */
3796 					limits[i] = limits[i] / 1024 / 1024;
3797 				}
3798 			}
3799 		}
3800 	}
3801 	pthread_mutex_unlock(&bdev->internal.mutex);
3802 }
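
/*
 * Usage sketch (illustrative): the caller supplies an array sized for all rate
 * limit types; IOPS limits are reported as-is and bandwidth limits in megabytes
 * per second, matching the qos_rpc_type names.
 *
 *	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *
 *	spdk_bdev_get_qos_rate_limits(bdev, limits);
 *	printf("rw_ios_per_sec: %" PRIu64 "\n",
 *	       limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT]);
 */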
3803 
3804 size_t
3805 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
3806 {
3807 	return 1 << bdev->required_alignment;
3808 }
3809 
3810 uint32_t
3811 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
3812 {
3813 	return bdev->optimal_io_boundary;
3814 }
3815 
3816 bool
3817 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
3818 {
3819 	return bdev->write_cache;
3820 }
3821 
3822 const struct spdk_uuid *
3823 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
3824 {
3825 	return &bdev->uuid;
3826 }
3827 
3828 uint16_t
3829 spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
3830 {
3831 	return bdev->acwu;
3832 }
3833 
3834 uint32_t
3835 spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
3836 {
3837 	return bdev->md_len;
3838 }
3839 
3840 bool
3841 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
3842 {
3843 	return (bdev->md_len != 0) && bdev->md_interleave;
3844 }
3845 
3846 bool
3847 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
3848 {
3849 	return (bdev->md_len != 0) && !bdev->md_interleave;
3850 }
3851 
3852 bool
3853 spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
3854 {
3855 	return bdev->zoned;
3856 }
3857 
3858 uint32_t
3859 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
3860 {
3861 	if (spdk_bdev_is_md_interleaved(bdev)) {
3862 		return bdev->blocklen - bdev->md_len;
3863 	} else {
3864 		return bdev->blocklen;
3865 	}
3866 }
3867 
3868 uint32_t
3869 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
3870 {
3871 	return bdev->phys_blocklen;
3872 }
3873 
3874 static uint32_t
3875 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
3876 {
3877 	if (!spdk_bdev_is_md_interleaved(bdev)) {
3878 		return bdev->blocklen + bdev->md_len;
3879 	} else {
3880 		return bdev->blocklen;
3881 	}
3882 }
3883 
3884 /* We have to use the typedef in the function declaration to appease astyle. */
3885 typedef enum spdk_dif_type spdk_dif_type_t;
3886 
3887 spdk_dif_type_t
3888 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
3889 {
3890 	if (bdev->md_len != 0) {
3891 		return bdev->dif_type;
3892 	} else {
3893 		return SPDK_DIF_DISABLE;
3894 	}
3895 }
3896 
3897 bool
3898 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
3899 {
3900 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
3901 		return bdev->dif_is_head_of_md;
3902 	} else {
3903 		return false;
3904 	}
3905 }
3906 
3907 bool
3908 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
3909 			       enum spdk_dif_check_type check_type)
3910 {
3911 	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
3912 		return false;
3913 	}
3914 
3915 	switch (check_type) {
3916 	case SPDK_DIF_CHECK_TYPE_REFTAG:
3917 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
3918 	case SPDK_DIF_CHECK_TYPE_APPTAG:
3919 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
3920 	case SPDK_DIF_CHECK_TYPE_GUARD:
3921 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
3922 	default:
3923 		return false;
3924 	}
3925 }
3926 
3927 uint64_t
3928 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
3929 {
3930 	return bdev->internal.measured_queue_depth;
3931 }
3932 
3933 uint64_t
3934 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
3935 {
3936 	return bdev->internal.period;
3937 }
3938 
3939 uint64_t
3940 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
3941 {
3942 	return bdev->internal.weighted_io_time;
3943 }
3944 
3945 uint64_t
3946 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
3947 {
3948 	return bdev->internal.io_time;
3949 }
3950 
3951 static void bdev_update_qd_sampling_period(void *ctx);
3952 
3953 static void
3954 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status)
3955 {
3956 	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
3957 
3958 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
3959 
3960 	if (bdev->internal.measured_queue_depth) {
3961 		bdev->internal.io_time += bdev->internal.period;
3962 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
3963 	}
3964 
3965 	bdev->internal.qd_poll_in_progress = false;
3966 
3967 	bdev_update_qd_sampling_period(bdev);
3968 }
3969 
3970 static void
3971 _calculate_measured_qd(struct spdk_io_channel_iter *i)
3972 {
3973 	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
3974 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
3975 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch);
3976 
3977 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
3978 	spdk_for_each_channel_continue(i, 0);
3979 }
3980 
3981 static int
3982 bdev_calculate_measured_queue_depth(void *ctx)
3983 {
3984 	struct spdk_bdev *bdev = ctx;
3985 
3986 	bdev->internal.qd_poll_in_progress = true;
3987 	bdev->internal.temporary_queue_depth = 0;
3988 	spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev,
3989 			      _calculate_measured_qd_cpl);
3990 	return SPDK_POLLER_BUSY;
3991 }
3992 
3993 static void
3994 bdev_update_qd_sampling_period(void *ctx)
3995 {
3996 	struct spdk_bdev *bdev = ctx;
3997 
3998 	if (bdev->internal.period == bdev->internal.new_period) {
3999 		return;
4000 	}
4001 
4002 	if (bdev->internal.qd_poll_in_progress) {
4003 		return;
4004 	}
4005 
4006 	bdev->internal.period = bdev->internal.new_period;
4007 
4008 	spdk_poller_unregister(&bdev->internal.qd_poller);
4009 	if (bdev->internal.period != 0) {
4010 		bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4011 					   bdev, bdev->internal.period);
4012 	} else {
4013 		spdk_bdev_close(bdev->internal.qd_desc);
4014 		bdev->internal.qd_desc = NULL;
4015 	}
4016 }
4017 
4018 static void
4019 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
4020 {
4021 	SPDK_NOTICELOG("Unexpected event type: %d\n", type);
4022 }
4023 
4024 void
4025 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
4026 {
4027 	int rc;
4028 
4029 	if (bdev->internal.new_period == period) {
4030 		return;
4031 	}
4032 
4033 	bdev->internal.new_period = period;
4034 
4035 	if (bdev->internal.qd_desc != NULL) {
4036 		assert(bdev->internal.period != 0);
4037 
4038 		spdk_thread_send_msg(bdev->internal.qd_desc->thread,
4039 				     bdev_update_qd_sampling_period, bdev);
4040 		return;
4041 	}
4042 
4043 	assert(bdev->internal.period == 0);
4044 
4045 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
4046 				NULL, &bdev->internal.qd_desc);
4047 	if (rc != 0) {
4048 		return;
4049 	}
4050 
4051 	bdev->internal.period = period;
4052 	bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4053 				   bdev, period);
4054 }
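
/*
 * Usage sketch (illustrative): enable queue depth sampling with a period given
 * in microseconds, then read the most recent measurement; setting a period of 0
 * disables sampling again.
 *
 *	spdk_bdev_set_qd_sampling_period(bdev, 1000);
 *	// ... later ...
 *	uint64_t qd = spdk_bdev_get_qd(bdev);
 */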
4055 
4056 struct bdev_get_current_qd_ctx {
4057 	uint64_t current_qd;
4058 	spdk_bdev_get_current_qd_cb cb_fn;
4059 	void *cb_arg;
4060 };
4061 
4062 static void
4063 bdev_get_current_qd_done(struct spdk_io_channel_iter *i, int status)
4064 {
4065 	struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
4066 	void *io_dev = spdk_io_channel_iter_get_io_device(i);
4067 
4068 	ctx->cb_fn(__bdev_from_io_dev(io_dev), ctx->current_qd, ctx->cb_arg, 0);
4069 
4070 	free(ctx);
4071 }
4072 
4073 static void
4074 bdev_get_current_qd(struct spdk_io_channel_iter *i)
4075 {
4076 	struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
4077 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
4078 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
4079 
4080 	ctx->current_qd += bdev_ch->io_outstanding;
4081 
4082 	spdk_for_each_channel_continue(i, 0);
4083 }
4084 
4085 void
4086 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
4087 			 void *cb_arg)
4088 {
4089 	struct bdev_get_current_qd_ctx *ctx;
4090 
4091 	assert(cb_fn != NULL);
4092 
4093 	ctx = calloc(1, sizeof(*ctx));
4094 	if (ctx == NULL) {
4095 		cb_fn(bdev, 0, cb_arg, -ENOMEM);
4096 		return;
4097 	}
4098 
4099 	ctx->cb_fn = cb_fn;
4100 	ctx->cb_arg = cb_arg;
4101 
4102 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
4103 			      bdev_get_current_qd,
4104 			      ctx,
4105 			      bdev_get_current_qd_done);
4106 }
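
/*
 * Usage sketch (illustrative): current_qd_cb is a placeholder matching
 * spdk_bdev_get_current_qd_cb.
 *
 *	static void
 *	current_qd_cb(struct spdk_bdev *bdev, uint64_t current_qd, void *cb_arg, int rc)
 *	{
 *		if (rc == 0) {
 *			printf("%s: %" PRIu64 " I/O outstanding\n",
 *			       spdk_bdev_get_name(bdev), current_qd);
 *		}
 *	}
 *
 *	spdk_bdev_get_current_qd(bdev, current_qd_cb, NULL);
 */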
4107 
4108 static void
4109 _resize_notify(void *arg)
4110 {
4111 	struct spdk_bdev_desc *desc = arg;
4112 
4113 	pthread_mutex_lock(&desc->mutex);
4114 	desc->refs--;
4115 	if (!desc->closed) {
4116 		pthread_mutex_unlock(&desc->mutex);
4117 		desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE,
4118 					desc->bdev,
4119 					desc->callback.ctx);
4120 		return;
4121 	} else if (0 == desc->refs) {
4122 		/* This descriptor was closed after this resize_notify message was sent.
4123 		 * spdk_bdev_close() could not free the descriptor since this message was
4124 		 * in flight, so we free it now using bdev_desc_free().
4125 		 */
4126 		pthread_mutex_unlock(&desc->mutex);
4127 		bdev_desc_free(desc);
4128 		return;
4129 	}
4130 	pthread_mutex_unlock(&desc->mutex);
4131 }
4132 
4133 int
4134 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
4135 {
4136 	struct spdk_bdev_desc *desc;
4137 	int ret;
4138 
4139 	if (size == bdev->blockcnt) {
4140 		return 0;
4141 	}
4142 
4143 	pthread_mutex_lock(&bdev->internal.mutex);
4144 
4145 	/* bdev has open descriptors */
4146 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
4147 	    bdev->blockcnt > size) {
4148 		ret = -EBUSY;
4149 	} else {
4150 		bdev->blockcnt = size;
4151 		TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
4152 			pthread_mutex_lock(&desc->mutex);
4153 			if (!desc->closed) {
4154 				desc->refs++;
4155 				spdk_thread_send_msg(desc->thread, _resize_notify, desc);
4156 			}
4157 			pthread_mutex_unlock(&desc->mutex);
4158 		}
4159 		ret = 0;
4160 	}
4161 
4162 	pthread_mutex_unlock(&bdev->internal.mutex);
4163 
4164 	return ret;
4165 }
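
/*
 * Usage sketch (illustrative): a bdev module whose backing store changed size
 * publishes the new block count this way; shrinking fails with -EBUSY while
 * descriptors are open.
 *
 *	rc = spdk_bdev_notify_blockcnt_change(bdev, new_num_blocks);
 *	if (rc == -EBUSY) {
 *		// Shrink rejected because the bdev still has open descriptors.
 *	}
 */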
4166 
4167 /*
4168  * Convert I/O offset and length from bytes to blocks.
4169  *
4170  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
4171  */
4172 static uint64_t
4173 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
4174 		     uint64_t num_bytes, uint64_t *num_blocks)
4175 {
4176 	uint32_t block_size = bdev->blocklen;
4177 	uint8_t shift_cnt;
4178 
4179 	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
4180 	if (spdk_likely(spdk_u32_is_pow2(block_size))) {
4181 		shift_cnt = spdk_u32log2(block_size);
4182 		*offset_blocks = offset_bytes >> shift_cnt;
4183 		*num_blocks = num_bytes >> shift_cnt;
4184 		return (offset_bytes - (*offset_blocks << shift_cnt)) |
4185 		       (num_bytes - (*num_blocks << shift_cnt));
4186 	} else {
4187 		*offset_blocks = offset_bytes / block_size;
4188 		*num_blocks = num_bytes / block_size;
4189 		return (offset_bytes % block_size) | (num_bytes % block_size);
4190 	}
4191 }
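
/*
 * Worked example (illustrative) of the power-of-two fast path above, assuming
 * blocklen == 512 (shift_cnt == 9):
 *
 *	offset_bytes = 4096, num_bytes = 1024
 *	  -> *offset_blocks = 8, *num_blocks = 2, return value 0 (success)
 *
 *	offset_bytes = 4097, num_bytes = 1024
 *	  -> the leftover byte makes the return value non-zero, so callers such as
 *	     spdk_bdev_read() reject the request with -EINVAL.
 */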
4192 
4193 static bool
4194 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
4195 {
4196 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; this indicates
4197 	 * that the addition overflowed and the offset wrapped around. */
4198 	if (offset_blocks + num_blocks < offset_blocks) {
4199 		return false;
4200 	}
4201 
4202 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
4203 	if (offset_blocks + num_blocks > bdev->blockcnt) {
4204 		return false;
4205 	}
4206 
4207 	return true;
4208 }
4209 
4210 static void
4211 bdev_seek_complete_cb(void *ctx)
4212 {
4213 	struct spdk_bdev_io *bdev_io = ctx;
4214 
4215 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
4216 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
4217 }
4218 
4219 static int
4220 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4221 	  uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
4222 	  spdk_bdev_io_completion_cb cb, void *cb_arg)
4223 {
4224 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4225 	struct spdk_bdev_io *bdev_io;
4226 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4227 
4228 	assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
4229 
4230 	/* Check whether offset_blocks is valid by validating a single block at that offset */
4231 	if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
4232 		return -EINVAL;
4233 	}
4234 
4235 	bdev_io = bdev_channel_get_io(channel);
4236 	if (!bdev_io) {
4237 		return -ENOMEM;
4238 	}
4239 
4240 	bdev_io->internal.ch = channel;
4241 	bdev_io->internal.desc = desc;
4242 	bdev_io->type = io_type;
4243 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4244 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4245 
4246 	if (!spdk_bdev_io_type_supported(bdev, io_type)) {
4247 		/* If the bdev doesn't support seeking to the next data/hole offset,
4248 		 * assume that only data and no holes are present. */
4249 		if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
4250 			bdev_io->u.bdev.seek.offset = offset_blocks;
4251 		} else {
4252 			bdev_io->u.bdev.seek.offset = UINT64_MAX;
4253 		}
4254 
4255 		spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
4256 		return 0;
4257 	}
4258 
4259 	bdev_io_submit(bdev_io);
4260 	return 0;
4261 }
4262 
4263 int
4264 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4265 		    uint64_t offset_blocks,
4266 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
4267 {
4268 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
4269 }
4270 
4271 int
4272 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4273 		    uint64_t offset_blocks,
4274 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
4275 {
4276 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
4277 }
4278 
4279 uint64_t
4280 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
4281 {
4282 	return bdev_io->u.bdev.seek.offset;
4283 }
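
/*
 * Usage sketch (illustrative): seek_data_done is a placeholder completion
 * callback.
 *
 *	static void
 *	seek_data_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		if (success) {
 *			uint64_t next_data = spdk_bdev_io_get_seek_offset(bdev_io);
 *			// UINT64_MAX typically indicates that no data region was found.
 *		}
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	spdk_bdev_seek_data(desc, ch, offset_blocks, seek_data_done, NULL);
 */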
4284 
4285 static int
4286 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
4287 			 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4288 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
4289 {
4290 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4291 	struct spdk_bdev_io *bdev_io;
4292 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4293 
4294 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4295 		return -EINVAL;
4296 	}
4297 
4298 	bdev_io = bdev_channel_get_io(channel);
4299 	if (!bdev_io) {
4300 		return -ENOMEM;
4301 	}
4302 
4303 	bdev_io->internal.ch = channel;
4304 	bdev_io->internal.desc = desc;
4305 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
4306 	bdev_io->u.bdev.iovs = &bdev_io->iov;
4307 	bdev_io->u.bdev.iovs[0].iov_base = buf;
4308 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
4309 	bdev_io->u.bdev.iovcnt = 1;
4310 	bdev_io->u.bdev.md_buf = md_buf;
4311 	bdev_io->u.bdev.num_blocks = num_blocks;
4312 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4313 	bdev_io->u.bdev.ext_opts = NULL;
4314 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4315 
4316 	bdev_io_submit(bdev_io);
4317 	return 0;
4318 }
4319 
4320 int
4321 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4322 	       void *buf, uint64_t offset, uint64_t nbytes,
4323 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
4324 {
4325 	uint64_t offset_blocks, num_blocks;
4326 
4327 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4328 				 nbytes, &num_blocks) != 0) {
4329 		return -EINVAL;
4330 	}
4331 
4332 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
4333 }
4334 
4335 int
4336 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4337 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
4338 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
4339 {
4340 	return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
4341 }
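
/*
 * Usage sketch (illustrative): read_done and buf are placeholders; buf must be at
 * least num_blocks * blocklen bytes and satisfy spdk_bdev_get_buf_align().
 *
 *	static void
 *	read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_read_blocks(desc, ch, buf, 0, 8, read_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// No spdk_bdev_io was available; retry via spdk_bdev_queue_io_wait().
 *	}
 */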
4342 
4343 int
4344 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4345 			      void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4346 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
4347 {
4348 	struct iovec iov = {
4349 		.iov_base = buf,
4350 	};
4351 
4352 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4353 		return -EINVAL;
4354 	}
4355 
4356 	if (md_buf && !_is_buf_allocated(&iov)) {
4357 		return -EINVAL;
4358 	}
4359 
4360 	return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
4361 					cb, cb_arg);
4362 }
4363 
4364 int
4365 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4366 		struct iovec *iov, int iovcnt,
4367 		uint64_t offset, uint64_t nbytes,
4368 		spdk_bdev_io_completion_cb cb, void *cb_arg)
4369 {
4370 	uint64_t offset_blocks, num_blocks;
4371 
4372 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4373 				 nbytes, &num_blocks) != 0) {
4374 		return -EINVAL;
4375 	}
4376 
4377 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
4378 }
4379 
4380 static int
4381 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4382 			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
4383 			  uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
4384 			  struct spdk_bdev_ext_io_opts *opts, bool copy_opts)
4385 {
4386 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4387 	struct spdk_bdev_io *bdev_io;
4388 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4389 
4390 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4391 		return -EINVAL;
4392 	}
4393 
4394 	bdev_io = bdev_channel_get_io(channel);
4395 	if (!bdev_io) {
4396 		return -ENOMEM;
4397 	}
4398 
4399 	bdev_io->internal.ch = channel;
4400 	bdev_io->internal.desc = desc;
4401 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
4402 	bdev_io->u.bdev.iovs = iov;
4403 	bdev_io->u.bdev.iovcnt = iovcnt;
4404 	bdev_io->u.bdev.md_buf = md_buf;
4405 	bdev_io->u.bdev.num_blocks = num_blocks;
4406 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4407 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4408 	bdev_io->internal.ext_opts = opts;
4409 	bdev_io->u.bdev.ext_opts = opts;
4410 
4411 	_bdev_io_submit_ext(desc, bdev_io, opts, copy_opts);
4412 
4413 	return 0;
4414 }
4415 
4416 int
4417 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4418 		       struct iovec *iov, int iovcnt,
4419 		       uint64_t offset_blocks, uint64_t num_blocks,
4420 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
4421 {
4422 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
4423 					 num_blocks, cb, cb_arg, NULL, false);
4424 }
4425 
4426 int
4427 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4428 			       struct iovec *iov, int iovcnt, void *md_buf,
4429 			       uint64_t offset_blocks, uint64_t num_blocks,
4430 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
4431 {
4432 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4433 		return -EINVAL;
4434 	}
4435 
4436 	if (md_buf && !_is_buf_allocated(iov)) {
4437 		return -EINVAL;
4438 	}
4439 
4440 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
4441 					 num_blocks, cb, cb_arg, NULL, false);
4442 }
4443 
4444 static inline bool
4445 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
4446 {
4447 	/*
4448 	 * We check that the opts size is at least the size of the structure as it was
4449 	 * when spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since accesses
4450 	 * to those members are not checked internally.
4451 	 */
4452 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
4453 	       sizeof(opts->metadata) &&
4454 	       opts->size <= sizeof(*opts) &&
4455 	       /* When memory domain is used, the user must provide data buffers */
4456 	       (!opts->memory_domain || (iov && iov[0].iov_base));
4457 }
4458 
4459 int
4460 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4461 			   struct iovec *iov, int iovcnt,
4462 			   uint64_t offset_blocks, uint64_t num_blocks,
4463 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
4464 			   struct spdk_bdev_ext_io_opts *opts)
4465 {
4466 	void *md = NULL;
4467 
4468 	if (opts) {
4469 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
4470 			return -EINVAL;
4471 		}
4472 		md = opts->metadata;
4473 	}
4474 
4475 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4476 		return -EINVAL;
4477 	}
4478 
4479 	if (md && !_is_buf_allocated(iov)) {
4480 		return -EINVAL;
4481 	}
4482 
4483 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
4484 					 num_blocks, cb, cb_arg, opts, false);
4485 }
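
/*
 * Usage sketch (illustrative): callers of the *_ext variants should zero the opts
 * structure and set opts.size so that _bdev_io_check_opts() above can tell which
 * fields are valid.  read_done and md_buf are placeholders.
 *
 *	struct spdk_bdev_ext_io_opts opts = {};
 *
 *	opts.size = sizeof(opts);
 *	opts.metadata = md_buf;	// only valid for bdevs with separate metadata
 *
 *	rc = spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt,
 *					offset_blocks, num_blocks,
 *					read_done, NULL, &opts);
 */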
4486 
4487 static int
4488 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4489 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4490 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
4491 {
4492 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4493 	struct spdk_bdev_io *bdev_io;
4494 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4495 
4496 	if (!desc->write) {
4497 		return -EBADF;
4498 	}
4499 
4500 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4501 		return -EINVAL;
4502 	}
4503 
4504 	bdev_io = bdev_channel_get_io(channel);
4505 	if (!bdev_io) {
4506 		return -ENOMEM;
4507 	}
4508 
4509 	bdev_io->internal.ch = channel;
4510 	bdev_io->internal.desc = desc;
4511 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
4512 	bdev_io->u.bdev.iovs = &bdev_io->iov;
4513 	bdev_io->u.bdev.iovs[0].iov_base = buf;
4514 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
4515 	bdev_io->u.bdev.iovcnt = 1;
4516 	bdev_io->u.bdev.md_buf = md_buf;
4517 	bdev_io->u.bdev.num_blocks = num_blocks;
4518 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4519 	bdev_io->u.bdev.ext_opts = NULL;
4520 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4521 
4522 	bdev_io_submit(bdev_io);
4523 	return 0;
4524 }
4525 
4526 int
4527 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4528 		void *buf, uint64_t offset, uint64_t nbytes,
4529 		spdk_bdev_io_completion_cb cb, void *cb_arg)
4530 {
4531 	uint64_t offset_blocks, num_blocks;
4532 
4533 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4534 				 nbytes, &num_blocks) != 0) {
4535 		return -EINVAL;
4536 	}
4537 
4538 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
4539 }
4540 
4541 int
4542 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4543 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
4544 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
4545 {
4546 	return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
4547 					 cb, cb_arg);
4548 }
4549 
4550 int
4551 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4552 			       void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4553 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
4554 {
4555 	struct iovec iov = {
4556 		.iov_base = buf,
4557 	};
4558 
4559 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4560 		return -EINVAL;
4561 	}
4562 
4563 	if (md_buf && !_is_buf_allocated(&iov)) {
4564 		return -EINVAL;
4565 	}
4566 
4567 	return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
4568 					 cb, cb_arg);
4569 }
4570 
4571 static int
4572 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4573 			   struct iovec *iov, int iovcnt, void *md_buf,
4574 			   uint64_t offset_blocks, uint64_t num_blocks,
4575 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
4576 			   struct spdk_bdev_ext_io_opts *opts, bool copy_opts)
4577 {
4578 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4579 	struct spdk_bdev_io *bdev_io;
4580 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4581 
4582 	if (!desc->write) {
4583 		return -EBADF;
4584 	}
4585 
4586 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4587 		return -EINVAL;
4588 	}
4589 
4590 	bdev_io = bdev_channel_get_io(channel);
4591 	if (!bdev_io) {
4592 		return -ENOMEM;
4593 	}
4594 
4595 	bdev_io->internal.ch = channel;
4596 	bdev_io->internal.desc = desc;
4597 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
4598 	bdev_io->u.bdev.iovs = iov;
4599 	bdev_io->u.bdev.iovcnt = iovcnt;
4600 	bdev_io->u.bdev.md_buf = md_buf;
4601 	bdev_io->u.bdev.num_blocks = num_blocks;
4602 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4603 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4604 	bdev_io->internal.ext_opts = opts;
4605 	bdev_io->u.bdev.ext_opts = opts;
4606 
4607 	_bdev_io_submit_ext(desc, bdev_io, opts, copy_opts);
4608 
4609 	return 0;
4610 }
4611 
4612 int
4613 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4614 		 struct iovec *iov, int iovcnt,
4615 		 uint64_t offset, uint64_t len,
4616 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
4617 {
4618 	uint64_t offset_blocks, num_blocks;
4619 
4620 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4621 				 len, &num_blocks) != 0) {
4622 		return -EINVAL;
4623 	}
4624 
4625 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
4626 }
4627 
4628 int
4629 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4630 			struct iovec *iov, int iovcnt,
4631 			uint64_t offset_blocks, uint64_t num_blocks,
4632 			spdk_bdev_io_completion_cb cb, void *cb_arg)
4633 {
4634 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
4635 					  num_blocks, cb, cb_arg, NULL, false);
4636 }
4637 
4638 int
4639 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4640 				struct iovec *iov, int iovcnt, void *md_buf,
4641 				uint64_t offset_blocks, uint64_t num_blocks,
4642 				spdk_bdev_io_completion_cb cb, void *cb_arg)
4643 {
4644 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4645 		return -EINVAL;
4646 	}
4647 
4648 	if (md_buf && !_is_buf_allocated(iov)) {
4649 		return -EINVAL;
4650 	}
4651 
4652 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
4653 					  num_blocks, cb, cb_arg, NULL, false);
4654 }
4655 
4656 int
4657 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4658 			    struct iovec *iov, int iovcnt,
4659 			    uint64_t offset_blocks, uint64_t num_blocks,
4660 			    spdk_bdev_io_completion_cb cb, void *cb_arg,
4661 			    struct spdk_bdev_ext_io_opts *opts)
4662 {
4663 	void *md = NULL;
4664 
4665 	if (opts) {
4666 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
4667 			return -EINVAL;
4668 		}
4669 		md = opts->metadata;
4670 	}
4671 
4672 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4673 		return -EINVAL;
4674 	}
4675 
4676 	if (md && !_is_buf_allocated(iov)) {
4677 		return -EINVAL;
4678 	}
4679 
4680 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
4681 					  num_blocks, cb, cb_arg, opts, false);
4682 }
4683 
4684 static void
4685 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
4686 {
4687 	struct spdk_bdev_io *parent_io = cb_arg;
4688 	struct spdk_bdev *bdev = parent_io->bdev;
4689 	uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
4690 	int i, rc = 0;
4691 
4692 	if (!success) {
4693 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
4694 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
4695 		spdk_bdev_free_io(bdev_io);
4696 		return;
4697 	}
4698 
4699 	for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
4700 		rc = memcmp(read_buf,
4701 			    parent_io->u.bdev.iovs[i].iov_base,
4702 			    parent_io->u.bdev.iovs[i].iov_len);
4703 		if (rc) {
4704 			break;
4705 		}
4706 		read_buf += parent_io->u.bdev.iovs[i].iov_len;
4707 	}
4708 
4709 	if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
4710 		rc = memcmp(bdev_io->u.bdev.md_buf,
4711 			    parent_io->u.bdev.md_buf,
4712 			    spdk_bdev_get_md_size(bdev));
4713 	}
4714 
4715 	spdk_bdev_free_io(bdev_io);
4716 
4717 	if (rc == 0) {
4718 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
4719 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
4720 	} else {
4721 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
4722 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
4723 	}
4724 }
4725 
4726 static void
4727 bdev_compare_do_read(void *_bdev_io)
4728 {
4729 	struct spdk_bdev_io *bdev_io = _bdev_io;
4730 	int rc;
4731 
4732 	rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
4733 				   spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
4734 				   bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
4735 				   bdev_compare_do_read_done, bdev_io);
4736 
4737 	if (rc == -ENOMEM) {
4738 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
4739 	} else if (rc != 0) {
4740 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
4741 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
4742 	}
4743 }
4744 
4745 static int
4746 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4747 			     struct iovec *iov, int iovcnt, void *md_buf,
4748 			     uint64_t offset_blocks, uint64_t num_blocks,
4749 			     spdk_bdev_io_completion_cb cb, void *cb_arg)
4750 {
4751 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4752 	struct spdk_bdev_io *bdev_io;
4753 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4754 
4755 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4756 		return -EINVAL;
4757 	}
4758 
4759 	bdev_io = bdev_channel_get_io(channel);
4760 	if (!bdev_io) {
4761 		return -ENOMEM;
4762 	}
4763 
4764 	bdev_io->internal.ch = channel;
4765 	bdev_io->internal.desc = desc;
4766 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
4767 	bdev_io->u.bdev.iovs = iov;
4768 	bdev_io->u.bdev.iovcnt = iovcnt;
4769 	bdev_io->u.bdev.md_buf = md_buf;
4770 	bdev_io->u.bdev.num_blocks = num_blocks;
4771 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4772 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4773 	bdev_io->u.bdev.ext_opts = NULL;
4774 
4775 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
4776 		bdev_io_submit(bdev_io);
4777 		return 0;
4778 	}
4779 
4780 	bdev_compare_do_read(bdev_io);
4781 
4782 	return 0;
4783 }
4784 
4785 int
4786 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4787 			  struct iovec *iov, int iovcnt,
4788 			  uint64_t offset_blocks, uint64_t num_blocks,
4789 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
4790 {
4791 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
4792 					    num_blocks, cb, cb_arg);
4793 }
4794 
4795 int
4796 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4797 				  struct iovec *iov, int iovcnt, void *md_buf,
4798 				  uint64_t offset_blocks, uint64_t num_blocks,
4799 				  spdk_bdev_io_completion_cb cb, void *cb_arg)
4800 {
4801 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4802 		return -EINVAL;
4803 	}
4804 
4805 	if (md_buf && !_is_buf_allocated(iov)) {
4806 		return -EINVAL;
4807 	}
4808 
4809 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
4810 					    num_blocks, cb, cb_arg);
4811 }
4812 
4813 static int
4814 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4815 			    void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4816 			    spdk_bdev_io_completion_cb cb, void *cb_arg)
4817 {
4818 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4819 	struct spdk_bdev_io *bdev_io;
4820 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4821 
4822 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4823 		return -EINVAL;
4824 	}
4825 
4826 	bdev_io = bdev_channel_get_io(channel);
4827 	if (!bdev_io) {
4828 		return -ENOMEM;
4829 	}
4830 
4831 	bdev_io->internal.ch = channel;
4832 	bdev_io->internal.desc = desc;
4833 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
4834 	bdev_io->u.bdev.iovs = &bdev_io->iov;
4835 	bdev_io->u.bdev.iovs[0].iov_base = buf;
4836 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
4837 	bdev_io->u.bdev.iovcnt = 1;
4838 	bdev_io->u.bdev.md_buf = md_buf;
4839 	bdev_io->u.bdev.num_blocks = num_blocks;
4840 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4841 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4842 	bdev_io->u.bdev.ext_opts = NULL;
4843 
4844 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
4845 		bdev_io_submit(bdev_io);
4846 		return 0;
4847 	}
4848 
4849 	bdev_compare_do_read(bdev_io);
4850 
4851 	return 0;
4852 }
4853 
4854 int
4855 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4856 			 void *buf, uint64_t offset_blocks, uint64_t num_blocks,
4857 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
4858 {
4859 	return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
4860 					   cb, cb_arg);
4861 }
4862 
4863 int
4864 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4865 				 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4866 				 spdk_bdev_io_completion_cb cb, void *cb_arg)
4867 {
4868 	struct iovec iov = {
4869 		.iov_base = buf,
4870 	};
4871 
4872 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4873 		return -EINVAL;
4874 	}
4875 
4876 	if (md_buf && !_is_buf_allocated(&iov)) {
4877 		return -EINVAL;
4878 	}
4879 
4880 	return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
4881 					   cb, cb_arg);
4882 }
4883 
4884 static void
4885 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status)
4886 {
4887 	struct spdk_bdev_io *bdev_io = ctx;
4888 
4889 	if (unlock_status) {
4890 		SPDK_ERRLOG("LBA range unlock failed\n");
4891 	}
4892 
4893 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
4894 			     false, bdev_io->internal.caller_ctx);
4895 }
4896 
4897 static void
4898 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
4899 {
4900 	bdev_io->internal.status = status;
4901 
4902 	bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
4903 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
4904 			      bdev_comparev_and_writev_blocks_unlocked, bdev_io);
4905 }
4906 
4907 static void
4908 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
4909 {
4910 	struct spdk_bdev_io *parent_io = cb_arg;
4911 
4912 	if (!success) {
4913 		SPDK_ERRLOG("Compare and write operation failed\n");
4914 	}
4915 
4916 	spdk_bdev_free_io(bdev_io);
4917 
4918 	bdev_comparev_and_writev_blocks_unlock(parent_io,
4919 					       success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
4920 }
4921 
4922 static void
4923 bdev_compare_and_write_do_write(void *_bdev_io)
4924 {
4925 	struct spdk_bdev_io *bdev_io = _bdev_io;
4926 	int rc;
4927 
4928 	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
4929 				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
4930 				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
4931 				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
4932 				     bdev_compare_and_write_do_write_done, bdev_io);
4933 
4934 
4935 	if (rc == -ENOMEM) {
4936 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
4937 	} else if (rc != 0) {
4938 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
4939 	}
4940 }
4941 
4942 static void
4943 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
4944 {
4945 	struct spdk_bdev_io *parent_io = cb_arg;
4946 
4947 	spdk_bdev_free_io(bdev_io);
4948 
4949 	if (!success) {
4950 		bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
4951 		return;
4952 	}
4953 
4954 	bdev_compare_and_write_do_write(parent_io);
4955 }
4956 
4957 static void
4958 bdev_compare_and_write_do_compare(void *_bdev_io)
4959 {
4960 	struct spdk_bdev_io *bdev_io = _bdev_io;
4961 	int rc;
4962 
4963 	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
4964 				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
4965 				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
4966 				       bdev_compare_and_write_do_compare_done, bdev_io);
4967 
4968 	if (rc == -ENOMEM) {
4969 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
4970 	} else if (rc != 0) {
4971 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
4972 	}
4973 }
4974 
4975 static void
4976 bdev_comparev_and_writev_blocks_locked(void *ctx, int status)
4977 {
4978 	struct spdk_bdev_io *bdev_io = ctx;
4979 
4980 	if (status) {
4981 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
4982 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
4983 		return;
4984 	}
4985 
4986 	bdev_compare_and_write_do_compare(bdev_io);
4987 }
4988 
4989 int
4990 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4991 				     struct iovec *compare_iov, int compare_iovcnt,
4992 				     struct iovec *write_iov, int write_iovcnt,
4993 				     uint64_t offset_blocks, uint64_t num_blocks,
4994 				     spdk_bdev_io_completion_cb cb, void *cb_arg)
4995 {
4996 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4997 	struct spdk_bdev_io *bdev_io;
4998 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
4999 
5000 	if (!desc->write) {
5001 		return -EBADF;
5002 	}
5003 
5004 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5005 		return -EINVAL;
5006 	}
5007 
5008 	if (num_blocks > bdev->acwu) {
5009 		return -EINVAL;
5010 	}
5011 
5012 	bdev_io = bdev_channel_get_io(channel);
5013 	if (!bdev_io) {
5014 		return -ENOMEM;
5015 	}
5016 
5017 	bdev_io->internal.ch = channel;
5018 	bdev_io->internal.desc = desc;
5019 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
5020 	bdev_io->u.bdev.iovs = compare_iov;
5021 	bdev_io->u.bdev.iovcnt = compare_iovcnt;
5022 	bdev_io->u.bdev.fused_iovs = write_iov;
5023 	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
5024 	bdev_io->u.bdev.md_buf = NULL;
5025 	bdev_io->u.bdev.num_blocks = num_blocks;
5026 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5027 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5028 	bdev_io->u.bdev.ext_opts = NULL;
5029 
5030 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
5031 		bdev_io_submit(bdev_io);
5032 		return 0;
5033 	}
5034 
5035 	return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
5036 				   bdev_comparev_and_writev_blocks_locked, bdev_io);
5037 }
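
/*
 * Usage sketch (illustrative): compare_iov holds the expected data and write_iov
 * the new data; num_blocks must not exceed spdk_bdev_get_acwu(bdev).  A
 * miscompare is reported as a failed completion with status
 * SPDK_BDEV_IO_STATUS_MISCOMPARE.  caw_done is a placeholder callback.
 *
 *	rc = spdk_bdev_comparev_and_writev_blocks(desc, ch,
 *						  &compare_iov, 1, &write_iov, 1,
 *						  offset_blocks, 1, caw_done, NULL);
 */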
5038 
5039 int
5040 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5041 		      struct iovec *iov, int iovcnt,
5042 		      uint64_t offset_blocks, uint64_t num_blocks,
5043 		      bool populate,
5044 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5045 {
5046 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5047 	struct spdk_bdev_io *bdev_io;
5048 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5049 
5050 	if (!desc->write) {
5051 		return -EBADF;
5052 	}
5053 
5054 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5055 		return -EINVAL;
5056 	}
5057 
5058 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
5059 		return -ENOTSUP;
5060 	}
5061 
5062 	bdev_io = bdev_channel_get_io(channel);
5063 	if (!bdev_io) {
5064 		return -ENOMEM;
5065 	}
5066 
5067 	bdev_io->internal.ch = channel;
5068 	bdev_io->internal.desc = desc;
5069 	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
5070 	bdev_io->u.bdev.num_blocks = num_blocks;
5071 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5072 	bdev_io->u.bdev.iovs = iov;
5073 	bdev_io->u.bdev.iovcnt = iovcnt;
5074 	bdev_io->u.bdev.md_buf = NULL;
5075 	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
5076 	bdev_io->u.bdev.zcopy.commit = 0;
5077 	bdev_io->u.bdev.zcopy.start = 1;
5078 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5079 	bdev_io->u.bdev.ext_opts = NULL;
5080 
5081 	bdev_io_submit(bdev_io);
5082 
5083 	return 0;
5084 }
5085 
5086 int
5087 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
5088 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5089 {
5090 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
5091 		return -EINVAL;
5092 	}
5093 
5094 	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
5095 	bdev_io->u.bdev.zcopy.start = 0;
5096 	bdev_io->internal.caller_ctx = cb_arg;
5097 	bdev_io->internal.cb = cb;
5098 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
5099 
5100 	bdev_io_submit(bdev_io);
5101 
5102 	return 0;
5103 }
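
/*
 * Usage sketch (illustrative, assuming the bdev reports support for
 * SPDK_BDEV_IO_TYPE_ZCOPY): start a zcopy with populate=true to let the bdev
 * expose its own buffers, then commit the modifications with zcopy_end.
 * zcopy_done is a placeholder completion callback; passing iov=NULL/iovcnt=0 at
 * start assumes the bdev module supplies the buffers.
 *
 *	spdk_bdev_zcopy_start(desc, ch, NULL, 0, offset_blocks, num_blocks,
 *			      true, zcopy_done, NULL);
 *
 *	// Later, from zcopy_done(): the buffers are available in
 *	// bdev_io->u.bdev.iovs / iovcnt.  When finished modifying them:
 *	spdk_bdev_zcopy_end(bdev_io, true, zcopy_done, NULL);
 */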
5104 
5105 int
5106 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5107 		       uint64_t offset, uint64_t len,
5108 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5109 {
5110 	uint64_t offset_blocks, num_blocks;
5111 
5112 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5113 				 len, &num_blocks) != 0) {
5114 		return -EINVAL;
5115 	}
5116 
5117 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5118 }
5119 
5120 int
5121 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5122 			      uint64_t offset_blocks, uint64_t num_blocks,
5123 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5124 {
5125 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5126 	struct spdk_bdev_io *bdev_io;
5127 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5128 
5129 	if (!desc->write) {
5130 		return -EBADF;
5131 	}
5132 
5133 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5134 		return -EINVAL;
5135 	}
5136 
5137 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
5138 	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
5139 		return -ENOTSUP;
5140 	}
5141 
5142 	bdev_io = bdev_channel_get_io(channel);
5143 
5144 	if (!bdev_io) {
5145 		return -ENOMEM;
5146 	}
5147 
5148 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
5149 	bdev_io->internal.ch = channel;
5150 	bdev_io->internal.desc = desc;
5151 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5152 	bdev_io->u.bdev.num_blocks = num_blocks;
5153 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5154 	bdev_io->u.bdev.ext_opts = NULL;
5155 
5156 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
5157 		bdev_io_submit(bdev_io);
5158 		return 0;
5159 	}
5160 
5161 	assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE));
5162 	assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
5163 	bdev_io->u.bdev.split_remaining_num_blocks = num_blocks;
5164 	bdev_io->u.bdev.split_current_offset_blocks = offset_blocks;
5165 	bdev_write_zero_buffer_next(bdev_io);
5166 
5167 	return 0;
5168 }
5169 
5170 int
5171 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5172 		uint64_t offset, uint64_t nbytes,
5173 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5174 {
5175 	uint64_t offset_blocks, num_blocks;
5176 
5177 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5178 				 nbytes, &num_blocks) != 0) {
5179 		return -EINVAL;
5180 	}
5181 
5182 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5183 }
5184 
5185 int
5186 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5187 		       uint64_t offset_blocks, uint64_t num_blocks,
5188 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5189 {
5190 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5191 	struct spdk_bdev_io *bdev_io;
5192 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5193 
5194 	if (!desc->write) {
5195 		return -EBADF;
5196 	}
5197 
5198 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5199 		return -EINVAL;
5200 	}
5201 
5202 	if (num_blocks == 0) {
5203 		SPDK_ERRLOG("Can't unmap 0 blocks\n");
5204 		return -EINVAL;
5205 	}
5206 
5207 	bdev_io = bdev_channel_get_io(channel);
5208 	if (!bdev_io) {
5209 		return -ENOMEM;
5210 	}
5211 
5212 	bdev_io->internal.ch = channel;
5213 	bdev_io->internal.desc = desc;
5214 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
5215 
5216 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5217 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
5218 	bdev_io->u.bdev.iovs[0].iov_len = 0;
5219 	bdev_io->u.bdev.iovcnt = 1;
5220 
5221 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5222 	bdev_io->u.bdev.num_blocks = num_blocks;
5223 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5224 	bdev_io->u.bdev.ext_opts = NULL;
5225 
5226 	bdev_io_submit(bdev_io);
5227 	return 0;
5228 }
5229 
5230 int
5231 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5232 		uint64_t offset, uint64_t length,
5233 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5234 {
5235 	uint64_t offset_blocks, num_blocks;
5236 
5237 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5238 				 length, &num_blocks) != 0) {
5239 		return -EINVAL;
5240 	}
5241 
5242 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5243 }
5244 
5245 int
5246 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5247 		       uint64_t offset_blocks, uint64_t num_blocks,
5248 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5249 {
5250 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5251 	struct spdk_bdev_io *bdev_io;
5252 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5253 
5254 	if (!desc->write) {
5255 		return -EBADF;
5256 	}
5257 
5258 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5259 		return -EINVAL;
5260 	}
5261 
5262 	bdev_io = bdev_channel_get_io(channel);
5263 	if (!bdev_io) {
5264 		return -ENOMEM;
5265 	}
5266 
5267 	bdev_io->internal.ch = channel;
5268 	bdev_io->internal.desc = desc;
5269 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
5270 	bdev_io->u.bdev.iovs = NULL;
5271 	bdev_io->u.bdev.iovcnt = 0;
5272 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5273 	bdev_io->u.bdev.num_blocks = num_blocks;
5274 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5275 
5276 	bdev_io_submit(bdev_io);
5277 	return 0;
5278 }
5279 
5280 static int bdev_reset_poll_for_outstanding_io(void *ctx);
5281 
5282 static void
5283 bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status)
5284 {
5285 	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
5286 	struct spdk_bdev_io *bdev_io;
5287 
5288 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
5289 
5290 	if (status == -EBUSY) {
5291 		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
5292 			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
5293 							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
5294 		} else {
5295 			/* Outstanding I/O is still present and reset_io_drain_timeout seconds
5296 			 * have passed, so start the reset now. */
5297 			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
5298 			bdev_io_submit_reset(bdev_io);
5299 		}
5300 	} else {
5301 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
5302 		SPDK_DEBUGLOG(bdev,
5303 			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
5304 			      ch->bdev->name);
5305 		/* Mark the completion status as a SUCCESS and complete the reset. */
5306 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
5307 	}
5308 }
5309 
5310 static void
5311 bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i)
5312 {
5313 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
5314 	struct spdk_bdev_channel *cur_ch = spdk_io_channel_get_ctx(io_ch);
5315 	int status = 0;
5316 
5317 	if (cur_ch->io_outstanding > 0) {
5318 		/* If a channel has outstanding I/O, set the status to -EBUSY. This stops
5319 		 * further iteration over the remaining channels and passes the non-zero
5320 		 * status to the callback function. */
5321 		status = -EBUSY;
5322 	}
5323 	spdk_for_each_channel_continue(i, status);
5324 }
5325 
5326 static int
5327 bdev_reset_poll_for_outstanding_io(void *ctx)
5328 {
5329 	struct spdk_bdev_channel *ch = ctx;
5330 	struct spdk_bdev_io *bdev_io;
5331 
5332 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
5333 
5334 	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
5335 	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
5336 			      ch, bdev_reset_check_outstanding_io_done);
5337 
5338 	return SPDK_POLLER_BUSY;
5339 }
5340 
5341 static void
5342 bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status)
5343 {
5344 	struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
5345 	struct spdk_bdev *bdev = ch->bdev;
5346 	struct spdk_bdev_io *bdev_io;
5347 
5348 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
5349 
5350 	if (bdev->reset_io_drain_timeout == 0) {
5351 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
5352 
5353 		bdev_io_submit_reset(bdev_io);
5354 		return;
5355 	}
5356 
5357 	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
5358 			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
5359 
5360 	/* Since bdev->reset_io_drain_timeout is non-zero, submit the reset to the
5361 	 * underlying module only if outstanding I/O remains after
5362 	 * reset_io_drain_timeout seconds have passed. */
5363 	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io,
5364 			      ch, bdev_reset_check_outstanding_io_done);
5365 }
5366 
5367 static void
5368 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
5369 {
5370 	struct spdk_io_channel		*ch;
5371 	struct spdk_bdev_channel	*channel;
5372 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
5373 	struct spdk_bdev_shared_resource *shared_resource;
5374 	bdev_io_tailq_t			tmp_queued;
5375 
5376 	TAILQ_INIT(&tmp_queued);
5377 
5378 	ch = spdk_io_channel_iter_get_channel(i);
5379 	channel = spdk_io_channel_get_ctx(ch);
5380 	shared_resource = channel->shared_resource;
5381 	mgmt_channel = shared_resource->mgmt_ch;
5382 
5383 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
5384 
5385 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
5386 		/* The QoS object is always valid and readable while
5387 		 * the channel flag is set, so the lock here should not
5388 		 * be necessary. We're not in the fast path though, so
5389 		 * just take it anyway. */
5390 		pthread_mutex_lock(&channel->bdev->internal.mutex);
5391 		if (channel->bdev->internal.qos->ch == channel) {
5392 			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
5393 		}
5394 		pthread_mutex_unlock(&channel->bdev->internal.mutex);
5395 	}
5396 
5397 	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
5398 	bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel);
5399 	bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel);
5400 	bdev_abort_all_queued_io(&tmp_queued, channel);
5401 
5402 	spdk_for_each_channel_continue(i, 0);
5403 }
5404 
5405 static void
5406 bdev_start_reset(void *ctx)
5407 {
5408 	struct spdk_bdev_channel *ch = ctx;
5409 
5410 	spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel,
5411 			      ch, bdev_reset_freeze_channel_done);
5412 }
5413 
5414 static void
5415 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
5416 {
5417 	struct spdk_bdev *bdev = ch->bdev;
5418 
5419 	assert(!TAILQ_EMPTY(&ch->queued_resets));
5420 
5421 	pthread_mutex_lock(&bdev->internal.mutex);
5422 	if (bdev->internal.reset_in_progress == NULL) {
5423 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
5424 		/*
5425 		 * Take a channel reference for the target bdev for the life of this
5426 		 *  reset.  This guards against the channel getting destroyed while
5427 		 *  spdk_for_each_channel() calls related to this reset IO are in
5428 		 *  progress.  We will release the reference when this reset is
5429 		 *  completed.
5430 		 */
5431 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
5432 		bdev_start_reset(ch);
5433 	}
5434 	pthread_mutex_unlock(&bdev->internal.mutex);
5435 }
5436 
5437 int
5438 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5439 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5440 {
5441 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5442 	struct spdk_bdev_io *bdev_io;
5443 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5444 
5445 	bdev_io = bdev_channel_get_io(channel);
5446 	if (!bdev_io) {
5447 		return -ENOMEM;
5448 	}
5449 
5450 	bdev_io->internal.ch = channel;
5451 	bdev_io->internal.desc = desc;
5452 	bdev_io->internal.submit_tsc = spdk_get_ticks();
5453 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
5454 	bdev_io->u.reset.ch_ref = NULL;
5455 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5456 
5457 	pthread_mutex_lock(&bdev->internal.mutex);
5458 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
5459 	pthread_mutex_unlock(&bdev->internal.mutex);
5460 
5461 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
5462 			  internal.ch_link);
5463 
5464 	bdev_channel_start_reset(channel);
5465 
5466 	return 0;
5467 }
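
/*
 * Usage sketch for spdk_bdev_reset() (illustrative only; my_reset_done and ctx
 * are hypothetical).  The reset must be submitted from an SPDK thread that owns
 * an I/O channel for the bdev; while it is in progress, I/O queued inside the
 * bdev layer on every channel is aborted and, if the bdev sets
 * reset_io_drain_timeout, the reset is only passed down to the module after
 * outstanding I/O has had a chance to drain:
 *
 *   static void
 *   my_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           SPDK_NOTICELOG("reset %s\n", success ? "succeeded" : "failed");
 *           spdk_bdev_free_io(bdev_io);
 *   }
 *
 *   rc = spdk_bdev_reset(desc, io_ch, my_reset_done, ctx);
 */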
5468 
5469 void
5470 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
5471 		      struct spdk_bdev_io_stat *stat)
5472 {
5473 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5474 
5475 	*stat = channel->stat;
5476 }
5477 
5478 static void
5479 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status)
5480 {
5481 	void *io_device = spdk_io_channel_iter_get_io_device(i);
5482 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
5483 
5484 	bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat,
5485 			    bdev_iostat_ctx->cb_arg, 0);
5486 	free(bdev_iostat_ctx);
5487 }
5488 
5489 static void
5490 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i)
5491 {
5492 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
5493 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
5494 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5495 
5496 	bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat);
5497 	spdk_for_each_channel_continue(i, 0);
5498 }
5499 
5500 void
5501 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
5502 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
5503 {
5504 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
5505 
5506 	assert(bdev != NULL);
5507 	assert(stat != NULL);
5508 	assert(cb != NULL);
5509 
5510 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
5511 	if (bdev_iostat_ctx == NULL) {
5512 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
5513 		cb(bdev, stat, cb_arg, -ENOMEM);
5514 		return;
5515 	}
5516 
5517 	bdev_iostat_ctx->stat = stat;
5518 	bdev_iostat_ctx->cb = cb;
5519 	bdev_iostat_ctx->cb_arg = cb_arg;
5520 
5521 	/* Start with the statistics from previously deleted channels. */
5522 	pthread_mutex_lock(&bdev->internal.mutex);
5523 	bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat);
5524 	pthread_mutex_unlock(&bdev->internal.mutex);
5525 
5526 	/* Then iterate and add the statistics from each existing channel. */
5527 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
5528 			      bdev_get_each_channel_stat,
5529 			      bdev_iostat_ctx,
5530 			      bdev_get_device_stat_done);
5531 }
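
/*
 * Usage sketch for spdk_bdev_get_device_stat() (illustrative only; my_stat_done
 * is a hypothetical callback).  The caller owns the stat structure and typically
 * frees it in the callback, which runs after the per-channel statistics have
 * been aggregated on top of those from previously deleted channels:
 *
 *   static void
 *   my_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
 *                void *cb_arg, int rc)
 *   {
 *           if (rc == 0) {
 *                   SPDK_NOTICELOG("%s: %" PRIu64 " bytes read\n",
 *                                  spdk_bdev_get_name(bdev), stat->bytes_read);
 *           }
 *           free(stat);
 *   }
 *
 *   stat = calloc(1, sizeof(*stat));
 *   spdk_bdev_get_device_stat(bdev, stat, my_stat_done, NULL);
 */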
5532 
5533 int
5534 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5535 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
5536 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5537 {
5538 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5539 	struct spdk_bdev_io *bdev_io;
5540 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5541 
5542 	if (!desc->write) {
5543 		return -EBADF;
5544 	}
5545 
5546 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
5547 		return -ENOTSUP;
5548 	}
5549 
5550 	bdev_io = bdev_channel_get_io(channel);
5551 	if (!bdev_io) {
5552 		return -ENOMEM;
5553 	}
5554 
5555 	bdev_io->internal.ch = channel;
5556 	bdev_io->internal.desc = desc;
5557 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
5558 	bdev_io->u.nvme_passthru.cmd = *cmd;
5559 	bdev_io->u.nvme_passthru.buf = buf;
5560 	bdev_io->u.nvme_passthru.nbytes = nbytes;
5561 	bdev_io->u.nvme_passthru.md_buf = NULL;
5562 	bdev_io->u.nvme_passthru.md_len = 0;
5563 
5564 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5565 
5566 	bdev_io_submit(bdev_io);
5567 	return 0;
5568 }
5569 
5570 int
5571 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5572 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
5573 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
5574 {
5575 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5576 	struct spdk_bdev_io *bdev_io;
5577 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5578 
5579 	if (!desc->write) {
5580 		/*
5581 		 * Do not try to parse the NVMe command - we could perhaps use bits in the
5582 		 *  opcode to determine whether the command is a read or a write, but for now
5583 		 *  simply do not allow io_passthru with a read-only descriptor.
5584 		 */
5585 		return -EBADF;
5586 	}
5587 
5588 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
5589 		return -ENOTSUP;
5590 	}
5591 
5592 	bdev_io = bdev_channel_get_io(channel);
5593 	if (!bdev_io) {
5594 		return -ENOMEM;
5595 	}
5596 
5597 	bdev_io->internal.ch = channel;
5598 	bdev_io->internal.desc = desc;
5599 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
5600 	bdev_io->u.nvme_passthru.cmd = *cmd;
5601 	bdev_io->u.nvme_passthru.buf = buf;
5602 	bdev_io->u.nvme_passthru.nbytes = nbytes;
5603 	bdev_io->u.nvme_passthru.md_buf = NULL;
5604 	bdev_io->u.nvme_passthru.md_len = 0;
5605 
5606 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5607 
5608 	bdev_io_submit(bdev_io);
5609 	return 0;
5610 }
5611 
5612 int
5613 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5614 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
5615 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5616 {
5617 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5618 	struct spdk_bdev_io *bdev_io;
5619 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5620 
5621 	if (!desc->write) {
5622 		/*
5623 		 * Do not try to parse the NVMe command - we could perhaps use bits in the
5624 		 *  opcode to determine whether the command is a read or a write, but for now
5625 		 *  simply do not allow io_passthru with a read-only descriptor.
5626 		 */
5627 		return -EBADF;
5628 	}
5629 
5630 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
5631 		return -ENOTSUP;
5632 	}
5633 
5634 	bdev_io = bdev_channel_get_io(channel);
5635 	if (!bdev_io) {
5636 		return -ENOMEM;
5637 	}
5638 
5639 	bdev_io->internal.ch = channel;
5640 	bdev_io->internal.desc = desc;
5641 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
5642 	bdev_io->u.nvme_passthru.cmd = *cmd;
5643 	bdev_io->u.nvme_passthru.buf = buf;
5644 	bdev_io->u.nvme_passthru.nbytes = nbytes;
5645 	bdev_io->u.nvme_passthru.md_buf = md_buf;
5646 	bdev_io->u.nvme_passthru.md_len = md_len;
5647 
5648 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5649 
5650 	bdev_io_submit(bdev_io);
5651 	return 0;
5652 }
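
/*
 * Usage sketch for the NVMe passthru interface (illustrative only; the command
 * built below assumes the Identify opcode/CNS constants from spdk/nvme_spec.h,
 * a 4 KiB caller-provided buffer id_buf, and a hypothetical my_admin_done
 * callback).  Passthru requires a descriptor opened for writing and a bdev that
 * reports the corresponding I/O type as supported:
 *
 *   struct spdk_nvme_cmd cmd = {};
 *
 *   cmd.opc = SPDK_NVME_OPC_IDENTIFY;
 *   cmd.cdw10 = SPDK_NVME_IDENTIFY_CTRLR;
 *
 *   rc = spdk_bdev_nvme_admin_passthru(desc, io_ch, &cmd, id_buf, 4096,
 *                                      my_admin_done, ctx);
 *   A return of -ENOTSUP means the bdev does not report
 *   SPDK_BDEV_IO_TYPE_NVME_ADMIN as supported.
 */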
5653 
5654 static void bdev_abort_retry(void *ctx);
5655 static void bdev_abort(struct spdk_bdev_io *parent_io);
5656 
5657 static void
5658 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5659 {
5660 	struct spdk_bdev_channel *channel = bdev_io->internal.ch;
5661 	struct spdk_bdev_io *parent_io = cb_arg;
5662 	struct spdk_bdev_io *bio_to_abort, *tmp_io;
5663 
5664 	bio_to_abort = bdev_io->u.abort.bio_to_abort;
5665 
5666 	spdk_bdev_free_io(bdev_io);
5667 
5668 	if (!success) {
5669 		/* Check if the target I/O completed in the meantime. */
5670 		TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
5671 			if (tmp_io == bio_to_abort) {
5672 				break;
5673 			}
5674 		}
5675 
5676 		/* If the target I/O still exists, set the parent to failed. */
5677 		if (tmp_io != NULL) {
5678 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5679 		}
5680 	}
5681 
5682 	parent_io->u.bdev.split_outstanding--;
5683 	if (parent_io->u.bdev.split_outstanding == 0) {
5684 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
5685 			bdev_abort_retry(parent_io);
5686 		} else {
5687 			bdev_io_complete(parent_io);
5688 		}
5689 	}
5690 }
5691 
5692 static int
5693 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
5694 	      struct spdk_bdev_io *bio_to_abort,
5695 	      spdk_bdev_io_completion_cb cb, void *cb_arg)
5696 {
5697 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5698 	struct spdk_bdev_io *bdev_io;
5699 
5700 	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
5701 	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
5702 		/* TODO: Abort reset or abort request. */
5703 		return -ENOTSUP;
5704 	}
5705 
5706 	bdev_io = bdev_channel_get_io(channel);
5707 	if (bdev_io == NULL) {
5708 		return -ENOMEM;
5709 	}
5710 
5711 	bdev_io->internal.ch = channel;
5712 	bdev_io->internal.desc = desc;
5713 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
5714 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5715 
5716 	if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) {
5717 		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
5718 
5719 		/* The parent abort request is not submitted directly, but to manage its
5720 		 * execution, add it to the submitted list here.
5721 		 */
5722 		bdev_io->internal.submit_tsc = spdk_get_ticks();
5723 		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
5724 
5725 		bdev_abort(bdev_io);
5726 
5727 		return 0;
5728 	}
5729 
5730 	bdev_io->u.abort.bio_to_abort = bio_to_abort;
5731 
5732 	/* Submit the abort request to the underlying bdev module. */
5733 	bdev_io_submit(bdev_io);
5734 
5735 	return 0;
5736 }
5737 
5738 static uint32_t
5739 _bdev_abort(struct spdk_bdev_io *parent_io)
5740 {
5741 	struct spdk_bdev_desc *desc = parent_io->internal.desc;
5742 	struct spdk_bdev_channel *channel = parent_io->internal.ch;
5743 	void *bio_cb_arg;
5744 	struct spdk_bdev_io *bio_to_abort;
5745 	uint32_t matched_ios;
5746 	int rc;
5747 
5748 	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
5749 
5750 	/* matched_ios is returned and will be kept by the caller.
5751 	 *
5752 	 * This function is used in two cases: 1) the same cb_arg is used for
5753 	 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
5754 	 * Incrementing split_outstanding directly here may confuse readers especially
5755 	 * for the 1st case.
5756 	 *
5757 	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
5758 	 * works as expected.
5759 	 */
5760 	matched_ios = 0;
5761 	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5762 
5763 	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
5764 		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
5765 			continue;
5766 		}
5767 
5768 		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
5769 			/* Any I/O which was submitted after this abort command should be excluded. */
5770 			continue;
5771 		}
5772 
5773 		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
5774 		if (rc != 0) {
5775 			if (rc == -ENOMEM) {
5776 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
5777 			} else {
5778 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5779 			}
5780 			break;
5781 		}
5782 		matched_ios++;
5783 	}
5784 
5785 	return matched_ios;
5786 }
5787 
5788 static void
5789 bdev_abort_retry(void *ctx)
5790 {
5791 	struct spdk_bdev_io *parent_io = ctx;
5792 	uint32_t matched_ios;
5793 
5794 	matched_ios = _bdev_abort(parent_io);
5795 
5796 	if (matched_ios == 0) {
5797 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
5798 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
5799 		} else {
5800 			/* For a retry, finding no target I/O is a success because it means
5801 			 * the target I/Os completed in the meantime.
5802 			 */
5803 			bdev_io_complete(parent_io);
5804 		}
5805 		return;
5806 	}
5807 
5808 	/* Use split_outstanding to manage the progress of aborting I/Os. */
5809 	parent_io->u.bdev.split_outstanding = matched_ios;
5810 }
5811 
5812 static void
5813 bdev_abort(struct spdk_bdev_io *parent_io)
5814 {
5815 	uint32_t matched_ios;
5816 
5817 	matched_ios = _bdev_abort(parent_io);
5818 
5819 	if (matched_ios == 0) {
5820 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
5821 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
5822 		} else {
5823 			/* The case where no target I/O was found is a failure. */
5824 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5825 			bdev_io_complete(parent_io);
5826 		}
5827 		return;
5828 	}
5829 
5830 	/* Use split_outstanding to manage the progress of aborting I/Os. */
5831 	parent_io->u.bdev.split_outstanding = matched_ios;
5832 }
5833 
5834 int
5835 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5836 		void *bio_cb_arg,
5837 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5838 {
5839 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5840 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5841 	struct spdk_bdev_io *bdev_io;
5842 
5843 	if (bio_cb_arg == NULL) {
5844 		return -EINVAL;
5845 	}
5846 
5847 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
5848 		return -ENOTSUP;
5849 	}
5850 
5851 	bdev_io = bdev_channel_get_io(channel);
5852 	if (bdev_io == NULL) {
5853 		return -ENOMEM;
5854 	}
5855 
5856 	bdev_io->internal.ch = channel;
5857 	bdev_io->internal.desc = desc;
5858 	bdev_io->internal.submit_tsc = spdk_get_ticks();
5859 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
5860 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5861 
5862 	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
5863 
5864 	/* Parent abort request is not submitted directly, but to manage its execution,
5865 	 * add it to the submitted list here.
5866 	 */
5867 	TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
5868 
5869 	bdev_abort(bdev_io);
5870 
5871 	return 0;
5872 }
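
/*
 * Usage sketch for spdk_bdev_abort() (illustrative only; my_io_done,
 * my_abort_done and ctx are hypothetical).  All I/O submitted on this channel
 * with the same cb_arg before the abort is targeted; aborting an I/O that
 * happens to complete in the meantime is not treated as an error:
 *
 *   spdk_bdev_read_blocks(desc, io_ch, buf, 0, 8, my_io_done, ctx);
 *   spdk_bdev_read_blocks(desc, io_ch, buf2, 8, 8, my_io_done, ctx);
 *
 *   rc = spdk_bdev_abort(desc, io_ch, ctx, my_abort_done, NULL);
 */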
5873 
5874 int
5875 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
5876 			struct spdk_bdev_io_wait_entry *entry)
5877 {
5878 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
5879 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
5880 
5881 	if (bdev != entry->bdev) {
5882 		SPDK_ERRLOG("bdevs do not match\n");
5883 		return -EINVAL;
5884 	}
5885 
5886 	if (mgmt_ch->per_thread_cache_count > 0) {
5887 		SPDK_ERRLOG("Cannot queue io_wait if a spdk_bdev_io is available in the per-thread cache\n");
5888 		return -EINVAL;
5889 	}
5890 
5891 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
5892 	return 0;
5893 }
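
/*
 * Usage sketch for the -ENOMEM retry path (illustrative only; my_retry_fn and
 * the embedding of the wait entry in a caller-owned ctx are assumptions).  When
 * a submission returns -ENOMEM, the caller registers a spdk_bdev_io_wait_entry
 * and is called back once a spdk_bdev_io becomes available on this channel:
 *
 *   ctx->bdev_io_wait.bdev = bdev;
 *   ctx->bdev_io_wait.cb_fn = my_retry_fn;
 *   ctx->bdev_io_wait.cb_arg = ctx;
 *
 *   rc = spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->bdev_io_wait);
 */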
5894 
5895 static inline void
5896 bdev_io_complete(void *ctx)
5897 {
5898 	struct spdk_bdev_io *bdev_io = ctx;
5899 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
5900 	uint64_t tsc, tsc_diff;
5901 
5902 	if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) {
5903 		/*
5904 		 * Send the completion to the thread that originally submitted the I/O,
5905 		 * which may not be the current thread in the case of QoS.
5906 		 */
5907 		if (bdev_io->internal.io_submit_ch) {
5908 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
5909 			bdev_io->internal.io_submit_ch = NULL;
5910 		}
5911 
5912 		/*
5913 		 * Defer completion to avoid potential infinite recursion if the
5914 		 * user's completion callback issues a new I/O.
5915 		 */
5916 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
5917 				     bdev_io_complete, bdev_io);
5918 		return;
5919 	}
5920 
5921 	tsc = spdk_get_ticks();
5922 	tsc_diff = tsc - bdev_io->internal.submit_tsc;
5923 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io,
5924 			      bdev_io->internal.caller_ctx);
5925 
5926 	TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link);
5927 
5928 	if (bdev_io->internal.ch->histogram) {
5929 		spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff);
5930 	}
5931 
5932 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
5933 		switch (bdev_io->type) {
5934 		case SPDK_BDEV_IO_TYPE_READ:
5935 			bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
5936 			bdev_io->internal.ch->stat.num_read_ops++;
5937 			bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff;
5938 			break;
5939 		case SPDK_BDEV_IO_TYPE_WRITE:
5940 			bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
5941 			bdev_io->internal.ch->stat.num_write_ops++;
5942 			bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff;
5943 			break;
5944 		case SPDK_BDEV_IO_TYPE_UNMAP:
5945 			bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
5946 			bdev_io->internal.ch->stat.num_unmap_ops++;
5947 			bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff;
5948 			break;
5949 		case SPDK_BDEV_IO_TYPE_ZCOPY:
5950 			/* Track the data in the start phase only */
5951 			if (bdev_io->u.bdev.zcopy.start) {
5952 				if (bdev_io->u.bdev.zcopy.populate) {
5953 					bdev_io->internal.ch->stat.bytes_read +=
5954 						bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
5955 					bdev_io->internal.ch->stat.num_read_ops++;
5956 					bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff;
5957 				} else {
5958 					bdev_io->internal.ch->stat.bytes_written +=
5959 						bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
5960 					bdev_io->internal.ch->stat.num_write_ops++;
5961 					bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff;
5962 				}
5963 			}
5964 			break;
5965 		default:
5966 			break;
5967 		}
5968 	}
5969 
5970 #ifdef SPDK_CONFIG_VTUNE
5971 	uint64_t now_tsc = spdk_get_ticks();
5972 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
5973 		uint64_t data[5];
5974 
5975 		data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops;
5976 		data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read;
5977 		data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops;
5978 		data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written;
5979 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
5980 			  bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
5981 
5982 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
5983 				   __itt_metadata_u64, 5, data);
5984 
5985 		bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat;
5986 		bdev_io->internal.ch->start_tsc = now_tsc;
5987 	}
5988 #endif
5989 
5990 	assert(bdev_io->internal.cb != NULL);
5991 	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
5992 
5993 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
5994 			     bdev_io->internal.caller_ctx);
5995 }
5996 
5997 static void bdev_destroy_cb(void *io_device);
5998 
5999 static void
6000 bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
6001 {
6002 	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
6003 	struct spdk_bdev *bdev = bdev_io->bdev;
6004 
6005 	if (bdev_io->u.reset.ch_ref != NULL) {
6006 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
6007 		bdev_io->u.reset.ch_ref = NULL;
6008 	}
6009 
6010 	bdev_io_complete(bdev_io);
6011 
6012 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
6013 	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
6014 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
6015 	}
6016 }
6017 
6018 static void
6019 bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
6020 {
6021 	struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
6022 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
6023 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
6024 	struct spdk_bdev_io *queued_reset;
6025 
6026 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
6027 	while (!TAILQ_EMPTY(&ch->queued_resets)) {
6028 		queued_reset = TAILQ_FIRST(&ch->queued_resets);
6029 		TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
6030 		spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
6031 	}
6032 
6033 	spdk_for_each_channel_continue(i, 0);
6034 }
6035 
6036 void
6037 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
6038 {
6039 	struct spdk_bdev *bdev = bdev_io->bdev;
6040 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
6041 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
6042 
6043 	bdev_io->internal.status = status;
6044 
6045 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
6046 		bool unlock_channels = false;
6047 
6048 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
6049 			SPDK_ERRLOG("NOMEM returned for reset\n");
6050 		}
6051 		pthread_mutex_lock(&bdev->internal.mutex);
6052 		if (bdev_io == bdev->internal.reset_in_progress) {
6053 			bdev->internal.reset_in_progress = NULL;
6054 			unlock_channels = true;
6055 		}
6056 		pthread_mutex_unlock(&bdev->internal.mutex);
6057 
6058 		if (unlock_channels) {
6059 			spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel,
6060 					      bdev_io, bdev_reset_complete);
6061 			return;
6062 		}
6063 	} else {
6064 		if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) {
6065 			_bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done);
6066 			/* bdev IO will be completed in the callback */
6067 			return;
6068 		}
6069 
6070 		_bdev_io_decrement_outstanding(bdev_ch, shared_resource);
6071 		if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) {
6072 			return;
6073 		}
6074 	}
6075 
6076 	bdev_io_complete(bdev_io);
6077 }
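
/*
 * Usage sketch from a bdev module's perspective (illustrative only; the backend
 * completion hook shown is hypothetical).  A module completes every spdk_bdev_io
 * it received in submit_request() exactly once, on the thread that owns the I/O
 * channel the request was submitted on:
 *
 *   static void
 *   my_backend_done(void *arg, int backend_status)
 *   {
 *           struct spdk_bdev_io *bdev_io = arg;
 *
 *           spdk_bdev_io_complete(bdev_io, backend_status == 0 ?
 *                                 SPDK_BDEV_IO_STATUS_SUCCESS :
 *                                 SPDK_BDEV_IO_STATUS_FAILED);
 *   }
 */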
6078 
6079 void
6080 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
6081 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
6082 {
6083 	if (sc == SPDK_SCSI_STATUS_GOOD) {
6084 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6085 	} else {
6086 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
6087 		bdev_io->internal.error.scsi.sc = sc;
6088 		bdev_io->internal.error.scsi.sk = sk;
6089 		bdev_io->internal.error.scsi.asc = asc;
6090 		bdev_io->internal.error.scsi.ascq = ascq;
6091 	}
6092 
6093 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
6094 }
6095 
6096 void
6097 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
6098 			     int *sc, int *sk, int *asc, int *ascq)
6099 {
6100 	assert(sc != NULL);
6101 	assert(sk != NULL);
6102 	assert(asc != NULL);
6103 	assert(ascq != NULL);
6104 
6105 	switch (bdev_io->internal.status) {
6106 	case SPDK_BDEV_IO_STATUS_SUCCESS:
6107 		*sc = SPDK_SCSI_STATUS_GOOD;
6108 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
6109 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
6110 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
6111 		break;
6112 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
6113 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
6114 		break;
6115 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
6116 		*sc = bdev_io->internal.error.scsi.sc;
6117 		*sk = bdev_io->internal.error.scsi.sk;
6118 		*asc = bdev_io->internal.error.scsi.asc;
6119 		*ascq = bdev_io->internal.error.scsi.ascq;
6120 		break;
6121 	default:
6122 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
6123 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
6124 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
6125 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
6126 		break;
6127 	}
6128 }
6129 
6130 void
6131 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
6132 {
6133 	if (aio_result == 0) {
6134 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6135 	} else {
6136 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
6137 	}
6138 
6139 	bdev_io->internal.error.aio_result = aio_result;
6140 
6141 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
6142 }
6143 
6144 void
6145 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
6146 {
6147 	assert(aio_result != NULL);
6148 
6149 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
6150 		*aio_result = bdev_io->internal.error.aio_result;
6151 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6152 		*aio_result = 0;
6153 	} else {
6154 		*aio_result = -EIO;
6155 	}
6156 }
6157 
6158 void
6159 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
6160 {
6161 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
6162 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6163 	} else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
6164 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
6165 	} else {
6166 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
6167 	}
6168 
6169 	bdev_io->internal.error.nvme.cdw0 = cdw0;
6170 	bdev_io->internal.error.nvme.sct = sct;
6171 	bdev_io->internal.error.nvme.sc = sc;
6172 
6173 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
6174 }
6175 
6176 void
6177 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
6178 {
6179 	assert(sct != NULL);
6180 	assert(sc != NULL);
6181 	assert(cdw0 != NULL);
6182 
6183 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
6184 		*sct = SPDK_NVME_SCT_GENERIC;
6185 		*sc = SPDK_NVME_SC_SUCCESS;
6186 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6187 			*cdw0 = 0;
6188 		} else {
6189 			*cdw0 = 1U;
6190 		}
6191 		return;
6192 	}
6193 
6194 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
6195 		*sct = bdev_io->internal.error.nvme.sct;
6196 		*sc = bdev_io->internal.error.nvme.sc;
6197 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6198 		*sct = SPDK_NVME_SCT_GENERIC;
6199 		*sc = SPDK_NVME_SC_SUCCESS;
6200 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
6201 		*sct = SPDK_NVME_SCT_GENERIC;
6202 		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
6203 	} else {
6204 		*sct = SPDK_NVME_SCT_GENERIC;
6205 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6206 	}
6207 
6208 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
6209 }
6210 
6211 void
6212 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
6213 				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
6214 {
6215 	assert(first_sct != NULL);
6216 	assert(first_sc != NULL);
6217 	assert(second_sct != NULL);
6218 	assert(second_sc != NULL);
6219 	assert(cdw0 != NULL);
6220 
6221 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
6222 		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
6223 		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
6224 			*first_sct = bdev_io->internal.error.nvme.sct;
6225 			*first_sc = bdev_io->internal.error.nvme.sc;
6226 			*second_sct = SPDK_NVME_SCT_GENERIC;
6227 			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
6228 		} else {
6229 			*first_sct = SPDK_NVME_SCT_GENERIC;
6230 			*first_sc = SPDK_NVME_SC_SUCCESS;
6231 			*second_sct = bdev_io->internal.error.nvme.sct;
6232 			*second_sc = bdev_io->internal.error.nvme.sc;
6233 		}
6234 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
6235 		*first_sct = SPDK_NVME_SCT_GENERIC;
6236 		*first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
6237 		*second_sct = SPDK_NVME_SCT_GENERIC;
6238 		*second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
6239 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6240 		*first_sct = SPDK_NVME_SCT_GENERIC;
6241 		*first_sc = SPDK_NVME_SC_SUCCESS;
6242 		*second_sct = SPDK_NVME_SCT_GENERIC;
6243 		*second_sc = SPDK_NVME_SC_SUCCESS;
6244 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
6245 		*first_sct = SPDK_NVME_SCT_GENERIC;
6246 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6247 		*second_sct = SPDK_NVME_SCT_GENERIC;
6248 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
6249 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
6250 		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
6251 		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
6252 		*second_sct = SPDK_NVME_SCT_GENERIC;
6253 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
6254 	} else {
6255 		*first_sct = SPDK_NVME_SCT_GENERIC;
6256 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6257 		*second_sct = SPDK_NVME_SCT_GENERIC;
6258 		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6259 	}
6260 
6261 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
6262 }
6263 
6264 struct spdk_thread *
6265 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
6266 {
6267 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
6268 }
6269 
6270 struct spdk_io_channel *
6271 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
6272 {
6273 	return bdev_io->internal.ch->channel;
6274 }
6275 
6276 static int
6277 bdev_register(struct spdk_bdev *bdev)
6278 {
6279 	char *bdev_name;
6280 	char uuid[SPDK_UUID_STRING_LEN];
6281 	int ret;
6282 
6283 	assert(bdev->module != NULL);
6284 
6285 	if (!bdev->name) {
6286 		SPDK_ERRLOG("Bdev name is NULL\n");
6287 		return -EINVAL;
6288 	}
6289 
6290 	if (!strlen(bdev->name)) {
6291 		SPDK_ERRLOG("Bdev name must not be an empty string\n");
6292 		return -EINVAL;
6293 	}
6294 
6295 	/* Users often register their own I/O devices using the bdev name. In
6296 	 * order to avoid conflicts, prepend bdev_. */
6297 	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
6298 	if (!bdev_name) {
6299 		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
6300 		return -ENOMEM;
6301 	}
6302 
6303 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
6304 	bdev->internal.measured_queue_depth = UINT64_MAX;
6305 	bdev->internal.claim_module = NULL;
6306 	bdev->internal.qd_poller = NULL;
6307 	bdev->internal.qos = NULL;
6308 
6309 	TAILQ_INIT(&bdev->internal.open_descs);
6310 	TAILQ_INIT(&bdev->internal.locked_ranges);
6311 	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
6312 	TAILQ_INIT(&bdev->aliases);
6313 
6314 	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
6315 	if (ret != 0) {
6316 		free(bdev_name);
6317 		return ret;
6318 	}
6319 
6320 	/* If the user didn't specify a uuid, generate one. */
6321 	if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) {
6322 		spdk_uuid_generate(&bdev->uuid);
6323 	}
6324 
6325 	/* Add the UUID alias only if it's different from the name */
6326 	spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
6327 	if (strcmp(bdev->name, uuid) != 0) {
6328 		ret = spdk_bdev_alias_add(bdev, uuid);
6329 		if (ret != 0) {
6330 			SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
6331 			bdev_name_del(&bdev->internal.bdev_name);
6332 			free(bdev_name);
6333 			return ret;
6334 		}
6335 	}
6336 
6337 	if (spdk_bdev_get_buf_align(bdev) > 1) {
6338 		if (bdev->split_on_optimal_io_boundary) {
6339 			bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary,
6340 							     SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen);
6341 		} else {
6342 			bdev->split_on_optimal_io_boundary = true;
6343 			bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen;
6344 		}
6345 	}
6346 
6347 	/* If the user didn't specify a write unit size, set it to one. */
6348 	if (bdev->write_unit_size == 0) {
6349 		bdev->write_unit_size = 1;
6350 	}
6351 
6352 	/* Set the ACWU value to 1 if the bdev module did not set it (i.e. does not support it natively) */
6353 	if (bdev->acwu == 0) {
6354 		bdev->acwu = 1;
6355 	}
6356 
6357 	if (bdev->phys_blocklen == 0) {
6358 		bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
6359 	}
6360 
6361 	bdev->internal.reset_in_progress = NULL;
6362 	bdev->internal.qd_poll_in_progress = false;
6363 	bdev->internal.period = 0;
6364 	bdev->internal.new_period = 0;
6365 
6366 	spdk_io_device_register(__bdev_to_io_dev(bdev),
6367 				bdev_channel_create, bdev_channel_destroy,
6368 				sizeof(struct spdk_bdev_channel),
6369 				bdev_name);
6370 
6371 	free(bdev_name);
6372 
6373 	pthread_mutex_init(&bdev->internal.mutex, NULL);
6374 
6375 	SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
6376 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
6377 
6378 	return 0;
6379 }
6380 
6381 static void
6382 bdev_destroy_cb(void *io_device)
6383 {
6384 	int			rc;
6385 	struct spdk_bdev	*bdev;
6386 	spdk_bdev_unregister_cb	cb_fn;
6387 	void			*cb_arg;
6388 
6389 	bdev = __bdev_from_io_dev(io_device);
6390 	cb_fn = bdev->internal.unregister_cb;
6391 	cb_arg = bdev->internal.unregister_ctx;
6392 
6393 	pthread_mutex_destroy(&bdev->internal.mutex);
6394 	free(bdev->internal.qos);
6395 
6396 	rc = bdev->fn_table->destruct(bdev->ctxt);
6397 	if (rc < 0) {
6398 		SPDK_ERRLOG("destruct failed\n");
6399 	}
6400 	if (rc <= 0 && cb_fn != NULL) {
6401 		cb_fn(cb_arg, rc);
6402 	}
6403 }
6404 
6405 void
6406 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
6407 {
6408 	if (bdev->internal.unregister_cb != NULL) {
6409 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
6410 	}
6411 }
6412 
6413 static void
6414 _remove_notify(void *arg)
6415 {
6416 	struct spdk_bdev_desc *desc = arg;
6417 
6418 	pthread_mutex_lock(&desc->mutex);
6419 	desc->refs--;
6420 
6421 	if (!desc->closed) {
6422 		pthread_mutex_unlock(&desc->mutex);
6423 		desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx);
6424 		return;
6425 	} else if (0 == desc->refs) {
6426 		/* This descriptor was closed after this remove_notify message was sent.
6427 		 * spdk_bdev_close() could not free the descriptor since this message was
6428 		 * in flight, so we free it now using bdev_desc_free().
6429 		 */
6430 		pthread_mutex_unlock(&desc->mutex);
6431 		bdev_desc_free(desc);
6432 		return;
6433 	}
6434 	pthread_mutex_unlock(&desc->mutex);
6435 }
6436 
6437 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex.
6438  * returns: 0 - bdev removed and ready to be destructed.
6439  *          -EBUSY - bdev can't be destructed yet.  */
6440 static int
6441 bdev_unregister_unsafe(struct spdk_bdev *bdev)
6442 {
6443 	struct spdk_bdev_desc	*desc, *tmp;
6444 	int			rc = 0;
6445 	char			uuid[SPDK_UUID_STRING_LEN];
6446 
6447 	/* Notify each descriptor about hotremoval */
6448 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
6449 		rc = -EBUSY;
6450 		pthread_mutex_lock(&desc->mutex);
6451 		/*
6452 		 * Defer invocation of the event_cb to a separate message that will
6453 		 *  run later on its thread.  This ensures this context unwinds and
6454 		 *  we don't recursively unregister this bdev again if the event_cb
6455 		 *  immediately closes its descriptor.
6456 		 */
6457 		desc->refs++;
6458 		spdk_thread_send_msg(desc->thread, _remove_notify, desc);
6459 		pthread_mutex_unlock(&desc->mutex);
6460 	}
6461 
6462 	/* If there are no descriptors, proceed removing the bdev */
6463 	if (rc == 0) {
6464 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
6465 		SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
6466 
6467 		/* Delete the name and the UUID alias */
6468 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
6469 		bdev_name_del_unsafe(&bdev->internal.bdev_name);
6470 		bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
6471 
6472 		spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
6473 
6474 		if (bdev->internal.reset_in_progress != NULL) {
6475 			/* If reset is in progress, let the completion callback for reset
6476 			 * unregister the bdev.
6477 			 */
6478 			rc = -EBUSY;
6479 		}
6480 	}
6481 
6482 	return rc;
6483 }
6484 
6485 static void
6486 bdev_unregister_abort_channel(struct spdk_io_channel_iter *i)
6487 {
6488 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
6489 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
6490 
6491 	bdev_channel_abort_queued_ios(bdev_ch);
6492 	spdk_for_each_channel_continue(i, 0);
6493 }
6494 
6495 static void
6496 bdev_unregister(struct spdk_io_channel_iter *i, int status)
6497 {
6498 	struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
6499 	int rc;
6500 
6501 	pthread_mutex_lock(&g_bdev_mgr.mutex);
6502 	pthread_mutex_lock(&bdev->internal.mutex);
6503 	/*
6504 	 * Set the status to REMOVING only after aborting the channels has completed.
6505 	 * Otherwise, the last spdk_bdev_close() may call spdk_io_device_unregister()
6506 	 * while spdk_for_each_channel() is still executing, and spdk_io_device_unregister() may fail.
6507 	 */
6508 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
6509 	rc = bdev_unregister_unsafe(bdev);
6510 	pthread_mutex_unlock(&bdev->internal.mutex);
6511 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
6512 
6513 	if (rc == 0) {
6514 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
6515 	}
6516 }
6517 
6518 void
6519 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
6520 {
6521 	struct spdk_thread	*thread;
6522 
6523 	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
6524 
6525 	thread = spdk_get_thread();
6526 	if (!thread) {
6527 		/* The user called this from a non-SPDK thread. */
6528 		if (cb_fn != NULL) {
6529 			cb_fn(cb_arg, -ENOTSUP);
6530 		}
6531 		return;
6532 	}
6533 
6534 	pthread_mutex_lock(&g_bdev_mgr.mutex);
6535 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
6536 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
6537 		pthread_mutex_unlock(&g_bdev_mgr.mutex);
6538 		if (cb_fn) {
6539 			cb_fn(cb_arg, -EBUSY);
6540 		}
6541 		return;
6542 	}
6543 
6544 	pthread_mutex_lock(&bdev->internal.mutex);
6545 	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
6546 	bdev->internal.unregister_cb = cb_fn;
6547 	bdev->internal.unregister_ctx = cb_arg;
6548 	pthread_mutex_unlock(&bdev->internal.mutex);
6549 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
6550 
6551 	spdk_bdev_set_qd_sampling_period(bdev, 0);
6552 
6553 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
6554 			      bdev_unregister_abort_channel,
6555 			      bdev,
6556 			      bdev_unregister);
6557 }
6558 
6559 int
6560 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
6561 			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
6562 {
6563 	struct spdk_bdev_desc *desc;
6564 	struct spdk_bdev *bdev;
6565 	int rc;
6566 
6567 	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
6568 	if (rc != 0) {
6569 		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
6570 		return rc;
6571 	}
6572 
6573 	bdev = spdk_bdev_desc_get_bdev(desc);
6574 
6575 	if (bdev->module != module) {
6576 		spdk_bdev_close(desc);
6577 		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
6578 			    bdev_name);
6579 		return -ENODEV;
6580 	}
6581 
6582 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
6583 
6584 	spdk_bdev_close(desc);
6585 
6586 	return 0;
6587 }
6588 
6589 static int
6590 bdev_start_qos(struct spdk_bdev *bdev)
6591 {
6592 	struct set_qos_limit_ctx *ctx;
6593 
6594 	/* Enable QoS */
6595 	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
6596 		ctx = calloc(1, sizeof(*ctx));
6597 		if (ctx == NULL) {
6598 			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
6599 			return -ENOMEM;
6600 		}
6601 		ctx->bdev = bdev;
6602 		spdk_for_each_channel(__bdev_to_io_dev(bdev),
6603 				      bdev_enable_qos_msg, ctx,
6604 				      bdev_enable_qos_done);
6605 	}
6606 
6607 	return 0;
6608 }
6609 
6610 static int
6611 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
6612 {
6613 	struct spdk_thread *thread;
6614 	int rc = 0;
6615 
6616 	thread = spdk_get_thread();
6617 	if (!thread) {
6618 		SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
6619 		return -ENOTSUP;
6620 	}
6621 
6622 	SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
6623 		      spdk_get_thread());
6624 
6625 	desc->bdev = bdev;
6626 	desc->thread = thread;
6627 	desc->write = write;
6628 
6629 	pthread_mutex_lock(&bdev->internal.mutex);
6630 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
6631 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
6632 		pthread_mutex_unlock(&bdev->internal.mutex);
6633 		return -ENODEV;
6634 	}
6635 
6636 	if (write && bdev->internal.claim_module) {
6637 		SPDK_ERRLOG("Could not open %s - %s module already claimed it\n",
6638 			    bdev->name, bdev->internal.claim_module->name);
6639 		pthread_mutex_unlock(&bdev->internal.mutex);
6640 		return -EPERM;
6641 	}
6642 
6643 	rc = bdev_start_qos(bdev);
6644 	if (rc != 0) {
6645 		SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
6646 		pthread_mutex_unlock(&bdev->internal.mutex);
6647 		return rc;
6648 	}
6649 
6650 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
6651 
6652 	pthread_mutex_unlock(&bdev->internal.mutex);
6653 
6654 	return 0;
6655 }
6656 
6657 static int
6658 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
6659 		struct spdk_bdev_desc **_desc)
6660 {
6661 	struct spdk_bdev_desc *desc;
6662 	unsigned int event_id;
6663 
6664 	desc = calloc(1, sizeof(*desc));
6665 	if (desc == NULL) {
6666 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
6667 		return -ENOMEM;
6668 	}
6669 
6670 	TAILQ_INIT(&desc->pending_media_events);
6671 	TAILQ_INIT(&desc->free_media_events);
6672 
6673 	desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
6674 	desc->callback.event_fn = event_cb;
6675 	desc->callback.ctx = event_ctx;
6676 	pthread_mutex_init(&desc->mutex, NULL);
6677 
6678 	if (bdev->media_events) {
6679 		desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
6680 						   sizeof(*desc->media_events_buffer));
6681 		if (desc->media_events_buffer == NULL) {
6682 			SPDK_ERRLOG("Failed to initialize media event pool\n");
6683 			bdev_desc_free(desc);
6684 			return -ENOMEM;
6685 		}
6686 
6687 		for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) {
6688 			TAILQ_INSERT_TAIL(&desc->free_media_events,
6689 					  &desc->media_events_buffer[event_id], tailq);
6690 		}
6691 	}
6692 
6693 	*_desc = desc;
6694 
6695 	return 0;
6696 }
6697 
6698 int
6699 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
6700 		   void *event_ctx, struct spdk_bdev_desc **_desc)
6701 {
6702 	struct spdk_bdev_desc *desc;
6703 	struct spdk_bdev *bdev;
6704 	int rc;
6705 
6706 	if (event_cb == NULL) {
6707 		SPDK_ERRLOG("Missing event callback function\n");
6708 		return -EINVAL;
6709 	}
6710 
6711 	pthread_mutex_lock(&g_bdev_mgr.mutex);
6712 
6713 	bdev = bdev_get_by_name(bdev_name);
6714 
6715 	if (bdev == NULL) {
6716 		SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
6717 		pthread_mutex_unlock(&g_bdev_mgr.mutex);
6718 		return -ENODEV;
6719 	}
6720 
6721 	rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc);
6722 	if (rc != 0) {
6723 		pthread_mutex_unlock(&g_bdev_mgr.mutex);
6724 		return rc;
6725 	}
6726 
6727 	rc = bdev_open(bdev, write, desc);
6728 	if (rc != 0) {
6729 		bdev_desc_free(desc);
6730 		desc = NULL;
6731 	}
6732 
6733 	*_desc = desc;
6734 
6735 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
6736 
6737 	return rc;
6738 }
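
/*
 * Usage sketch for spdk_bdev_open_ext() (illustrative only; my_event_cb, the
 * struct my_app context and the bdev name "Malloc0" are assumptions).  The
 * event callback is mandatory and should handle SPDK_BDEV_EVENT_REMOVE by
 * eventually closing the descriptor, otherwise hot-remove of the bdev cannot
 * complete:
 *
 *   static void
 *   my_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *   {
 *           struct my_app *app = ctx;
 *
 *           if (type == SPDK_BDEV_EVENT_REMOVE) {
 *                   spdk_bdev_close(app->desc);
 *           }
 *   }
 *
 *   rc = spdk_bdev_open_ext("Malloc0", true, my_event_cb, app, &app->desc);
 *   if (rc == 0) {
 *           app->io_ch = spdk_bdev_get_io_channel(app->desc);
 *   }
 */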
6739 
6740 static void
6741 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
6742 {
6743 	int rc;
6744 
6745 	pthread_mutex_lock(&bdev->internal.mutex);
6746 	pthread_mutex_lock(&desc->mutex);
6747 
6748 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
6749 
6750 	desc->closed = true;
6751 
6752 	if (0 == desc->refs) {
6753 		pthread_mutex_unlock(&desc->mutex);
6754 		bdev_desc_free(desc);
6755 	} else {
6756 		pthread_mutex_unlock(&desc->mutex);
6757 	}
6758 
6759 	/* If no more descriptors, kill QoS channel */
6760 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
6761 		SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
6762 			      bdev->name, spdk_get_thread());
6763 
6764 		if (bdev_qos_destroy(bdev)) {
6765 			/* There isn't anything we can do to recover here. Just let the
6766 			 * old QoS poller keep running. The QoS handling won't change
6767 			 * cores when the user allocates a new channel, but it won't break. */
6768 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
6769 		}
6770 	}
6771 
6772 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
6773 		rc = bdev_unregister_unsafe(bdev);
6774 		pthread_mutex_unlock(&bdev->internal.mutex);
6775 
6776 		if (rc == 0) {
6777 			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
6778 		}
6779 	} else {
6780 		pthread_mutex_unlock(&bdev->internal.mutex);
6781 	}
6782 }
6783 
6784 void
6785 spdk_bdev_close(struct spdk_bdev_desc *desc)
6786 {
6787 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6788 
6789 	SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
6790 		      spdk_get_thread());
6791 
6792 	assert(desc->thread == spdk_get_thread());
6793 
6794 	spdk_poller_unregister(&desc->io_timeout_poller);
6795 
6796 	pthread_mutex_lock(&g_bdev_mgr.mutex);
6797 
6798 	bdev_close(bdev, desc);
6799 
6800 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
6801 }
6802 
6803 static void
6804 bdev_register_finished(void *arg)
6805 {
6806 	struct spdk_bdev_desc *desc = arg;
6807 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6808 
6809 	spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
6810 
6811 	bdev_close(bdev, desc);
6812 }
6813 
6814 int
6815 spdk_bdev_register(struct spdk_bdev *bdev)
6816 {
6817 	struct spdk_bdev_desc *desc;
6818 	int rc;
6819 
6820 	rc = bdev_register(bdev);
6821 	if (rc != 0) {
6822 		return rc;
6823 	}
6824 
6825 	/* A descriptor is opened to prevent bdev deletion during examination */
6826 	rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
6827 	if (rc != 0) {
6828 		spdk_bdev_unregister(bdev, NULL, NULL);
6829 		return rc;
6830 	}
6831 
6832 	rc = bdev_open(bdev, false, desc);
6833 	if (rc != 0) {
6834 		bdev_desc_free(desc);
6835 		spdk_bdev_unregister(bdev, NULL, NULL);
6836 		return rc;
6837 	}
6838 
6839 	/* Examine configuration before initializing I/O */
6840 	bdev_examine(bdev);
6841 
6842 	rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
6843 	if (rc != 0) {
6844 		bdev_close(bdev, desc);
6845 		spdk_bdev_unregister(bdev, NULL, NULL);
6846 	}
6847 
6848 	return rc;
6849 }
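
/*
 * Registration sketch from a bdev module's perspective (illustrative only; the
 * my_disk, my_fn_table and my_module names are hypothetical and only the fields
 * shown here are filled in).  A module fills in the spdk_bdev structure and then
 * hands it to spdk_bdev_register(), which also opens a temporary descriptor to
 * keep the bdev alive while examine callbacks run:
 *
 *   my_disk->bdev.name = strdup("my_disk0");
 *   my_disk->bdev.product_name = "My Disk";
 *   my_disk->bdev.blocklen = 512;
 *   my_disk->bdev.blockcnt = num_blocks;
 *   my_disk->bdev.ctxt = my_disk;
 *   my_disk->bdev.fn_table = &my_fn_table;
 *   my_disk->bdev.module = &my_module;
 *
 *   rc = spdk_bdev_register(&my_disk->bdev);
 */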
6850 
6851 int
6852 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
6853 			    struct spdk_bdev_module *module)
6854 {
6855 	if (bdev->internal.claim_module != NULL) {
6856 		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
6857 			    bdev->internal.claim_module->name);
6858 		return -EPERM;
6859 	}
6860 
6861 	if (desc && !desc->write) {
6862 		desc->write = true;
6863 	}
6864 
6865 	bdev->internal.claim_module = module;
6866 	return 0;
6867 }
6868 
6869 void
6870 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
6871 {
6872 	assert(bdev->internal.claim_module != NULL);
6873 	bdev->internal.claim_module = NULL;
6874 }
6875 
6876 struct spdk_bdev *
6877 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
6878 {
6879 	assert(desc != NULL);
6880 	return desc->bdev;
6881 }
6882 
6883 int
6884 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
6885 {
6886 	struct spdk_bdev *bdev, *tmp;
6887 	struct spdk_bdev_desc *desc;
6888 	int rc = 0;
6889 
6890 	assert(fn != NULL);
6891 
6892 	pthread_mutex_lock(&g_bdev_mgr.mutex);
6893 	bdev = spdk_bdev_first();
6894 	while (bdev != NULL) {
6895 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
6896 		if (rc != 0) {
6897 			break;
6898 		}
6899 		rc = bdev_open(bdev, false, desc);
6900 		if (rc != 0) {
6901 			bdev_desc_free(desc);
6902 			break;
6903 		}
6904 		pthread_mutex_unlock(&g_bdev_mgr.mutex);
6905 
6906 		rc = fn(ctx, bdev);
6907 
6908 		pthread_mutex_lock(&g_bdev_mgr.mutex);
6909 		tmp = spdk_bdev_next(bdev);
6910 		bdev_close(bdev, desc);
6911 		if (rc != 0) {
6912 			break;
6913 		}
6914 		bdev = tmp;
6915 	}
6916 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
6917 
6918 	return rc;
6919 }
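
/*
 * Usage sketch for spdk_for_each_bdev() (illustrative only; my_bdev_fn is
 * hypothetical).  Each bdev is opened read-only around the callback, so the
 * callback may safely inspect it; returning non-zero stops the iteration and is
 * propagated back to the caller:
 *
 *   static int
 *   my_bdev_fn(void *ctx, struct spdk_bdev *bdev)
 *   {
 *           SPDK_NOTICELOG("found bdev %s\n", spdk_bdev_get_name(bdev));
 *           return 0;
 *   }
 *
 *   rc = spdk_for_each_bdev(NULL, my_bdev_fn);
 */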
6920 
6921 int
6922 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
6923 {
6924 	struct spdk_bdev *bdev, *tmp;
6925 	struct spdk_bdev_desc *desc;
6926 	int rc = 0;
6927 
6928 	assert(fn != NULL);
6929 
6930 	pthread_mutex_lock(&g_bdev_mgr.mutex);
6931 	bdev = spdk_bdev_first_leaf();
6932 	while (bdev != NULL) {
6933 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
6934 		if (rc != 0) {
6935 			break;
6936 		}
6937 		rc = bdev_open(bdev, false, desc);
6938 		if (rc != 0) {
6939 			bdev_desc_free(desc);
6940 			break;
6941 		}
6942 		pthread_mutex_unlock(&g_bdev_mgr.mutex);
6943 
6944 		rc = fn(ctx, bdev);
6945 
6946 		pthread_mutex_lock(&g_bdev_mgr.mutex);
6947 		tmp = spdk_bdev_next_leaf(bdev);
6948 		bdev_close(bdev, desc);
6949 		if (rc != 0) {
6950 			break;
6951 		}
6952 		bdev = tmp;
6953 	}
6954 	pthread_mutex_unlock(&g_bdev_mgr.mutex);
6955 
6956 	return rc;
6957 }
6958 
6959 void
6960 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
6961 {
6962 	struct iovec *iovs;
6963 	int iovcnt;
6964 
6965 	if (bdev_io == NULL) {
6966 		return;
6967 	}
6968 
6969 	switch (bdev_io->type) {
6970 	case SPDK_BDEV_IO_TYPE_READ:
6971 	case SPDK_BDEV_IO_TYPE_WRITE:
6972 	case SPDK_BDEV_IO_TYPE_ZCOPY:
6973 		iovs = bdev_io->u.bdev.iovs;
6974 		iovcnt = bdev_io->u.bdev.iovcnt;
6975 		break;
6976 	default:
6977 		iovs = NULL;
6978 		iovcnt = 0;
6979 		break;
6980 	}
6981 
6982 	if (iovp) {
6983 		*iovp = iovs;
6984 	}
6985 	if (iovcntp) {
6986 		*iovcntp = iovcnt;
6987 	}
6988 }
6989 
6990 void *
6991 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
6992 {
6993 	if (bdev_io == NULL) {
6994 		return NULL;
6995 	}
6996 
6997 	if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
6998 		return NULL;
6999 	}
7000 
7001 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
7002 	    bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
7003 		return bdev_io->u.bdev.md_buf;
7004 	}
7005 
7006 	return NULL;
7007 }
7008 
7009 void *
7010 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
7011 {
7012 	if (bdev_io == NULL) {
7013 		assert(false);
7014 		return NULL;
7015 	}
7016 
7017 	return bdev_io->internal.caller_ctx;
7018 }
7019 
7020 void
7021 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
7022 {
7023 
7024 	if (spdk_bdev_module_list_find(bdev_module->name)) {
7025 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
7026 		assert(false);
7027 	}
7028 
7029 	/*
7030 	 * Modules with examine callbacks must be initialized first, so they are
7031 	 *  ready to handle examine callbacks from later modules that will
7032 	 *  register physical bdevs.
7033 	 */
7034 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
7035 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
7036 	} else {
7037 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
7038 	}
7039 }
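
/*
 * Bdev modules normally do not call spdk_bdev_module_list_add() directly;
 * they register through the SPDK_BDEV_MODULE_REGISTER() macro from
 * spdk/bdev_module.h, which invokes it from a constructor.  Sketch (all
 * "my_*" names are illustrative):
 *
 *	static struct spdk_bdev_module my_if = {
 *		.name		= "my_bdev",
 *		.module_init	= my_bdev_init,
 *		.module_fini	= my_bdev_fini,
 *		.examine_disk	= my_bdev_examine,
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(my_bdev, &my_if)
 */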
7040 
7041 struct spdk_bdev_module *
7042 spdk_bdev_module_list_find(const char *name)
7043 {
7044 	struct spdk_bdev_module *bdev_module;
7045 
7046 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
7047 		if (strcmp(name, bdev_module->name) == 0) {
7048 			break;
7049 		}
7050 	}
7051 
7052 	return bdev_module;
7053 }
7054 
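/*
 * Write-zeroes emulation: when a bdev does not natively support
 * SPDK_BDEV_IO_TYPE_WRITE_ZEROES, the request is serviced by repeatedly
 * writing up to ZERO_BUFFER_SIZE bytes from the shared g_bdev_mgr.zero_buffer
 * until split_remaining_num_blocks reaches zero.  bdev_write_zero_buffer_done()
 * below either kicks off the next chunk or completes the parent I/O.
 */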
7055 static void
7056 bdev_write_zero_buffer_next(void *_bdev_io)
7057 {
7058 	struct spdk_bdev_io *bdev_io = _bdev_io;
7059 	uint64_t num_bytes, num_blocks;
7060 	void *md_buf = NULL;
7061 	int rc;
7062 
7063 	num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) *
7064 			     bdev_io->u.bdev.split_remaining_num_blocks,
7065 			     ZERO_BUFFER_SIZE);
7066 	num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev);
7067 
7068 	if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
7069 		md_buf = (char *)g_bdev_mgr.zero_buffer +
7070 			 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
7071 	}
7072 
7073 	rc = bdev_write_blocks_with_md(bdev_io->internal.desc,
7074 				       spdk_io_channel_from_ctx(bdev_io->internal.ch),
7075 				       g_bdev_mgr.zero_buffer, md_buf,
7076 				       bdev_io->u.bdev.split_current_offset_blocks, num_blocks,
7077 				       bdev_write_zero_buffer_done, bdev_io);
7078 	if (rc == 0) {
7079 		bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks;
7080 		bdev_io->u.bdev.split_current_offset_blocks += num_blocks;
7081 	} else if (rc == -ENOMEM) {
7082 		bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next);
7083 	} else {
7084 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7085 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
7086 	}
7087 }
7088 
7089 static void
7090 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
7091 {
7092 	struct spdk_bdev_io *parent_io = cb_arg;
7093 
7094 	spdk_bdev_free_io(bdev_io);
7095 
7096 	if (!success) {
7097 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7098 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
7099 		return;
7100 	}
7101 
7102 	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
7103 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7104 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
7105 		return;
7106 	}
7107 
7108 	bdev_write_zero_buffer_next(parent_io);
7109 }
7110 
7111 static void
7112 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
7113 {
7114 	pthread_mutex_lock(&ctx->bdev->internal.mutex);
7115 	ctx->bdev->internal.qos_mod_in_progress = false;
7116 	pthread_mutex_unlock(&ctx->bdev->internal.mutex);
7117 
7118 	if (ctx->cb_fn) {
7119 		ctx->cb_fn(ctx->cb_arg, status);
7120 	}
7121 	free(ctx);
7122 }
7123 
7124 static void
7125 bdev_disable_qos_done(void *cb_arg)
7126 {
7127 	struct set_qos_limit_ctx *ctx = cb_arg;
7128 	struct spdk_bdev *bdev = ctx->bdev;
7129 	struct spdk_bdev_io *bdev_io;
7130 	struct spdk_bdev_qos *qos;
7131 
7132 	pthread_mutex_lock(&bdev->internal.mutex);
7133 	qos = bdev->internal.qos;
7134 	bdev->internal.qos = NULL;
7135 	pthread_mutex_unlock(&bdev->internal.mutex);
7136 
7137 	while (!TAILQ_EMPTY(&qos->queued)) {
7138 		/* Send queued I/O back to their original thread for resubmission. */
7139 		bdev_io = TAILQ_FIRST(&qos->queued);
7140 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
7141 
7142 		if (bdev_io->internal.io_submit_ch) {
7143 			/*
7144 			 * The channel was changed when this I/O was sent to the QoS thread - change it back
7145 			 *  before sending it back to the original thread.
7146 			 */
7147 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
7148 			bdev_io->internal.io_submit_ch = NULL;
7149 		}
7150 
7151 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7152 				     _bdev_io_submit, bdev_io);
7153 	}
7154 
7155 	if (qos->thread != NULL) {
7156 		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
7157 		spdk_poller_unregister(&qos->poller);
7158 	}
7159 
7160 	free(qos);
7161 
7162 	bdev_set_qos_limit_done(ctx, 0);
7163 }
7164 
7165 static void
7166 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
7167 {
7168 	void *io_device = spdk_io_channel_iter_get_io_device(i);
7169 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
7170 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7171 	struct spdk_thread *thread;
7172 
7173 	pthread_mutex_lock(&bdev->internal.mutex);
7174 	thread = bdev->internal.qos->thread;
7175 	pthread_mutex_unlock(&bdev->internal.mutex);
7176 
7177 	if (thread != NULL) {
7178 		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
7179 	} else {
7180 		bdev_disable_qos_done(ctx);
7181 	}
7182 }
7183 
7184 static void
7185 bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
7186 {
7187 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
7188 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
7189 
7190 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
7191 
7192 	spdk_for_each_channel_continue(i, 0);
7193 }
7194 
7195 static void
7196 bdev_update_qos_rate_limit_msg(void *cb_arg)
7197 {
7198 	struct set_qos_limit_ctx *ctx = cb_arg;
7199 	struct spdk_bdev *bdev = ctx->bdev;
7200 
7201 	pthread_mutex_lock(&bdev->internal.mutex);
7202 	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
7203 	pthread_mutex_unlock(&bdev->internal.mutex);
7204 
7205 	bdev_set_qos_limit_done(ctx, 0);
7206 }
7207 
7208 static void
7209 bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
7210 {
7211 	void *io_device = spdk_io_channel_iter_get_io_device(i);
7212 	struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
7213 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
7214 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
7215 
7216 	pthread_mutex_lock(&bdev->internal.mutex);
7217 	bdev_enable_qos(bdev, bdev_ch);
7218 	pthread_mutex_unlock(&bdev->internal.mutex);
7219 	spdk_for_each_channel_continue(i, 0);
7220 }
7221 
7222 static void
7223 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
7224 {
7225 	struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7226 
7227 	bdev_set_qos_limit_done(ctx, status);
7228 }
7229 
7230 static void
7231 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
7232 {
7233 	int i;
7234 
7235 	assert(bdev->internal.qos != NULL);
7236 
7237 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
7238 		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
7239 			bdev->internal.qos->rate_limits[i].limit = limits[i];
7240 
7241 			if (limits[i] == 0) {
7242 				bdev->internal.qos->rate_limits[i].limit =
7243 					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
7244 			}
7245 		}
7246 	}
7247 }
7248 
7249 void
7250 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
7251 			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
7252 {
7253 	struct set_qos_limit_ctx	*ctx;
7254 	uint32_t			limit_set_complement;
7255 	uint64_t			min_limit_per_sec;
7256 	int				i;
7257 	bool				disable_rate_limit = true;
7258 
7259 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
7260 		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
7261 			continue;
7262 		}
7263 
7264 		if (limits[i] > 0) {
7265 			disable_rate_limit = false;
7266 		}
7267 
7268 		if (bdev_qos_is_iops_rate_limit(i) == true) {
7269 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
7270 		} else {
7271 			/* Change from megabyte to byte rate limit */
7272 			limits[i] = limits[i] * 1024 * 1024;
7273 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
7274 		}
7275 
7276 		limit_set_complement = limits[i] % min_limit_per_sec;
7277 		if (limit_set_complement) {
7278 			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
7279 				    limits[i], min_limit_per_sec);
7280 			limits[i] += min_limit_per_sec - limit_set_complement;
7281 			SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
7282 		}
7283 	}
7284 
7285 	ctx = calloc(1, sizeof(*ctx));
7286 	if (ctx == NULL) {
7287 		cb_fn(cb_arg, -ENOMEM);
7288 		return;
7289 	}
7290 
7291 	ctx->cb_fn = cb_fn;
7292 	ctx->cb_arg = cb_arg;
7293 	ctx->bdev = bdev;
7294 
7295 	pthread_mutex_lock(&bdev->internal.mutex);
7296 	if (bdev->internal.qos_mod_in_progress) {
7297 		pthread_mutex_unlock(&bdev->internal.mutex);
7298 		free(ctx);
7299 		cb_fn(cb_arg, -EAGAIN);
7300 		return;
7301 	}
7302 	bdev->internal.qos_mod_in_progress = true;
7303 
7304 	if (disable_rate_limit == true && bdev->internal.qos) {
7305 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
7306 			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
7307 			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
7308 			     bdev->internal.qos->rate_limits[i].limit !=
7309 			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
7310 				disable_rate_limit = false;
7311 				break;
7312 			}
7313 		}
7314 	}
7315 
7316 	if (disable_rate_limit == false) {
7317 		if (bdev->internal.qos == NULL) {
7318 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
7319 			if (!bdev->internal.qos) {
7320 				pthread_mutex_unlock(&bdev->internal.mutex);
7321 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
7322 				bdev_set_qos_limit_done(ctx, -ENOMEM);
7323 				return;
7324 			}
7325 		}
7326 
7327 		if (bdev->internal.qos->thread == NULL) {
7328 			/* Enabling */
7329 			bdev_set_qos_rate_limits(bdev, limits);
7330 
7331 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
7332 					      bdev_enable_qos_msg, ctx,
7333 					      bdev_enable_qos_done);
7334 		} else {
7335 			/* Updating */
7336 			bdev_set_qos_rate_limits(bdev, limits);
7337 
7338 			spdk_thread_send_msg(bdev->internal.qos->thread,
7339 					     bdev_update_qos_rate_limit_msg, ctx);
7340 		}
7341 	} else {
7342 		if (bdev->internal.qos != NULL) {
7343 			bdev_set_qos_rate_limits(bdev, limits);
7344 
7345 			/* Disabling */
7346 			spdk_for_each_channel(__bdev_to_io_dev(bdev),
7347 					      bdev_disable_qos_msg, ctx,
7348 					      bdev_disable_qos_msg_done);
7349 		} else {
7350 			pthread_mutex_unlock(&bdev->internal.mutex);
7351 			bdev_set_qos_limit_done(ctx, 0);
7352 			return;
7353 		}
7354 	}
7355 
7356 	pthread_mutex_unlock(&bdev->internal.mutex);
7357 }
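
/*
 * Minimal usage sketch for spdk_bdev_set_qos_rate_limits() (callback and
 * context names are illustrative).  Entries left at
 * SPDK_BDEV_QOS_LIMIT_NOT_DEFINED are not changed, 0 disables that limit,
 * and the bytes-per-second limits are passed in MiB/s:
 *
 *	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *	int i;
 *
 *	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *		limits[i] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
 *	}
 *	limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;	// 10k IO/s
 *	limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100;		// 100 MiB/s
 *	spdk_bdev_set_qos_rate_limits(bdev, limits, my_qos_cb, my_ctx);
 *
 * The callback receives 0 on success, or a negative errno such as -EAGAIN
 * when another QoS modification is already in progress.
 */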
7358 
7359 struct spdk_bdev_histogram_ctx {
7360 	spdk_bdev_histogram_status_cb cb_fn;
7361 	void *cb_arg;
7362 	struct spdk_bdev *bdev;
7363 	int status;
7364 };
7365 
7366 static void
7367 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status)
7368 {
7369 	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7370 
7371 	pthread_mutex_lock(&ctx->bdev->internal.mutex);
7372 	ctx->bdev->internal.histogram_in_progress = false;
7373 	pthread_mutex_unlock(&ctx->bdev->internal.mutex);
7374 	ctx->cb_fn(ctx->cb_arg, ctx->status);
7375 	free(ctx);
7376 }
7377 
7378 static void
7379 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i)
7380 {
7381 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
7382 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7383 
7384 	if (ch->histogram != NULL) {
7385 		spdk_histogram_data_free(ch->histogram);
7386 		ch->histogram = NULL;
7387 	}
7388 	spdk_for_each_channel_continue(i, 0);
7389 }
7390 
7391 static void
7392 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status)
7393 {
7394 	struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7395 
7396 	if (status != 0) {
7397 		ctx->status = status;
7398 		ctx->bdev->internal.histogram_enabled = false;
7399 		spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx,
7400 				      bdev_histogram_disable_channel_cb);
7401 	} else {
7402 		pthread_mutex_lock(&ctx->bdev->internal.mutex);
7403 		ctx->bdev->internal.histogram_in_progress = false;
7404 		pthread_mutex_unlock(&ctx->bdev->internal.mutex);
7405 		ctx->cb_fn(ctx->cb_arg, ctx->status);
7406 		free(ctx);
7407 	}
7408 }
7409 
7410 static void
7411 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i)
7412 {
7413 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
7414 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7415 	int status = 0;
7416 
7417 	if (ch->histogram == NULL) {
7418 		ch->histogram = spdk_histogram_data_alloc();
7419 		if (ch->histogram == NULL) {
7420 			status = -ENOMEM;
7421 		}
7422 	}
7423 
7424 	spdk_for_each_channel_continue(i, status);
7425 }
7426 
7427 void
7428 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
7429 			   void *cb_arg, bool enable)
7430 {
7431 	struct spdk_bdev_histogram_ctx *ctx;
7432 
7433 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
7434 	if (ctx == NULL) {
7435 		cb_fn(cb_arg, -ENOMEM);
7436 		return;
7437 	}
7438 
7439 	ctx->bdev = bdev;
7440 	ctx->status = 0;
7441 	ctx->cb_fn = cb_fn;
7442 	ctx->cb_arg = cb_arg;
7443 
7444 	pthread_mutex_lock(&bdev->internal.mutex);
7445 	if (bdev->internal.histogram_in_progress) {
7446 		pthread_mutex_unlock(&bdev->internal.mutex);
7447 		free(ctx);
7448 		cb_fn(cb_arg, -EAGAIN);
7449 		return;
7450 	}
7451 
7452 	bdev->internal.histogram_in_progress = true;
7453 	pthread_mutex_unlock(&bdev->internal.mutex);
7454 
7455 	bdev->internal.histogram_enabled = enable;
7456 
7457 	if (enable) {
7458 		/* Allocate histogram for each channel */
7459 		spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx,
7460 				      bdev_histogram_enable_channel_cb);
7461 	} else {
7462 		spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx,
7463 				      bdev_histogram_disable_channel_cb);
7464 	}
7465 }
7466 
7467 struct spdk_bdev_histogram_data_ctx {
7468 	spdk_bdev_histogram_data_cb cb_fn;
7469 	void *cb_arg;
7470 	struct spdk_bdev *bdev;
7471 	/** merged histogram data from all channels */
7472 	struct spdk_histogram_data	*histogram;
7473 };
7474 
7475 static void
7476 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status)
7477 {
7478 	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7479 
7480 	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
7481 	free(ctx);
7482 }
7483 
7484 static void
7485 bdev_histogram_get_channel(struct spdk_io_channel_iter *i)
7486 {
7487 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
7488 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7489 	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7490 	int status = 0;
7491 
7492 	if (ch->histogram == NULL) {
7493 		status = -EFAULT;
7494 	} else {
7495 		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
7496 	}
7497 
7498 	spdk_for_each_channel_continue(i, status);
7499 }
7500 
7501 void
7502 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
7503 			spdk_bdev_histogram_data_cb cb_fn,
7504 			void *cb_arg)
7505 {
7506 	struct spdk_bdev_histogram_data_ctx *ctx;
7507 
7508 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
7509 	if (ctx == NULL) {
7510 		cb_fn(cb_arg, -ENOMEM, NULL);
7511 		return;
7512 	}
7513 
7514 	ctx->bdev = bdev;
7515 	ctx->cb_fn = cb_fn;
7516 	ctx->cb_arg = cb_arg;
7517 
7518 	ctx->histogram = histogram;
7519 
7520 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx,
7521 			      bdev_histogram_get_channel_cb);
7522 }
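
/*
 * Minimal usage sketch for the histogram APIs (callback names are
 * illustrative): enable per-channel histograms first, then periodically
 * collect the merged data.
 *
 *	spdk_bdev_histogram_enable(bdev, enable_done_cb, ctx, true);
 *	...
 *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
 *	spdk_bdev_histogram_get(bdev, h, get_done_cb, ctx);
 *
 * get_done_cb() receives the merged histogram, or -EFAULT if some channel has
 * no histogram allocated.  The caller owns 'h' and releases it with
 * spdk_histogram_data_free() when finished.
 */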
7523 
7524 size_t
7525 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
7526 			   size_t max_events)
7527 {
7528 	struct media_event_entry *entry;
7529 	size_t num_events = 0;
7530 
7531 	for (; num_events < max_events; ++num_events) {
7532 		entry = TAILQ_FIRST(&desc->pending_media_events);
7533 		if (entry == NULL) {
7534 			break;
7535 		}
7536 
7537 		events[num_events] = entry->event;
7538 		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
7539 		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
7540 	}
7541 
7542 	return num_events;
7543 }
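
/*
 * Consumers typically drain media events from the descriptor's event
 * callback after receiving SPDK_BDEV_EVENT_MEDIA_MANAGEMENT.  Sketch:
 *
 *	struct spdk_bdev_media_event events[8];
 *	size_t n;
 *
 *	do {
 *		n = spdk_bdev_get_media_events(desc, events, SPDK_COUNTOF(events));
 *		// ... handle events[0 .. n-1] ...
 *	} while (n == SPDK_COUNTOF(events));
 */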
7544 
7545 int
7546 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
7547 			    size_t num_events)
7548 {
7549 	struct spdk_bdev_desc *desc;
7550 	struct media_event_entry *entry;
7551 	size_t event_id;
7552 	int rc = 0;
7553 
7554 	assert(bdev->media_events);
7555 
7556 	pthread_mutex_lock(&bdev->internal.mutex);
7557 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
7558 		if (desc->write) {
7559 			break;
7560 		}
7561 	}
7562 
7563 	if (desc == NULL || desc->media_events_buffer == NULL) {
7564 		rc = -ENODEV;
7565 		goto out;
7566 	}
7567 
7568 	for (event_id = 0; event_id < num_events; ++event_id) {
7569 		entry = TAILQ_FIRST(&desc->free_media_events);
7570 		if (entry == NULL) {
7571 			break;
7572 		}
7573 
7574 		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
7575 		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
7576 		entry->event = events[event_id];
7577 	}
7578 
7579 	rc = event_id;
7580 out:
7581 	pthread_mutex_unlock(&bdev->internal.mutex);
7582 	return rc;
7583 }
7584 
7585 void
7586 spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
7587 {
7588 	struct spdk_bdev_desc *desc;
7589 
7590 	pthread_mutex_lock(&bdev->internal.mutex);
7591 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
7592 		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
7593 			desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev,
7594 						desc->callback.ctx);
7595 		}
7596 	}
7597 	pthread_mutex_unlock(&bdev->internal.mutex);
7598 }
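
/*
 * The producing bdev module first queues events with
 * spdk_bdev_push_media_events() and then calls
 * spdk_bdev_notify_media_management(), so that every open descriptor with
 * pending events receives an SPDK_BDEV_EVENT_MEDIA_MANAGEMENT callback.
 */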
7599 
7600 struct locked_lba_range_ctx {
7601 	struct lba_range		range;
7602 	struct spdk_bdev		*bdev;
7603 	struct lba_range		*current_range;
7604 	struct lba_range		*owner_range;
7605 	struct spdk_poller		*poller;
7606 	lock_range_cb			cb_fn;
7607 	void				*cb_arg;
7608 };
7609 
7610 static void
7611 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status)
7612 {
7613 	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7614 
7615 	ctx->cb_fn(ctx->cb_arg, -ENOMEM);
7616 	free(ctx);
7617 }
7618 
7619 static void bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i);
7620 
7621 static void
7622 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
7623 {
7624 	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7625 	struct spdk_bdev *bdev = ctx->bdev;
7626 
7627 	if (status == -ENOMEM) {
7628 		/* One of the channels could not allocate a range object.
7629 		 * So we have to go back and clean up any ranges that were
7630 		 * allocated successfully before we return error status to
7631 		 * the caller.  We can reuse the unlock function to do that
7632 		 * clean up.
7633 		 */
7634 		spdk_for_each_channel(__bdev_to_io_dev(bdev),
7635 				      bdev_unlock_lba_range_get_channel, ctx,
7636 				      bdev_lock_error_cleanup_cb);
7637 		return;
7638 	}
7639 
7640 	/* All channels have locked this range and no I/O overlapping the range
7641 	 * is outstanding!  Set the owner_ch for the range object of the
7642 	 * locking channel, so that this channel will know that it is allowed
7643 	 * to write to this range.
7644 	 */
7645 	ctx->owner_range->owner_ch = ctx->range.owner_ch;
7646 	ctx->cb_fn(ctx->cb_arg, status);
7647 
7648 	/* Don't free the ctx here.  Its range is still in the bdev's global list of
7649 	 * locked ranges, and will be removed and freed when this range
7650 	 * is later unlocked.
7651 	 */
7652 }
7653 
7654 static int
7655 bdev_lock_lba_range_check_io(void *_i)
7656 {
7657 	struct spdk_io_channel_iter *i = _i;
7658 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
7659 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7660 	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7661 	struct lba_range *range = ctx->current_range;
7662 	struct spdk_bdev_io *bdev_io;
7663 
7664 	spdk_poller_unregister(&ctx->poller);
7665 
7666 	/* The range is now in the locked_ranges, so no new IO can be submitted to this
7667 	 * range.  But we need to wait until any outstanding IO overlapping with this range
7668 	 * has completed.
7669 	 */
7670 	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
7671 		if (bdev_io_range_is_locked(bdev_io, range)) {
7672 			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
7673 			return SPDK_POLLER_BUSY;
7674 		}
7675 	}
7676 
7677 	spdk_for_each_channel_continue(i, 0);
7678 	return SPDK_POLLER_BUSY;
7679 }
7680 
7681 static void
7682 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i)
7683 {
7684 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
7685 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7686 	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7687 	struct lba_range *range;
7688 
7689 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
7690 		if (range->length == ctx->range.length &&
7691 		    range->offset == ctx->range.offset &&
7692 		    range->locked_ctx == ctx->range.locked_ctx) {
7693 			/* This range already exists on this channel, so don't add
7694 			 * it again.  This can happen when a new channel is created
7695 			 * while the for_each_channel operation is in progress.
7696 			 * Do not check for outstanding I/O in that case, since the
7697 			 * range was locked before any I/O could be submitted to the
7698 			 * new channel.
7699 			 */
7700 			spdk_for_each_channel_continue(i, 0);
7701 			return;
7702 		}
7703 	}
7704 
7705 	range = calloc(1, sizeof(*range));
7706 	if (range == NULL) {
7707 		spdk_for_each_channel_continue(i, -ENOMEM);
7708 		return;
7709 	}
7710 
7711 	range->length = ctx->range.length;
7712 	range->offset = ctx->range.offset;
7713 	range->locked_ctx = ctx->range.locked_ctx;
7714 	ctx->current_range = range;
7715 	if (ctx->range.owner_ch == ch) {
7716 		/* This is the range object for the channel that will hold
7717 		 * the lock.  Store it in the ctx object so that we can easily
7718 		 * set its owner_ch after the lock is finally acquired.
7719 		 */
7720 		ctx->owner_range = range;
7721 	}
7722 	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
7723 	bdev_lock_lba_range_check_io(i);
7724 }
7725 
7726 static void
7727 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
7728 {
7729 	assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel));
7730 
7731 	/* We will add a copy of this range to each channel now. */
7732 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx,
7733 			      bdev_lock_lba_range_cb);
7734 }
7735 
7736 static bool
7737 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
7738 {
7739 	struct lba_range *r;
7740 
7741 	TAILQ_FOREACH(r, tailq, tailq) {
7742 		if (bdev_lba_range_overlapped(range, r)) {
7743 			return true;
7744 		}
7745 	}
7746 	return false;
7747 }
7748 
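/*
 * LBA range locking overview: bdev_lock_lba_range() records the requested
 * range in bdev->internal.locked_ranges (or in pending_locked_ranges if it
 * overlaps an already-locked range) and then uses spdk_for_each_channel() so
 * that every channel adds its own copy of the range and waits for any
 * overlapping outstanding I/O to drain.  bdev_unlock_lba_range() reverses the
 * process and starts the lock sequence for any pending range that no longer
 * conflicts.
 */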
7749 static int
7750 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
7751 		    uint64_t offset, uint64_t length,
7752 		    lock_range_cb cb_fn, void *cb_arg)
7753 {
7754 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7755 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7756 	struct locked_lba_range_ctx *ctx;
7757 
7758 	if (cb_arg == NULL) {
7759 		SPDK_ERRLOG("cb_arg must not be NULL\n");
7760 		return -EINVAL;
7761 	}
7762 
7763 	ctx = calloc(1, sizeof(*ctx));
7764 	if (ctx == NULL) {
7765 		return -ENOMEM;
7766 	}
7767 
7768 	ctx->range.offset = offset;
7769 	ctx->range.length = length;
7770 	ctx->range.owner_ch = ch;
7771 	ctx->range.locked_ctx = cb_arg;
7772 	ctx->bdev = bdev;
7773 	ctx->cb_fn = cb_fn;
7774 	ctx->cb_arg = cb_arg;
7775 
7776 	pthread_mutex_lock(&bdev->internal.mutex);
7777 	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
7778 		/* There is an active lock overlapping with this range.
7779 		 * Put it on the pending list until this range no
7780 		 * longer overlaps with another.
7781 		 */
7782 		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
7783 	} else {
7784 		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
7785 		bdev_lock_lba_range_ctx(bdev, ctx);
7786 	}
7787 	pthread_mutex_unlock(&bdev->internal.mutex);
7788 	return 0;
7789 }
7790 
7791 static void
7792 bdev_lock_lba_range_ctx_msg(void *_ctx)
7793 {
7794 	struct locked_lba_range_ctx *ctx = _ctx;
7795 
7796 	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
7797 }
7798 
7799 static void
7800 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
7801 {
7802 	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7803 	struct locked_lba_range_ctx *pending_ctx;
7804 	struct spdk_bdev_channel *ch = ctx->range.owner_ch;
7805 	struct spdk_bdev *bdev = ch->bdev;
7806 	struct lba_range *range, *tmp;
7807 
7808 	pthread_mutex_lock(&bdev->internal.mutex);
7809 	/* Check if there are any pending locked ranges that overlap with this range
7810 	 * that was just unlocked.  If there are, check that the pending range does not overlap with any
7811 	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
7812 	 * the lock process.
7813 	 */
7814 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
7815 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
7816 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
7817 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
7818 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
7819 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
7820 			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
7821 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
7822 		}
7823 	}
7824 	pthread_mutex_unlock(&bdev->internal.mutex);
7825 
7826 	ctx->cb_fn(ctx->cb_arg, status);
7827 	free(ctx);
7828 }
7829 
7830 static void
7831 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
7832 {
7833 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
7834 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7835 	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7836 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
7837 	struct spdk_bdev_io *bdev_io;
7838 	struct lba_range *range;
7839 
7840 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
7841 		if (ctx->range.offset == range->offset &&
7842 		    ctx->range.length == range->length &&
7843 		    ctx->range.locked_ctx == range->locked_ctx) {
7844 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
7845 			free(range);
7846 			break;
7847 		}
7848 	}
7849 
7850 	/* Note: we should almost always be able to assert that the range specified
7851 	 * was found.  But there are some very rare corner cases where a new channel
7852 	 * gets created simultaneously with a range unlock, where this function
7853 	 * would execute on that new channel and wouldn't have the range.
7854 	 * We also use this to clean up range allocations when a later allocation
7855 	 * fails in the locking path.
7856 	 * So we can't actually assert() here.
7857 	 */
7858 
7859 	/* Swap the locked IO into a temporary list, and then try to submit them again.
7860 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
7861 	 * with the range that was just unlocked, but this isn't a performance path so
7862 	 * we go for simplicity here.
7863 	 */
7864 	TAILQ_INIT(&io_locked);
7865 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
7866 	while (!TAILQ_EMPTY(&io_locked)) {
7867 		bdev_io = TAILQ_FIRST(&io_locked);
7868 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
7869 		bdev_io_submit(bdev_io);
7870 	}
7871 
7872 	spdk_for_each_channel_continue(i, 0);
7873 }
7874 
7875 static int
7876 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
7877 		      uint64_t offset, uint64_t length,
7878 		      lock_range_cb cb_fn, void *cb_arg)
7879 {
7880 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7881 	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
7882 	struct locked_lba_range_ctx *ctx;
7883 	struct lba_range *range;
7884 	bool range_found = false;
7885 
7886 	/* Let's make sure the specified channel actually has a lock on
7887 	 * the specified range.  Note that the range must match exactly.
7888 	 */
7889 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
7890 		if (range->offset == offset && range->length == length &&
7891 		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
7892 			range_found = true;
7893 			break;
7894 		}
7895 	}
7896 
7897 	if (!range_found) {
7898 		return -EINVAL;
7899 	}
7900 
7901 	pthread_mutex_lock(&bdev->internal.mutex);
7902 	/* We confirmed that this channel has locked the specified range.  To
7903 	 * start the unlock process, we find the range in the bdev's locked_ranges
7904 	 * and remove it.  This ensures new channels don't inherit the locked range.
7905 	 * Then we will send a message to each channel (including the one specified
7906 	 * here) to remove the range from its per-channel list.
7907 	 */
7908 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
7909 		if (range->offset == offset && range->length == length &&
7910 		    range->locked_ctx == cb_arg) {
7911 			break;
7912 		}
7913 	}
7914 	if (range == NULL) {
7915 		assert(false);
7916 		pthread_mutex_unlock(&bdev->internal.mutex);
7917 		return -EINVAL;
7918 	}
7919 	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
7920 	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
7921 	pthread_mutex_unlock(&bdev->internal.mutex);
7922 
7923 	ctx->cb_fn = cb_fn;
7924 	ctx->cb_arg = cb_arg;
7925 
7926 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
7927 			      bdev_unlock_lba_range_cb);
7928 	return 0;
7929 }
7930 
7931 int
7932 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
7933 			     int array_size)
7934 {
7935 	if (!bdev) {
7936 		return -EINVAL;
7937 	}
7938 
7939 	if (bdev->fn_table->get_memory_domains) {
7940 		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
7941 	}
7942 
7943 	return 0;
7944 }
7945 
7946 struct spdk_bdev_for_each_io_ctx {
7947 	void *ctx;
7948 	spdk_bdev_io_fn fn;
7949 	spdk_bdev_for_each_io_cb cb;
7950 };
7951 
7952 static void
7953 bdev_channel_for_each_io(struct spdk_io_channel_iter *i)
7954 {
7955 	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7956 	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
7957 	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
7958 	struct spdk_bdev_io *bdev_io;
7959 	int rc = 0;
7960 
7961 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
7962 		rc = ctx->fn(ctx->ctx, bdev_io);
7963 		if (rc != 0) {
7964 			break;
7965 		}
7966 	}
7967 
7968 	spdk_for_each_channel_continue(i, rc);
7969 }
7970 
7971 static void
7972 bdev_for_each_io_done(struct spdk_io_channel_iter *i, int status)
7973 {
7974 	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
7975 
7976 	ctx->cb(ctx->ctx, status);
7977 
7978 	free(ctx);
7979 }
7980 
7981 void
7982 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
7983 			   spdk_bdev_for_each_io_cb cb)
7984 {
7985 	struct spdk_bdev_for_each_io_ctx *ctx;
7986 
7987 	assert(fn != NULL && cb != NULL);
7988 
7989 	ctx = calloc(1, sizeof(*ctx));
7990 	if (ctx == NULL) {
7991 		SPDK_ERRLOG("Failed to allocate context.\n");
7992 		cb(_ctx, -ENOMEM);
7993 		return;
7994 	}
7995 
7996 	ctx->ctx = _ctx;
7997 	ctx->fn = fn;
7998 	ctx->cb = cb;
7999 
8000 	spdk_for_each_channel(__bdev_to_io_dev(bdev),
8001 			      bdev_channel_for_each_io,
8002 			      ctx,
8003 			      bdev_for_each_io_done);
8004 }
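
/*
 * Minimal usage sketch for spdk_bdev_for_each_bdev_io() (callback names are
 * illustrative): visit every I/O currently submitted on any channel of the
 * given bdev.
 *
 *	static int
 *	visit_io_cb(void *ctx, struct spdk_bdev_io *bdev_io)
 *	{
 *		// inspect bdev_io; return nonzero to stop iterating this channel
 *		return 0;
 *	}
 *
 *	static void
 *	visit_done_cb(void *ctx, int status)
 *	{
 *		// status is 0, or the nonzero value that stopped the iteration
 *	}
 *
 *	spdk_bdev_for_each_bdev_io(bdev, my_ctx, visit_io_cb, visit_done_cb);
 *
 * Note that visit_io_cb() runs on the thread owning each channel, not
 * necessarily on the thread that started the iteration.
 */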
8005 
8006 SPDK_LOG_REGISTER_COMPONENT(bdev)
8007 
8008 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
8009 {
8010 	struct spdk_trace_tpoint_opts opts[] = {
8011 		{
8012 			"BDEV_IO_START", TRACE_BDEV_IO_START,
8013 			OWNER_BDEV, OBJECT_BDEV_IO, 1,
8014 			{
8015 				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
8016 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
8017 				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
8018 				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 }
8019 			}
8020 		},
8021 		{
8022 			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
8023 			OWNER_BDEV, OBJECT_BDEV_IO, 0,
8024 			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
8025 		},
8026 		{
8027 			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
8028 			OWNER_BDEV, OBJECT_NONE, 1,
8029 			{
8030 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
8031 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
8032 			}
8033 		},
8034 		{
8035 			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
8036 			OWNER_BDEV, OBJECT_NONE, 0,
8037 			{
8038 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
8039 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
8040 			}
8041 		},
8042 	};
8043 
8044 
8045 	spdk_trace_register_owner(OWNER_BDEV, 'b');
8046 	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
8047 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
8048 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
8049 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
8050 }
8051