xref: /spdk/lib/bdev/bdev.c (revision fecffda6ecf8853b82edccde429b68252f0a62c5)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 
11 #include "spdk/config.h"
12 #include "spdk/env.h"
13 #include "spdk/thread.h"
14 #include "spdk/likely.h"
15 #include "spdk/queue.h"
16 #include "spdk/nvme_spec.h"
17 #include "spdk/scsi_spec.h"
18 #include "spdk/notify.h"
19 #include "spdk/util.h"
20 #include "spdk/trace.h"
21 #include "spdk/dma.h"
22 
23 #include "spdk/bdev_module.h"
24 #include "spdk/log.h"
25 #include "spdk/string.h"
26 
27 #include "bdev_internal.h"
28 #include "spdk_internal/trace_defs.h"
29 
30 #ifdef SPDK_CONFIG_VTUNE
31 #include "ittnotify.h"
32 #include "ittnotify_types.h"
33 int __itt_init_ittlib(const char *, __itt_group_id);
34 #endif
35 
36 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
37 #define SPDK_BDEV_IO_CACHE_SIZE			256
38 #define SPDK_BDEV_AUTO_EXAMINE			true
39 #define BUF_SMALL_POOL_SIZE			8191
40 #define BUF_LARGE_POOL_SIZE			1023
41 #define BUF_SMALL_CACHE_SIZE			128
42 #define BUF_LARGE_CACHE_SIZE			16
43 #define NOMEM_THRESHOLD_COUNT			8
44 
45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
50 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
51 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
52 
53 /* The maximum number of child requests issued at a time when splitting
54  * a UNMAP or WRITE ZEROES command.
55  */
56 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
57 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
58 
59 /* The maximum number of child requests issued at a time when splitting
60  * a COPY command.
61  */
62 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
63 
64 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
65 				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
66 				    };
67 
68 TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
69 
70 RB_HEAD(bdev_name_tree, spdk_bdev_name);
71 
72 static int
73 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
74 {
75 	return strcmp(name1->name, name2->name);
76 }
77 
78 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
79 
80 struct spdk_bdev_mgr {
81 	struct spdk_mempool *bdev_io_pool;
82 
83 	void *zero_buffer;
84 
85 	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
86 
87 	struct spdk_bdev_list bdevs;
88 	struct bdev_name_tree bdev_names;
89 
90 	bool init_complete;
91 	bool module_init_complete;
92 
93 	struct spdk_spinlock spinlock;
94 
95 #ifdef SPDK_CONFIG_VTUNE
96 	__itt_domain	*domain;
97 #endif
98 };
99 
100 static struct spdk_bdev_mgr g_bdev_mgr = {
101 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
102 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
103 	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
104 	.init_complete = false,
105 	.module_init_complete = false,
106 };
107 
108 static void
109 __attribute__((constructor))
110 _bdev_init(void)
111 {
112 	spdk_spin_init(&g_bdev_mgr.spinlock);
113 }
114 
115 typedef void (*lock_range_cb)(void *ctx, int status);
116 
117 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
118 
119 struct lba_range {
120 	uint64_t			offset;
121 	uint64_t			length;
122 	void				*locked_ctx;
123 	struct spdk_bdev_channel	*owner_ch;
124 	TAILQ_ENTRY(lba_range)		tailq;
125 };
126 
127 static struct spdk_bdev_opts	g_bdev_opts = {
128 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
129 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
130 	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
131 	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
132 	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
133 };
134 
135 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
136 static void			*g_init_cb_arg = NULL;
137 
138 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
139 static void			*g_fini_cb_arg = NULL;
140 static struct spdk_thread	*g_fini_thread = NULL;
141 
142 struct spdk_bdev_qos_limit {
143 	/** IOs or bytes allowed per second (i.e., 1s). */
144 	uint64_t limit;
145 
146 	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
147 	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
148 	 *  some bytes are remaining, but the I/O is bigger than that amount. The
149 	 *  excess will be deducted from the next timeslice.
150 	 */
151 	int64_t remaining_this_timeslice;
152 
153 	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
154 	uint32_t min_per_timeslice;
155 
156 	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
157 	uint32_t max_per_timeslice;
158 
159 	/** Function to check whether to queue the IO. */
160 	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
161 
162 	/** Function to update for the submitted IO. */
163 	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
164 };
165 
166 struct spdk_bdev_qos {
167 	/** Rate limits, one entry per rate limit type. */
168 	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
169 
170 	/** The channel that all I/O are funneled through. */
171 	struct spdk_bdev_channel *ch;
172 
173 	/** The thread on which the poller is running. */
174 	struct spdk_thread *thread;
175 
176 	/** Queue of I/O waiting to be issued. */
177 	bdev_io_tailq_t queued;
178 
179 	/** Size of a timeslice in tsc ticks. */
180 	uint64_t timeslice_size;
181 
182 	/** Timestamp of start of last timeslice. */
183 	uint64_t last_timeslice;
184 
185 	/** Poller that processes queued I/O commands each time slice. */
186 	struct spdk_poller *poller;
187 };
188 
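/*
 * Illustrative example (added note, values are hypothetical): with a limit of
 * rw_ios_per_sec = 10000 and the default SPDK_BDEV_QOS_TIMESLICE_IN_USEC of
 * 1000 (1 ms), each timeslice allows roughly 10000 / (1000000 / 1000) = 10 I/Os,
 * i.e. max_per_timeslice = 10.  Unused or overdrawn quota carries into the next
 * timeslice via remaining_this_timeslice, which may go negative for byte limits.
 */
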
189 struct spdk_bdev_mgmt_channel {
190 	/*
191 	 * Each thread keeps a cache of bdev_io - this allows
192 	 *  bdev threads which are *not* DPDK threads to still
193 	 *  benefit from a per-thread bdev_io cache.  Without
194 	 *  this, non-DPDK threads fetching from the mempool
195 	 *  incur a cmpxchg on get and put.
196 	 */
197 	bdev_io_stailq_t per_thread_cache;
198 	uint32_t	per_thread_cache_count;
199 	uint32_t	bdev_io_cache_size;
200 
201 	struct spdk_iobuf_channel iobuf;
202 
203 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
204 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
205 };
206 
207 /*
208  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
209  * queue their I/O awaiting retry here. This makes it possible to retry sending
210  * I/O to one bdev after I/O from another bdev completes.
211  */
212 struct spdk_bdev_shared_resource {
213 	/* The bdev management channel */
214 	struct spdk_bdev_mgmt_channel *mgmt_ch;
215 
216 	/*
217 	 * Count of I/O submitted to bdev module and waiting for completion.
218 	 * Incremented before submit_request() is called on an spdk_bdev_io.
219 	 */
220 	uint64_t		io_outstanding;
221 
222 	/*
223 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
224 	 *  on this channel.
225 	 */
226 	bdev_io_tailq_t		nomem_io;
227 
228 	/*
229 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
230 	 */
231 	uint64_t		nomem_threshold;
232 
233 	/* I/O channel allocated by a bdev module */
234 	struct spdk_io_channel	*shared_ch;
235 
236 	/* Refcount of bdev channels using this resource */
237 	uint32_t		ref;
238 
239 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
240 };
241 
242 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
243 #define BDEV_CH_QOS_ENABLED		(1 << 1)
244 
245 struct spdk_bdev_channel {
246 	struct spdk_bdev	*bdev;
247 
248 	/* The channel for the underlying device */
249 	struct spdk_io_channel	*channel;
250 
251 	/* Per io_device per thread data */
252 	struct spdk_bdev_shared_resource *shared_resource;
253 
254 	struct spdk_bdev_io_stat *stat;
255 
256 	/*
257 	 * Count of I/O submitted to the underlying dev module through this channel
258 	 * and waiting for completion.
259 	 */
260 	uint64_t		io_outstanding;
261 
262 	/*
263 	 * List of all submitted I/Os including I/O that are generated via splitting.
264 	 */
265 	bdev_io_tailq_t		io_submitted;
266 
267 	/*
268 	 * List of spdk_bdev_io that are currently queued because they write to a locked
269 	 * LBA range.
270 	 */
271 	bdev_io_tailq_t		io_locked;
272 
273 	uint32_t		flags;
274 
275 	struct spdk_histogram_data *histogram;
276 
277 #ifdef SPDK_CONFIG_VTUNE
278 	uint64_t		start_tsc;
279 	uint64_t		interval_tsc;
280 	__itt_string_handle	*handle;
281 	struct spdk_bdev_io_stat *prev_stat;
282 #endif
283 
284 	bdev_io_tailq_t		queued_resets;
285 
286 	lba_range_tailq_t	locked_ranges;
287 };
288 
289 struct media_event_entry {
290 	struct spdk_bdev_media_event	event;
291 	TAILQ_ENTRY(media_event_entry)	tailq;
292 };
293 
294 #define MEDIA_EVENT_POOL_SIZE 64
295 
296 struct spdk_bdev_desc {
297 	struct spdk_bdev		*bdev;
298 	struct spdk_thread		*thread;
299 	struct {
300 		spdk_bdev_event_cb_t event_fn;
301 		void *ctx;
302 	}				callback;
303 	bool				closed;
304 	bool				write;
305 	bool				memory_domains_supported;
306 	struct spdk_spinlock		spinlock;
307 	uint32_t			refs;
308 	TAILQ_HEAD(, media_event_entry)	pending_media_events;
309 	TAILQ_HEAD(, media_event_entry)	free_media_events;
310 	struct media_event_entry	*media_events_buffer;
311 	TAILQ_ENTRY(spdk_bdev_desc)	link;
312 
313 	uint64_t		timeout_in_sec;
314 	spdk_bdev_io_timeout_cb	cb_fn;
315 	void			*cb_arg;
316 	struct spdk_poller	*io_timeout_poller;
317 };
318 
319 struct spdk_bdev_iostat_ctx {
320 	struct spdk_bdev_io_stat *stat;
321 	spdk_bdev_get_device_stat_cb cb;
322 	void *cb_arg;
323 };
324 
325 struct set_qos_limit_ctx {
326 	void (*cb_fn)(void *cb_arg, int status);
327 	void *cb_arg;
328 	struct spdk_bdev *bdev;
329 };
330 
331 struct spdk_bdev_channel_iter {
332 	spdk_bdev_for_each_channel_msg fn;
333 	spdk_bdev_for_each_channel_done cpl;
334 	struct spdk_io_channel_iter *i;
335 	void *ctx;
336 };
337 
338 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
339 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
340 #define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
341 #define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
342 
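/*
 * Note (added explanation, not in the original source): the bdev pointer is
 * offset by one byte before being used as an io_device handle, most likely so
 * that the bdev layer's io_device address never collides with the bdev (or
 * module context) pointer that a bdev module may itself register as an
 * io_device.  __bdev_from_io_dev() simply undoes that offset.
 */
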
343 static inline void bdev_io_complete(void *ctx);
344 
345 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
346 static void bdev_write_zero_buffer_next(void *_bdev_io);
347 
348 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
349 				struct spdk_io_channel *ch, void *_ctx);
350 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
351 
352 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
353 				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
354 				     uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
355 				     struct spdk_bdev_ext_io_opts *opts, bool copy_opts);
356 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
357 				      struct iovec *iov, int iovcnt, void *md_buf,
358 				      uint64_t offset_blocks, uint64_t num_blocks,
359 				      spdk_bdev_io_completion_cb cb, void *cb_arg,
360 				      struct spdk_bdev_ext_io_opts *opts, bool copy_opts);
361 
362 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
363 			       uint64_t offset, uint64_t length,
364 			       lock_range_cb cb_fn, void *cb_arg);
365 
366 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
367 				 uint64_t offset, uint64_t length,
368 				 lock_range_cb cb_fn, void *cb_arg);
369 
370 static inline void bdev_io_complete(void *ctx);
371 
372 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
373 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
374 
375 void
376 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
377 {
378 	if (!opts) {
379 		SPDK_ERRLOG("opts should not be NULL\n");
380 		return;
381 	}
382 
383 	if (!opts_size) {
384 		SPDK_ERRLOG("opts_size should not be zero\n");
385 		return;
386 	}
387 
388 	opts->opts_size = opts_size;
389 
390 #define SET_FIELD(field) \
391 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
392 		opts->field = g_bdev_opts.field; \
393 	} \
394 
395 	SET_FIELD(bdev_io_pool_size);
396 	SET_FIELD(bdev_io_cache_size);
397 	SET_FIELD(bdev_auto_examine);
398 	SET_FIELD(small_buf_pool_size);
399 	SET_FIELD(large_buf_pool_size);
400 
401 	/* Do not remove this statement. You should always update it when adding a new field,
402 	 * and do not forget to add the SET_FIELD statement for your new field. */
403 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
404 
405 #undef SET_FIELD
406 }
407 
408 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
409 			      "v23.05", 0);
410 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
411 			      "v23.05", 0);
412 int
413 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
414 {
415 	struct spdk_iobuf_opts iobuf_opts;
416 	uint32_t min_pool_size;
417 	int rc;
418 
419 	if (!opts) {
420 		SPDK_ERRLOG("opts cannot be NULL\n");
421 		return -1;
422 	}
423 
424 	if (!opts->opts_size) {
425 		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
426 		return -1;
427 	}
428 
429 	/*
430 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
431 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
432 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
433 	 */
434 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
435 	if (opts->bdev_io_pool_size < min_pool_size) {
436 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
437 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
438 			    spdk_thread_get_count());
439 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
440 		return -1;
441 	}
442 
443 	if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) {
444 		SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size);
445 	}
446 	if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) {
447 		SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size);
448 	}
449 
450 #define SET_FIELD(field) \
451         if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
452                 g_bdev_opts.field = opts->field; \
453         } \
454 
455 	SET_FIELD(bdev_io_pool_size);
456 	SET_FIELD(bdev_io_cache_size);
457 	SET_FIELD(bdev_auto_examine);
458 	SET_FIELD(small_buf_pool_size);
459 	SET_FIELD(large_buf_pool_size);
460 
461 	spdk_iobuf_get_opts(&iobuf_opts);
462 	iobuf_opts.small_pool_count = opts->small_buf_pool_size;
463 	iobuf_opts.large_pool_count = opts->large_buf_pool_size;
464 
465 	rc = spdk_iobuf_set_opts(&iobuf_opts);
466 	if (rc != 0) {
467 		SPDK_ERRLOG("Failed to set iobuf opts\n");
468 		return -1;
469 	}
470 
471 	g_bdev_opts.opts_size = opts->opts_size;
472 
473 #undef SET_FIELD
474 
475 	return 0;
476 }
477 
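/*
 * Usage sketch (illustrative, not part of this file): a caller would normally
 * fetch the current defaults, override selected fields and write them back
 * before the bdev subsystem is initialized.  The pool size below is a
 * hypothetical value.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 32 * 1024 - 1;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("failed to set bdev options\n");
 *	}
 */
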
478 static struct spdk_bdev *
479 bdev_get_by_name(const char *bdev_name)
480 {
481 	struct spdk_bdev_name find;
482 	struct spdk_bdev_name *res;
483 
484 	find.name = (char *)bdev_name;
485 	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
486 	if (res != NULL) {
487 		return res->bdev;
488 	}
489 
490 	return NULL;
491 }
492 
493 struct spdk_bdev *
494 spdk_bdev_get_by_name(const char *bdev_name)
495 {
496 	struct spdk_bdev *bdev;
497 
498 	spdk_spin_lock(&g_bdev_mgr.spinlock);
499 	bdev = bdev_get_by_name(bdev_name);
500 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
501 
502 	return bdev;
503 }
504 
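/*
 * Usage sketch (illustrative): spdk_bdev_get_by_name() only looks a bdev up by
 * its name (or a registered alias); to actually send I/O a caller is expected
 * to open a descriptor instead (spdk_bdev_open_ext()).  "Malloc0" is a
 * hypothetical bdev name.
 *
 *	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *
 *	if (bdev == NULL) {
 *		SPDK_ERRLOG("bdev Malloc0 not found\n");
 *	}
 */
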
505 struct spdk_bdev_wait_for_examine_ctx {
506 	struct spdk_poller              *poller;
507 	spdk_bdev_wait_for_examine_cb	cb_fn;
508 	void				*cb_arg;
509 };
510 
511 static bool bdev_module_all_actions_completed(void);
512 
513 static int
514 bdev_wait_for_examine_cb(void *arg)
515 {
516 	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
517 
518 	if (!bdev_module_all_actions_completed()) {
519 		return SPDK_POLLER_IDLE;
520 	}
521 
522 	spdk_poller_unregister(&ctx->poller);
523 	ctx->cb_fn(ctx->cb_arg);
524 	free(ctx);
525 
526 	return SPDK_POLLER_BUSY;
527 }
528 
529 int
530 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
531 {
532 	struct spdk_bdev_wait_for_examine_ctx *ctx;
533 
534 	ctx = calloc(1, sizeof(*ctx));
535 	if (ctx == NULL) {
536 		return -ENOMEM;
537 	}
538 	ctx->cb_fn = cb_fn;
539 	ctx->cb_arg = cb_arg;
540 	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
541 
542 	return 0;
543 }
544 
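/*
 * Usage sketch (illustrative): the callback fires on the calling thread once
 * no bdev module has an examine/init action in progress.  The callback name
 * below is hypothetical.
 *
 *	static void
 *	examine_done(void *ctx)
 *	{
 *		SPDK_NOTICELOG("all bdev examine actions completed\n");
 *	}
 *
 *	int rc = spdk_bdev_wait_for_examine(examine_done, NULL);
 *	if (rc != 0) {
 *		// only fails with -ENOMEM
 *	}
 */
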
545 struct spdk_bdev_examine_item {
546 	char *name;
547 	TAILQ_ENTRY(spdk_bdev_examine_item) link;
548 };
549 
550 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
551 
552 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
553 			g_bdev_examine_allowlist);
554 
555 static inline bool
556 bdev_examine_allowlist_check(const char *name)
557 {
558 	struct spdk_bdev_examine_item *item;
559 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
560 		if (strcmp(name, item->name) == 0) {
561 			return true;
562 		}
563 	}
564 	return false;
565 }
566 
567 static inline void
568 bdev_examine_allowlist_free(void)
569 {
570 	struct spdk_bdev_examine_item *item;
571 	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
572 		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
573 		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
574 		free(item->name);
575 		free(item);
576 	}
577 }
578 
579 static inline bool
580 bdev_in_examine_allowlist(struct spdk_bdev *bdev)
581 {
582 	struct spdk_bdev_alias *tmp;
583 	if (bdev_examine_allowlist_check(bdev->name)) {
584 		return true;
585 	}
586 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
587 		if (bdev_examine_allowlist_check(tmp->alias.name)) {
588 			return true;
589 		}
590 	}
591 	return false;
592 }
593 
594 static inline bool
595 bdev_ok_to_examine(struct spdk_bdev *bdev)
596 {
597 	if (g_bdev_opts.bdev_auto_examine) {
598 		return true;
599 	} else {
600 		return bdev_in_examine_allowlist(bdev);
601 	}
602 }
603 
604 static void
605 bdev_examine(struct spdk_bdev *bdev)
606 {
607 	struct spdk_bdev_module *module;
608 	uint32_t action;
609 
610 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
611 		if (module->examine_config && bdev_ok_to_examine(bdev)) {
612 			action = module->internal.action_in_progress;
613 			module->internal.action_in_progress++;
614 			module->examine_config(bdev);
615 			if (action != module->internal.action_in_progress) {
616 				SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
617 					    module->name);
618 			}
619 		}
620 	}
621 
622 	if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
623 		if (bdev->internal.claim_module->examine_disk) {
624 			bdev->internal.claim_module->internal.action_in_progress++;
625 			bdev->internal.claim_module->examine_disk(bdev);
626 		}
627 		return;
628 	}
629 
630 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
631 		if (module->examine_disk && bdev_ok_to_examine(bdev)) {
632 			module->internal.action_in_progress++;
633 			module->examine_disk(bdev);
634 		}
635 	}
636 }
637 
638 int
639 spdk_bdev_examine(const char *name)
640 {
641 	struct spdk_bdev *bdev;
642 	struct spdk_bdev_examine_item *item;
643 
644 	if (g_bdev_opts.bdev_auto_examine) {
645 		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
646 		return -EINVAL;
647 	}
648 
649 	if (bdev_examine_allowlist_check(name)) {
650 		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
651 		return -EEXIST;
652 	}
653 
654 	item = calloc(1, sizeof(*item));
655 	if (!item) {
656 		return -ENOMEM;
657 	}
658 	item->name = strdup(name);
659 	if (!item->name) {
660 		free(item);
661 		return -ENOMEM;
662 	}
663 	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
664 
665 	bdev = spdk_bdev_get_by_name(name);
666 	if (bdev) {
667 		bdev_examine(bdev);
668 	}
669 	return 0;
670 }
671 
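/*
 * Usage sketch (illustrative): manual examine is only permitted when
 * bdev_auto_examine has been disabled via spdk_bdev_set_opts() (or the
 * bdev_set_options RPC).  "Nvme0n1" is a hypothetical bdev name; it is added
 * to the allowlist even if the bdev does not exist yet, so that it gets
 * examined as soon as it is registered.
 *
 *	int rc = spdk_bdev_examine("Nvme0n1");
 *
 *	if (rc == -EINVAL) {
 *		// auto examine is still enabled
 *	}
 */
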
672 static inline void
673 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
674 {
675 	struct spdk_bdev_examine_item *item;
676 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
677 		spdk_json_write_object_begin(w);
678 		spdk_json_write_named_string(w, "method", "bdev_examine");
679 		spdk_json_write_named_object_begin(w, "params");
680 		spdk_json_write_named_string(w, "name", item->name);
681 		spdk_json_write_object_end(w);
682 		spdk_json_write_object_end(w);
683 	}
684 }
685 
686 struct spdk_bdev *
687 spdk_bdev_first(void)
688 {
689 	struct spdk_bdev *bdev;
690 
691 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
692 	if (bdev) {
693 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
694 	}
695 
696 	return bdev;
697 }
698 
699 struct spdk_bdev *
700 spdk_bdev_next(struct spdk_bdev *prev)
701 {
702 	struct spdk_bdev *bdev;
703 
704 	bdev = TAILQ_NEXT(prev, internal.link);
705 	if (bdev) {
706 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
707 	}
708 
709 	return bdev;
710 }
711 
712 static struct spdk_bdev *
713 _bdev_next_leaf(struct spdk_bdev *bdev)
714 {
715 	while (bdev != NULL) {
716 		if (bdev->internal.claim_module == NULL) {
717 			return bdev;
718 		} else {
719 			bdev = TAILQ_NEXT(bdev, internal.link);
720 		}
721 	}
722 
723 	return bdev;
724 }
725 
726 struct spdk_bdev *
727 spdk_bdev_first_leaf(void)
728 {
729 	struct spdk_bdev *bdev;
730 
731 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
732 
733 	if (bdev) {
734 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
735 	}
736 
737 	return bdev;
738 }
739 
740 struct spdk_bdev *
741 spdk_bdev_next_leaf(struct spdk_bdev *prev)
742 {
743 	struct spdk_bdev *bdev;
744 
745 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
746 
747 	if (bdev) {
748 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
749 	}
750 
751 	return bdev;
752 }
753 
754 static inline bool
755 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
756 {
757 	return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain;
758 }
759 
760 void
761 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
762 {
763 	struct iovec *iovs;
764 
765 	if (bdev_io->u.bdev.iovs == NULL) {
766 		bdev_io->u.bdev.iovs = &bdev_io->iov;
767 		bdev_io->u.bdev.iovcnt = 1;
768 	}
769 
770 	iovs = bdev_io->u.bdev.iovs;
771 
772 	assert(iovs != NULL);
773 	assert(bdev_io->u.bdev.iovcnt >= 1);
774 
775 	iovs[0].iov_base = buf;
776 	iovs[0].iov_len = len;
777 }
778 
779 void
780 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
781 {
782 	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
783 	bdev_io->u.bdev.md_buf = md_buf;
784 }
785 
786 static bool
787 _is_buf_allocated(const struct iovec *iovs)
788 {
789 	if (iovs == NULL) {
790 		return false;
791 	}
792 
793 	return iovs[0].iov_base != NULL;
794 }
795 
796 static bool
797 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
798 {
799 	int i;
800 	uintptr_t iov_base;
801 
802 	if (spdk_likely(alignment == 1)) {
803 		return true;
804 	}
805 
806 	for (i = 0; i < iovcnt; i++) {
807 		iov_base = (uintptr_t)iovs[i].iov_base;
808 		if ((iov_base & (alignment - 1)) != 0) {
809 			return false;
810 		}
811 	}
812 
813 	return true;
814 }
815 
816 static void
817 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
818 {
819 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
820 	void *buf;
821 
822 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
823 		buf = bdev_io->internal.buf;
824 		bdev_io->internal.buf = NULL;
825 		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
826 		bdev_io->internal.get_aux_buf_cb = NULL;
827 	} else {
828 		assert(bdev_io->internal.get_buf_cb != NULL);
829 		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
830 		bdev_io->internal.get_buf_cb = NULL;
831 	}
832 }
833 
834 static void
835 _bdev_io_pull_buffer_cpl(void *ctx, int rc)
836 {
837 	struct spdk_bdev_io *bdev_io = ctx;
838 
839 	if (rc) {
840 		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
841 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
842 	}
843 	bdev_io_get_buf_complete(bdev_io, !rc);
844 }
845 
846 static void
847 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
848 {
849 	int rc = 0;
850 
851 	/* save original md_buf */
852 	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
853 	bdev_io->internal.orig_md_iov.iov_len = len;
854 	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
855 	bdev_io->internal.bounce_md_iov.iov_len = len;
856 	/* set bounce md_buf */
857 	bdev_io->u.bdev.md_buf = md_buf;
858 
859 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
860 		if (bdev_io_use_memory_domain(bdev_io)) {
861 			rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain,
862 							  bdev_io->internal.ext_opts->memory_domain_ctx,
863 							  &bdev_io->internal.orig_md_iov, 1,
864 							  &bdev_io->internal.bounce_md_iov, 1,
865 							  bdev_io->internal.data_transfer_cpl,
866 							  bdev_io);
867 			if (rc == 0) {
868 				/* Continue to submit IO in completion callback */
869 				return;
870 			}
871 			SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
872 				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc);
873 		} else {
874 			memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len);
875 		}
876 	}
877 
878 	assert(bdev_io->internal.data_transfer_cpl);
879 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
880 }
881 
882 static void
883 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
884 {
885 	struct spdk_bdev *bdev = bdev_io->bdev;
886 	uint64_t md_len;
887 	void *buf;
888 
889 	if (spdk_bdev_is_md_separate(bdev)) {
890 		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
891 		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
892 
893 		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
894 
895 		if (bdev_io->u.bdev.md_buf != NULL) {
896 			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
897 			return;
898 		} else {
899 			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
900 		}
901 	}
902 
903 	bdev_io_get_buf_complete(bdev_io, true);
904 }
905 
906 static void
907 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc)
908 {
909 	struct spdk_bdev_io *bdev_io = ctx;
910 
911 	if (rc) {
912 		SPDK_ERRLOG("Failed to get data buffer\n");
913 		assert(bdev_io->internal.data_transfer_cpl);
914 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
915 		return;
916 	}
917 
918 	_bdev_io_set_md_buf(bdev_io);
919 }
920 
921 static void
922 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
923 			      bdev_copy_bounce_buffer_cpl cpl_cb)
924 {
925 	int rc = 0;
926 
927 	bdev_io->internal.data_transfer_cpl = cpl_cb;
928 	/* save original iovec */
929 	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
930 	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
931 	/* set bounce iov */
932 	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
933 	bdev_io->u.bdev.iovcnt = 1;
934 	/* set bounce buffer for this operation */
935 	bdev_io->u.bdev.iovs[0].iov_base = buf;
936 	bdev_io->u.bdev.iovs[0].iov_len = len;
937 	/* if this is write path, copy data from original buffer to bounce buffer */
938 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
939 		if (bdev_io_use_memory_domain(bdev_io)) {
940 			rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain,
941 							  bdev_io->internal.ext_opts->memory_domain_ctx,
942 							  bdev_io->internal.orig_iovs,
943 							  (uint32_t) bdev_io->internal.orig_iovcnt,
944 							  bdev_io->u.bdev.iovs, 1,
945 							  _bdev_io_pull_bounce_data_buf_done,
946 							  bdev_io);
947 			if (rc == 0) {
948 				/* Continue to submit IO in completion callback */
949 				return;
950 			}
951 			SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
952 				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain));
953 		} else {
954 			spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
955 		}
956 	}
957 
958 	_bdev_io_pull_bounce_data_buf_done(bdev_io, rc);
959 }
960 
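/*
 * Summary (added note): the bounce-buffer path is symmetric.  On the write
 * path, data is pulled from the caller's buffers (possibly residing in a
 * remote memory domain) into the bounce buffer above, before submission.  On
 * the read path, data is pushed back from the bounce buffer into the caller's
 * buffers after the I/O completes; see _bdev_io_push_bounce_data_buffer() and
 * _bdev_io_push_bounce_md_buffer() further below.
 */
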
961 static void
962 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
963 {
964 	struct spdk_bdev *bdev = bdev_io->bdev;
965 	bool buf_allocated;
966 	uint64_t alignment;
967 	void *aligned_buf;
968 
969 	bdev_io->internal.buf = buf;
970 
971 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
972 		bdev_io_get_buf_complete(bdev_io, true);
973 		return;
974 	}
975 
976 	alignment = spdk_bdev_get_buf_align(bdev);
977 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
978 	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
979 
980 	if (buf_allocated) {
981 		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
982 		/* Continue in completion callback */
983 		return;
984 	} else {
985 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
986 	}
987 
988 	_bdev_io_set_md_buf(bdev_io);
989 }
990 
991 static inline uint64_t
992 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
993 {
994 	struct spdk_bdev *bdev = bdev_io->bdev;
995 	uint64_t md_len, alignment;
996 
997 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
998 	alignment = spdk_bdev_get_buf_align(bdev);
999 
1000 	return len + alignment + md_len;
1001 }
1002 
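/*
 * Worked example (illustrative): for an 8-block read on a bdev with 512-byte
 * blocks, a 512-byte buffer alignment requirement and 8 bytes of separate
 * metadata per block, the buffer fetched from the iobuf pool must be at least
 * 8 * 512 + 512 + 8 * 8 = 4672 bytes: payload, worst-case alignment padding
 * and the metadata placed after the data (see _bdev_io_set_md_buf()).
 */
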
1003 static void
1004 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1005 {
1006 	struct spdk_bdev_mgmt_channel *ch;
1007 
1008 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1009 	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1010 }
1011 
1012 static void
1013 bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1014 {
1015 	assert(bdev_io->internal.buf != NULL);
1016 	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
1017 	bdev_io->internal.buf = NULL;
1018 }
1019 
1020 void
1021 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1022 {
1023 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1024 
1025 	assert(buf != NULL);
1026 	_bdev_io_put_buf(bdev_io, buf, len);
1027 }
1028 
1029 static void
1030 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1031 {
1032 	struct spdk_bdev *bdev = bdev_ch->bdev;
1033 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1034 	struct spdk_bdev_io *bdev_io;
1035 
1036 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1037 		/*
1038 		 * Allow some more I/O to complete before retrying the nomem_io queue.
1039 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
1040 		 *  the context of a completion, because the resources for the I/O are
1041 		 *  not released until control returns to the bdev poller.  Also, we
1042 		 *  may require several small I/O to complete before a larger I/O
1043 		 *  (that requires splitting) can be submitted.
1044 		 */
1045 		return;
1046 	}
1047 
1048 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1049 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1050 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1051 		bdev_io->internal.ch->io_outstanding++;
1052 		shared_resource->io_outstanding++;
1053 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1054 		bdev_io->internal.error.nvme.cdw0 = 0;
1055 		bdev_io->num_retries++;
1056 		bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1057 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
1058 			break;
1059 		}
1060 	}
1061 }
1062 
1063 static inline void
1064 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1065 			       struct spdk_bdev_shared_resource *shared_resource)
1066 {
1067 	assert(bdev_ch->io_outstanding > 0);
1068 	assert(shared_resource->io_outstanding > 0);
1069 	bdev_ch->io_outstanding--;
1070 	shared_resource->io_outstanding--;
1071 }
1072 
1073 static inline bool
1074 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
1075 {
1076 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1077 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1078 
1079 	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1080 		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
1081 		/*
1082 		 * Wait for some of the outstanding I/O to complete before we
1083 		 *  retry any of the nomem_io.  Normally we will wait for
1084 		 *  NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
1085 		 *  depth channels we will instead wait for half to complete.
1086 		 */
1087 		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
1088 						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
1089 		return true;
1090 	}
1091 
1092 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1093 		bdev_ch_retry_io(bdev_ch);
1094 	}
1095 
1096 	return false;
1097 }
1098 
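/*
 * Worked example (illustrative): with NOMEM_THRESHOLD_COUNT = 8, a channel
 * that hits NOMEM with 100 I/O outstanding sets nomem_threshold to
 * max(100 / 2, 100 - 8) = 92, so retries start once 8 I/O have completed.
 * A shallow channel with only 10 I/O outstanding instead waits for half of
 * them: max(10 / 2, 10 - 8) = 5.
 */
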
1099 static void
1100 _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1101 {
1102 	struct spdk_bdev_io *bdev_io = ctx;
1103 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1104 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1105 
1106 	if (rc) {
1107 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1108 	}
1109 	/* We want to free the bounce buffer here since we know we're done with it (as opposed
1110 	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
1111 	 */
1112 	bdev_io_put_buf(bdev_io);
1113 
1114 	/* Continue with IO completion flow */
1115 	_bdev_io_decrement_outstanding(bdev_ch, shared_resource);
1116 	if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) {
1117 		return;
1118 	}
1119 
1120 	bdev_io_complete(bdev_io);
1121 }
1122 
1123 static inline void
1124 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io)
1125 {
1126 	int rc = 0;
1127 
1128 	/* do the same for metadata buffer */
1129 	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
1130 		assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1131 
1132 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
1133 		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
1134 			if (bdev_io_use_memory_domain(bdev_io)) {
1135 				/* If memory domain is used then we need to call async push function */
1136 				rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain,
1137 								  bdev_io->internal.ext_opts->memory_domain_ctx,
1138 								  &bdev_io->internal.orig_md_iov,
1139 								  (uint32_t)bdev_io->internal.orig_iovcnt,
1140 								  &bdev_io->internal.bounce_md_iov, 1,
1141 								  bdev_io->internal.data_transfer_cpl,
1142 								  bdev_io);
1143 				if (rc == 0) {
1144 					/* Continue IO completion in async callback */
1145 					return;
1146 				}
1147 				SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1148 					    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain));
1149 			} else {
1150 				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1151 				       bdev_io->internal.orig_md_iov.iov_len);
1152 			}
1153 		}
1154 	}
1155 
1156 	assert(bdev_io->internal.data_transfer_cpl);
1157 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1158 }
1159 
1160 static void
1161 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc)
1162 {
1163 	struct spdk_bdev_io *bdev_io = ctx;
1164 
1165 	assert(bdev_io->internal.data_transfer_cpl);
1166 
1167 	if (rc) {
1168 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1169 		return;
1170 	}
1171 
1172 	/* set original buffer for this io */
1173 	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
1174 	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
1175 	/* disable bouncing buffer for this io */
1176 	bdev_io->internal.orig_iovcnt = 0;
1177 	bdev_io->internal.orig_iovs = NULL;
1178 
1179 	_bdev_io_push_bounce_md_buffer(bdev_io);
1180 }
1181 
1182 static inline void
1183 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1184 {
1185 	int rc = 0;
1186 
1187 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1188 
1189 	/* if this is read path, copy data from bounce buffer to original buffer */
1190 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
1191 	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
1192 		if (bdev_io_use_memory_domain(bdev_io)) {
1193 			/* If memory domain is used then we need to call async push function */
1194 			rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain,
1195 							  bdev_io->internal.ext_opts->memory_domain_ctx,
1196 							  bdev_io->internal.orig_iovs,
1197 							  (uint32_t)bdev_io->internal.orig_iovcnt,
1198 							  &bdev_io->internal.bounce_iov, 1,
1199 							  _bdev_io_push_bounce_data_buffer_done,
1200 							  bdev_io);
1201 			if (rc == 0) {
1202 				/* Continue IO completion in async callback */
1203 				return;
1204 			}
1205 			SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1206 				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain));
1207 		} else {
1208 			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
1209 					      bdev_io->internal.orig_iovcnt,
1210 					      bdev_io->internal.bounce_iov.iov_base,
1211 					      bdev_io->internal.bounce_iov.iov_len);
1212 		}
1213 	}
1214 
1215 	_bdev_io_push_bounce_data_buffer_done(bdev_io, rc);
1216 }
1217 
1218 static void
1219 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1220 {
1221 	struct spdk_bdev_io *bdev_io;
1222 
1223 	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1224 	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
1225 }
1226 
1227 static void
1228 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1229 {
1230 	struct spdk_bdev_mgmt_channel *mgmt_ch;
1231 	uint64_t max_len;
1232 	void *buf;
1233 
1234 	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1235 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1236 	max_len = bdev_io_get_max_buf_len(bdev_io, len);
1237 
1238 	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
1239 		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1240 		bdev_io_get_buf_complete(bdev_io, false);
1241 		return;
1242 	}
1243 
1244 	bdev_io->internal.buf_len = len;
1245 	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1246 			     bdev_io_get_iobuf_cb);
1247 	if (buf != NULL) {
1248 		_bdev_io_set_buf(bdev_io, buf, len);
1249 	}
1250 }
1251 
1252 void
1253 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1254 {
1255 	struct spdk_bdev *bdev = bdev_io->bdev;
1256 	uint64_t alignment;
1257 
1258 	assert(cb != NULL);
1259 	bdev_io->internal.get_buf_cb = cb;
1260 
1261 	alignment = spdk_bdev_get_buf_align(bdev);
1262 
1263 	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1264 	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1265 		/* Buffer already present and aligned */
1266 		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1267 		return;
1268 	}
1269 
1270 	bdev_io_get_buf(bdev_io, len);
1271 }
1272 
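/*
 * Usage sketch (illustrative): a bdev module that needs a data buffer for a
 * READ typically defers submission through this helper from its
 * submit_request() callback.  Names below are hypothetical.
 *
 *	static void
 *	my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 *			   bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at an allocated, aligned buffer
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */
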
1273 static void
1274 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1275 			      bool success)
1276 {
1277 	if (!success) {
1278 		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
1279 		bdev_io_complete(bdev_io);
1280 	} else {
1281 		bdev_io_submit(bdev_io);
1282 	}
1283 }
1284 
1285 static void
1286 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1287 			       uint64_t len)
1288 {
1289 	assert(cb != NULL);
1290 	bdev_io->internal.get_buf_cb = cb;
1291 
1292 	bdev_io_get_buf(bdev_io, len);
1293 }
1294 
1295 void
1296 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
1297 {
1298 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1299 
1300 	assert(cb != NULL);
1301 	assert(bdev_io->internal.get_aux_buf_cb == NULL);
1302 	bdev_io->internal.get_aux_buf_cb = cb;
1303 	bdev_io_get_buf(bdev_io, len);
1304 }
1305 
1306 static int
1307 bdev_module_get_max_ctx_size(void)
1308 {
1309 	struct spdk_bdev_module *bdev_module;
1310 	int max_bdev_module_size = 0;
1311 
1312 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1313 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
1314 			max_bdev_module_size = bdev_module->get_ctx_size();
1315 		}
1316 	}
1317 
1318 	return max_bdev_module_size;
1319 }
1320 
1321 static void
1322 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1323 {
1324 	int i;
1325 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1326 	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
1327 
1328 	if (!qos) {
1329 		return;
1330 	}
1331 
1332 	spdk_bdev_get_qos_rate_limits(bdev, limits);
1333 
1334 	spdk_json_write_object_begin(w);
1335 	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
1336 
1337 	spdk_json_write_named_object_begin(w, "params");
1338 	spdk_json_write_named_string(w, "name", bdev->name);
1339 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
1340 		if (limits[i] > 0) {
1341 			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
1342 		}
1343 	}
1344 	spdk_json_write_object_end(w);
1345 
1346 	spdk_json_write_object_end(w);
1347 }
1348 
1349 void
1350 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
1351 {
1352 	struct spdk_bdev_module *bdev_module;
1353 	struct spdk_bdev *bdev;
1354 
1355 	assert(w != NULL);
1356 
1357 	spdk_json_write_array_begin(w);
1358 
1359 	spdk_json_write_object_begin(w);
1360 	spdk_json_write_named_string(w, "method", "bdev_set_options");
1361 	spdk_json_write_named_object_begin(w, "params");
1362 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
1363 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
1364 	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
1365 	spdk_json_write_object_end(w);
1366 	spdk_json_write_object_end(w);
1367 
1368 	bdev_examine_allowlist_config_json(w);
1369 
1370 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1371 		if (bdev_module->config_json) {
1372 			bdev_module->config_json(w);
1373 		}
1374 	}
1375 
1376 	spdk_spin_lock(&g_bdev_mgr.spinlock);
1377 
1378 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
1379 		if (bdev->fn_table->write_config_json) {
1380 			bdev->fn_table->write_config_json(bdev, w);
1381 		}
1382 
1383 		bdev_qos_config_json(bdev, w);
1384 	}
1385 
1386 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
1387 
1388 	/* This has to be the last RPC in the array to make sure all bdevs have finished examine */
1389 	spdk_json_write_object_begin(w);
1390 	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
1391 	spdk_json_write_object_end(w);
1392 
1393 	spdk_json_write_array_end(w);
1394 }
1395 
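/*
 * Illustrative output (added note, values are hypothetical): the generated
 * configuration array looks roughly like
 *
 *	[
 *	  { "method": "bdev_set_options",
 *	    "params": { "bdev_io_pool_size": 65535, "bdev_io_cache_size": 256,
 *	                "bdev_auto_examine": true } },
 *	  ... per-module and per-bdev entries, plus bdev_set_qos_limit ...
 *	  { "method": "bdev_wait_for_examine" }
 *	]
 *
 * with bdev_wait_for_examine intentionally last, as noted above.
 */
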
1396 static void
1397 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
1398 {
1399 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1400 	struct spdk_bdev_io *bdev_io;
1401 
1402 	spdk_iobuf_channel_fini(&ch->iobuf);
1403 
1404 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
1405 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
1406 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
1407 		ch->per_thread_cache_count--;
1408 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1409 	}
1410 
1411 	assert(ch->per_thread_cache_count == 0);
1412 }
1413 
1414 static int
1415 bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
1416 {
1417 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1418 	struct spdk_bdev_io *bdev_io;
1419 	uint32_t i;
1420 	int rc;
1421 
1422 	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
1423 	if (rc != 0) {
1424 		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
1425 		return -1;
1426 	}
1427 
1428 	STAILQ_INIT(&ch->per_thread_cache);
1429 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
1430 
1431 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
1432 	ch->per_thread_cache_count = 0;
1433 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
1434 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
1435 		if (bdev_io == NULL) {
1436 			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
1437 			assert(false);
1438 			bdev_mgmt_channel_destroy(io_device, ctx_buf);
1439 			return -1;
1440 		}
1441 		ch->per_thread_cache_count++;
1442 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
1443 	}
1444 
1445 	TAILQ_INIT(&ch->shared_resources);
1446 	TAILQ_INIT(&ch->io_wait_queue);
1447 
1448 	return 0;
1449 }
1450 
1451 static void
1452 bdev_init_complete(int rc)
1453 {
1454 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
1455 	void *cb_arg = g_init_cb_arg;
1456 	struct spdk_bdev_module *m;
1457 
1458 	g_bdev_mgr.init_complete = true;
1459 	g_init_cb_fn = NULL;
1460 	g_init_cb_arg = NULL;
1461 
1462 	/*
1463 	 * For modules that need to know when subsystem init is complete,
1464 	 * inform them now.
1465 	 */
1466 	if (rc == 0) {
1467 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1468 			if (m->init_complete) {
1469 				m->init_complete();
1470 			}
1471 		}
1472 	}
1473 
1474 	cb_fn(cb_arg, rc);
1475 }
1476 
1477 static bool
1478 bdev_module_all_actions_completed(void)
1479 {
1480 	struct spdk_bdev_module *m;
1481 
1482 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1483 		if (m->internal.action_in_progress > 0) {
1484 			return false;
1485 		}
1486 	}
1487 	return true;
1488 }
1489 
1490 static void
1491 bdev_module_action_complete(void)
1492 {
1493 	/*
1494 	 * Don't finish bdev subsystem initialization if
1495 	 * module pre-initialization is still in progress, or
1496 	 * the subsystem has already been initialized.
1497 	 */
1498 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
1499 		return;
1500 	}
1501 
1502 	/*
1503 	 * Check all bdev modules for inits/examinations in progress. If any
1504 	 * exist, return immediately since we cannot finish bdev subsystem
1505 	 * initialization until all are completed.
1506 	 */
1507 	if (!bdev_module_all_actions_completed()) {
1508 		return;
1509 	}
1510 
1511 	/*
1512 	 * Modules already finished initialization - now that all
1513 	 * the bdev modules have finished their asynchronous I/O
1514 	 * processing, the entire bdev layer can be marked as complete.
1515 	 */
1516 	bdev_init_complete(0);
1517 }
1518 
1519 static void
1520 bdev_module_action_done(struct spdk_bdev_module *module)
1521 {
1522 	assert(module->internal.action_in_progress > 0);
1523 	module->internal.action_in_progress--;
1524 	bdev_module_action_complete();
1525 }
1526 
1527 void
1528 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
1529 {
1530 	bdev_module_action_done(module);
1531 }
1532 
1533 void
1534 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
1535 {
1536 	bdev_module_action_done(module);
1537 }
1538 
1539 /** The last initialized bdev module */
1540 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
1541 
1542 static void
1543 bdev_init_failed(void *cb_arg)
1544 {
1545 	struct spdk_bdev_module *module = cb_arg;
1546 
1547 	module->internal.action_in_progress--;
1548 	bdev_init_complete(-1);
1549 }
1550 
1551 static int
1552 bdev_modules_init(void)
1553 {
1554 	struct spdk_bdev_module *module;
1555 	int rc = 0;
1556 
1557 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1558 		g_resume_bdev_module = module;
1559 		if (module->async_init) {
1560 			module->internal.action_in_progress = 1;
1561 		}
1562 		rc = module->module_init();
1563 		if (rc != 0) {
1564 			/* Bump action_in_progress to prevent other modules from completing modules_init.
1565 			 * Send a message to defer application shutdown until resources are cleaned up. */
1566 			module->internal.action_in_progress = 1;
1567 			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
1568 			return rc;
1569 		}
1570 	}
1571 
1572 	g_resume_bdev_module = NULL;
1573 	return 0;
1574 }
1575 
1576 void
1577 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
1578 {
1579 	int rc = 0;
1580 	char mempool_name[32];
1581 
1582 	assert(cb_fn != NULL);
1583 
1584 	g_init_cb_fn = cb_fn;
1585 	g_init_cb_arg = cb_arg;
1586 
1587 	spdk_notify_type_register("bdev_register");
1588 	spdk_notify_type_register("bdev_unregister");
1589 
1590 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
1591 
1592 	rc = spdk_iobuf_register_module("bdev");
1593 	if (rc != 0) {
1594 		SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
1595 		bdev_init_complete(-1);
1596 		return;
1597 	}
1598 
1599 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
1600 				  g_bdev_opts.bdev_io_pool_size,
1601 				  sizeof(struct spdk_bdev_io) +
1602 				  bdev_module_get_max_ctx_size(),
1603 				  0,
1604 				  SPDK_ENV_SOCKET_ID_ANY);
1605 
1606 	if (g_bdev_mgr.bdev_io_pool == NULL) {
1607 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
1608 		bdev_init_complete(-1);
1609 		return;
1610 	}
1611 
1612 	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
1613 					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
1614 	if (!g_bdev_mgr.zero_buffer) {
1615 		SPDK_ERRLOG("create bdev zero buffer failed\n");
1616 		bdev_init_complete(-1);
1617 		return;
1618 	}
1619 
1620 #ifdef SPDK_CONFIG_VTUNE
1621 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
1622 #endif
1623 
1624 	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
1625 				bdev_mgmt_channel_destroy,
1626 				sizeof(struct spdk_bdev_mgmt_channel),
1627 				"bdev_mgr");
1628 
1629 	rc = bdev_modules_init();
1630 	g_bdev_mgr.module_init_complete = true;
1631 	if (rc != 0) {
1632 		SPDK_ERRLOG("bdev modules init failed\n");
1633 		return;
1634 	}
1635 
1636 	bdev_module_action_complete();
1637 }
1638 
1639 static void
1640 bdev_mgr_unregister_cb(void *io_device)
1641 {
1642 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
1643 
1644 	if (g_bdev_mgr.bdev_io_pool) {
1645 		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
1646 			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
1647 				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
1648 				    g_bdev_opts.bdev_io_pool_size);
1649 		}
1650 
1651 		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
1652 	}
1653 
1654 	spdk_free(g_bdev_mgr.zero_buffer);
1655 
1656 	bdev_examine_allowlist_free();
1657 
1658 	cb_fn(g_fini_cb_arg);
1659 	g_fini_cb_fn = NULL;
1660 	g_fini_cb_arg = NULL;
1661 	g_bdev_mgr.init_complete = false;
1662 	g_bdev_mgr.module_init_complete = false;
1663 }
1664 
1665 static void
1666 bdev_module_fini_iter(void *arg)
1667 {
1668 	struct spdk_bdev_module *bdev_module;
1669 
1670 	/* FIXME: Handling initialization failures is broken now,
1671 	 * so we won't even try cleaning up after successfully
1672 	 * initialized modules. If module_init_complete is false,
1673 	 * just call bdev_mgr_unregister_cb().
1674 	 */
1675 	if (!g_bdev_mgr.module_init_complete) {
1676 		bdev_mgr_unregister_cb(NULL);
1677 		return;
1678 	}
1679 
1680 	/* Start iterating from the last touched module */
1681 	if (!g_resume_bdev_module) {
1682 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
1683 	} else {
1684 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
1685 					 internal.tailq);
1686 	}
1687 
1688 	while (bdev_module) {
1689 		if (bdev_module->async_fini) {
1690 			/* Save our place so we can resume later. We must
1691 			 * save the variable here, before calling module_fini()
1692 			 * below, because in some cases the module may immediately
1693 			 * call spdk_bdev_module_fini_done() and re-enter
1694 			 * this function to continue iterating. */
1695 			g_resume_bdev_module = bdev_module;
1696 		}
1697 
1698 		if (bdev_module->module_fini) {
1699 			bdev_module->module_fini();
1700 		}
1701 
1702 		if (bdev_module->async_fini) {
1703 			return;
1704 		}
1705 
1706 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
1707 					 internal.tailq);
1708 	}
1709 
1710 	g_resume_bdev_module = NULL;
1711 	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
1712 }
1713 
1714 void
1715 spdk_bdev_module_fini_done(void)
1716 {
1717 	if (spdk_get_thread() != g_fini_thread) {
1718 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
1719 	} else {
1720 		bdev_module_fini_iter(NULL);
1721 	}
1722 }
1723 
1724 static void
1725 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
1726 {
1727 	struct spdk_bdev *bdev = cb_arg;
1728 
1729 	if (bdeverrno && bdev) {
1730 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
1731 			     bdev->name);
1732 
1733 		/*
1734 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
1735 		 *  bdev; try to recover by manually removing this bdev from the list and continuing
1736 		 *  with the next bdev in the list.
1737 		 */
1738 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
1739 	}
1740 
1741 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
1742 		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
1743 		/*
1744 		 * Bdev module finish needs to be deferred, as we might be in the middle of some context
1745 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
1746 		 * after returning.
1747 		 */
1748 		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
1749 		return;
1750 	}
1751 
1752 	/*
1753 	 * Unregister the last unclaimed bdev in the list to ensure that bdev subsystem
1754 	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
1755 	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
1756 	 * base bdevs.
1757 	 *
1758 	 * Also, walk the list in the reverse order.
1759 	 */
1760 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1761 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1762 		if (bdev->internal.claim_module != NULL) {
1763 			SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n",
1764 				      bdev->name, bdev->internal.claim_module->name);
1765 			continue;
1766 		}
1767 
1768 		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
1769 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1770 		return;
1771 	}
1772 
1773 	/*
1774 	 * If any bdev fails to unclaim its underlying bdev properly, we may be left
1775 	 * with a bdev list consisting of claimed bdevs only (if claims are managed
1776 	 * correctly, this would mean there's a loop in the claims graph, which is
1777 	 * clearly impossible). In that case, warn and unregister the last bdev on the list.
1778 	 */
1779 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1780 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1781 		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
1782 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1783 		return;
1784 	}
1785 }
1786 
1787 static void
1788 bdev_module_fini_start_iter(void *arg)
1789 {
1790 	struct spdk_bdev_module *bdev_module;
1791 
1792 	if (!g_resume_bdev_module) {
1793 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
1794 	} else {
1795 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
1796 	}
1797 
1798 	while (bdev_module) {
1799 		if (bdev_module->async_fini_start) {
1800 			/* Save our place so we can resume later. We must
1801 			 * save the variable here, before calling fini_start()
1802 			 * below, because in some cases the module may immediately
1803 			 * call spdk_bdev_module_fini_start_done() and re-enter
1804 			 * this function to continue iterating. */
1805 			g_resume_bdev_module = bdev_module;
1806 		}
1807 
1808 		if (bdev_module->fini_start) {
1809 			bdev_module->fini_start();
1810 		}
1811 
1812 		if (bdev_module->async_fini_start) {
1813 			return;
1814 		}
1815 
1816 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
1817 	}
1818 
1819 	g_resume_bdev_module = NULL;
1820 
1821 	bdev_finish_unregister_bdevs_iter(NULL, 0);
1822 }
1823 
1824 void
1825 spdk_bdev_module_fini_start_done(void)
1826 {
1827 	if (spdk_get_thread() != g_fini_thread) {
1828 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
1829 	} else {
1830 		bdev_module_fini_start_iter(NULL);
1831 	}
1832 }
1833 
1834 static void
1835 bdev_finish_wait_for_examine_done(void *cb_arg)
1836 {
1837 	bdev_module_fini_start_iter(NULL);
1838 }
1839 
1840 void
1841 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
1842 {
1843 	int rc;
1844 
1845 	assert(cb_fn != NULL);
1846 
1847 	g_fini_thread = spdk_get_thread();
1848 
1849 	g_fini_cb_fn = cb_fn;
1850 	g_fini_cb_arg = cb_arg;
1851 
1852 	rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
1853 	if (rc != 0) {
1854 		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
1855 		bdev_finish_wait_for_examine_done(NULL);
1856 	}
1857 }
1858 
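/*
 * Get a bdev_io for submission on the given channel. The channel's per-thread cache
 * is preferred; the global bdev_io pool is used only if no other callers are already
 * queued on the channel's io_wait_queue. May return NULL, in which case callers
 * typically retry via spdk_bdev_queue_io_wait().
 */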
1859 struct spdk_bdev_io *
1860 bdev_channel_get_io(struct spdk_bdev_channel *channel)
1861 {
1862 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
1863 	struct spdk_bdev_io *bdev_io;
1864 
1865 	if (ch->per_thread_cache_count > 0) {
1866 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
1867 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
1868 		ch->per_thread_cache_count--;
1869 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
1870 		/*
1871 		 * Don't try to look for bdev_ios in the global pool if there are
1872 		 * waiters on bdev_ios - we don't want this caller to jump the line.
1873 		 */
1874 		bdev_io = NULL;
1875 	} else {
1876 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
1877 	}
1878 
1879 	return bdev_io;
1880 }
1881 
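/*
 * Return a completed bdev_io to the per-thread cache, or to the global pool if the
 * cache is already full. While entries are available in the cache, drain the
 * channel's io_wait_queue so that callers waiting for a bdev_io get their callbacks
 * invoked.
 */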
1882 void
1883 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
1884 {
1885 	struct spdk_bdev_mgmt_channel *ch;
1886 
1887 	assert(bdev_io != NULL);
1888 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
1889 
1890 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1891 
1892 	if (bdev_io->internal.buf != NULL) {
1893 		bdev_io_put_buf(bdev_io);
1894 	}
1895 
1896 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
1897 		ch->per_thread_cache_count++;
1898 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
1899 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
1900 			struct spdk_bdev_io_wait_entry *entry;
1901 
1902 			entry = TAILQ_FIRST(&ch->io_wait_queue);
1903 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
1904 			entry->cb_fn(entry->cb_arg);
1905 		}
1906 	} else {
1907 		/* We should never have a full cache with entries on the io wait queue. */
1908 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
1909 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1910 	}
1911 }
1912 
1913 static bool
1914 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
1915 {
1916 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
1917 
1918 	switch (limit) {
1919 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
1920 		return true;
1921 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
1922 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
1923 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
1924 		return false;
1925 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
1926 	default:
1927 		return false;
1928 	}
1929 }
1930 
1931 static bool
1932 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
1933 {
1934 	switch (bdev_io->type) {
1935 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1936 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1937 	case SPDK_BDEV_IO_TYPE_READ:
1938 	case SPDK_BDEV_IO_TYPE_WRITE:
1939 		return true;
1940 	case SPDK_BDEV_IO_TYPE_ZCOPY:
1941 		if (bdev_io->u.bdev.zcopy.start) {
1942 			return true;
1943 		} else {
1944 			return false;
1945 		}
1946 	default:
1947 		return false;
1948 	}
1949 }
1950 
1951 static bool
1952 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
1953 {
1954 	switch (bdev_io->type) {
1955 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1956 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1957 		/* Bit 1 (0x02) of the opcode is set for read operations */
1958 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
1959 			return true;
1960 		} else {
1961 			return false;
1962 		}
1963 	case SPDK_BDEV_IO_TYPE_READ:
1964 		return true;
1965 	case SPDK_BDEV_IO_TYPE_ZCOPY:
1966 		/* A populate operation reads data from disk into the zcopy buffers */
1967 		if (bdev_io->u.bdev.zcopy.populate) {
1968 			return true;
1969 		} else {
1970 			return false;
1971 		}
1972 	default:
1973 		return false;
1974 	}
1975 }
1976 
1977 static uint64_t
1978 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
1979 {
1980 	struct spdk_bdev	*bdev = bdev_io->bdev;
1981 
1982 	switch (bdev_io->type) {
1983 	case SPDK_BDEV_IO_TYPE_NVME_IO:
1984 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
1985 		return bdev_io->u.nvme_passthru.nbytes;
1986 	case SPDK_BDEV_IO_TYPE_READ:
1987 	case SPDK_BDEV_IO_TYPE_WRITE:
1988 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
1989 	case SPDK_BDEV_IO_TYPE_ZCOPY:
1990 		/* Track the data in the start phase only */
1991 		if (bdev_io->u.bdev.zcopy.start) {
1992 			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
1993 		} else {
1994 			return 0;
1995 		}
1996 	default:
1997 		return 0;
1998 	}
1999 }
2000 
2001 static bool
2002 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2003 {
2004 	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
2005 		return true;
2006 	} else {
2007 		return false;
2008 	}
2009 }
2010 
2011 static bool
2012 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2013 {
2014 	if (bdev_is_read_io(io) == false) {
2015 		return false;
2016 	}
2017 
2018 	return bdev_qos_rw_queue_io(limit, io);
2019 }
2020 
2021 static bool
2022 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2023 {
2024 	if (bdev_is_read_io(io) == true) {
2025 		return false;
2026 	}
2027 
2028 	return bdev_qos_rw_queue_io(limit, io);
2029 }
2030 
2031 static void
2032 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2033 {
2034 	limit->remaining_this_timeslice--;
2035 }
2036 
2037 static void
2038 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2039 {
2040 	limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
2041 }
2042 
2043 static void
2044 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2045 {
2046 	if (bdev_is_read_io(io) == false) {
2047 		return;
2048 	}
2049 
2050 	bdev_qos_rw_bps_update_quota(limit, io);
2051 }
2052 
2053 static void
2054 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2055 {
2056 	if (bdev_is_read_io(io) == true) {
2057 		return;
2058 	}
2059 
2060 	bdev_qos_rw_bps_update_quota(limit, io);
2061 }
2062 
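/*
 * Install the queue_io/update_quota callbacks matching each configured rate limit
 * type. Limits left at SPDK_BDEV_QOS_LIMIT_NOT_DEFINED get NULL callbacks and are
 * skipped by the QoS submission path.
 */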
2063 static void
2064 bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2065 {
2066 	int i;
2067 
2068 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2069 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2070 			qos->rate_limits[i].queue_io = NULL;
2071 			qos->rate_limits[i].update_quota = NULL;
2072 			continue;
2073 		}
2074 
2075 		switch (i) {
2076 		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2077 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2078 			qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
2079 			break;
2080 		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2081 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2082 			qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
2083 			break;
2084 		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2085 			qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
2086 			qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
2087 			break;
2088 		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2089 			qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
2090 			qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
2091 			break;
2092 		default:
2093 			break;
2094 		}
2095 	}
2096 }
2097 
2098 static void
2099 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2100 			    struct spdk_bdev_io *bdev_io,
2101 			    enum spdk_bdev_io_status status)
2102 {
2103 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2104 
2105 	bdev_io->internal.in_submit_request = true;
2106 	bdev_ch->io_outstanding++;
2107 	shared_resource->io_outstanding++;
2108 	spdk_bdev_io_complete(bdev_io, status);
2109 	bdev_io->internal.in_submit_request = false;
2110 }
2111 
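/*
 * Final submission step on the owning channel: complete aborts whose target I/O is
 * still queued in this layer, reject writes smaller than the write unit when
 * split_on_write_unit is set, and either call the module's submit_request() or park
 * the I/O on the shared nomem_io queue if earlier submissions are already waiting
 * for resources.
 */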
2112 static inline void
2113 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2114 {
2115 	struct spdk_bdev *bdev = bdev_io->bdev;
2116 	struct spdk_io_channel *ch = bdev_ch->channel;
2117 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2118 
2119 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2120 		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2121 		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2122 
2123 		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2124 		    bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2125 			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
2126 						    SPDK_BDEV_IO_STATUS_SUCCESS);
2127 			return;
2128 		}
2129 	}
2130 
2131 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2132 			  bdev_io->bdev->split_on_write_unit &&
2133 			  bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2134 		SPDK_ERRLOG("IO num_blocks %" PRIu64 " is smaller than the write_unit_size %u\n",
2135 			    bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2136 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2137 		return;
2138 	}
2139 
2140 	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2141 		bdev_ch->io_outstanding++;
2142 		shared_resource->io_outstanding++;
2143 		bdev_io->internal.in_submit_request = true;
2144 		bdev->fn_table->submit_request(ch, bdev_io);
2145 		bdev_io->internal.in_submit_request = false;
2146 	} else {
2147 		TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
2148 	}
2149 }
2150 
2151 static bool
2152 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2153 {
2154 	int i;
2155 
2156 	if (bdev_qos_io_to_limit(bdev_io) == true) {
2157 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2158 			if (!qos->rate_limits[i].queue_io) {
2159 				continue;
2160 			}
2161 
2162 			if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2163 							 bdev_io) == true) {
2164 				return true;
2165 			}
2166 		}
2167 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2168 			if (!qos->rate_limits[i].update_quota) {
2169 				continue;
2170 			}
2171 
2172 			qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
2173 		}
2174 	}
2175 
2176 	return false;
2177 }
2178 
2179 static inline void
2180 _bdev_io_do_submit(void *ctx)
2181 {
2182 	struct spdk_bdev_io *bdev_io = ctx;
2183 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2184 
2185 	bdev_io_do_submit(ch, bdev_io);
2186 }
2187 
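/*
 * Walk the QoS queue and submit every I/O that still fits within the current
 * timeslice quotas, charging each accepted I/O against the relevant limits. I/O that
 * were redirected to the QoS thread are sent back to their original thread for the
 * actual submission. Returns the number of I/O submitted during this pass.
 */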
2188 static int
2189 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
2190 {
2191 	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
2192 	int				submitted_ios = 0;
2193 
2194 	TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
2195 		if (!bdev_qos_queue_io(qos, bdev_io)) {
2196 			TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
2197 
2198 			if (bdev_io->internal.io_submit_ch) {
2199 				/* Send back the IO to the original thread for the actual processing. */
2200 				bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
2201 				bdev_io->internal.io_submit_ch = NULL;
2202 				spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
2203 						     _bdev_io_do_submit, bdev_io);
2204 			} else {
2205 				bdev_io_do_submit(ch, bdev_io);
2206 			}
2207 
2208 			submitted_ios++;
2209 		}
2210 	}
2211 
2212 	return submitted_ios;
2213 }
2214 
2215 static void
2216 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
2217 {
2218 	int rc;
2219 
2220 	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
2221 	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
2222 	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
2223 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
2224 				     &bdev_io->internal.waitq_entry);
2225 	if (rc != 0) {
2226 		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
2227 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2228 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2229 	}
2230 }
2231 
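/*
 * A read/write must be split if it crosses an I/O boundary (the write unit size for
 * writes when split_on_write_unit is set, otherwise the optimal I/O boundary), if it
 * carries more iovecs than max_num_segments, or if any single iovec exceeds
 * max_segment_size.
 */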
2232 static bool
2233 bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
2234 {
2235 	uint32_t io_boundary;
2236 	struct spdk_bdev *bdev = bdev_io->bdev;
2237 	uint32_t max_size = bdev->max_segment_size;
2238 	int max_segs = bdev->max_num_segments;
2239 
2240 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
2241 		io_boundary = bdev->write_unit_size;
2242 	} else if (bdev->split_on_optimal_io_boundary) {
2243 		io_boundary = bdev->optimal_io_boundary;
2244 	} else {
2245 		io_boundary = 0;
2246 	}
2247 
2248 	if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
2249 		return false;
2250 	}
2251 
2252 	if (io_boundary) {
2253 		uint64_t start_stripe, end_stripe;
2254 
2255 		start_stripe = bdev_io->u.bdev.offset_blocks;
2256 		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
2257 		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
2258 		if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
2259 			start_stripe >>= spdk_u32log2(io_boundary);
2260 			end_stripe >>= spdk_u32log2(io_boundary);
2261 		} else {
2262 			start_stripe /= io_boundary;
2263 			end_stripe /= io_boundary;
2264 		}
2265 
2266 		if (start_stripe != end_stripe) {
2267 			return true;
2268 		}
2269 	}
2270 
2271 	if (max_segs) {
2272 		if (bdev_io->u.bdev.iovcnt > max_segs) {
2273 			return true;
2274 		}
2275 	}
2276 
2277 	if (max_size) {
2278 		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
2279 			if (bdev_io->u.bdev.iovs[i].iov_len > max_size) {
2280 				return true;
2281 			}
2282 		}
2283 	}
2284 
2285 	return false;
2286 }
2287 
2288 static bool
2289 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
2290 {
2291 	uint32_t num_unmap_segments;
2292 
2293 	if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
2294 		return false;
2295 	}
2296 	num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
2297 	if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
2298 		return true;
2299 	}
2300 
2301 	return false;
2302 }
2303 
2304 static bool
2305 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
2306 {
2307 	if (!bdev_io->bdev->max_write_zeroes) {
2308 		return false;
2309 	}
2310 
2311 	if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
2312 		return true;
2313 	}
2314 
2315 	return false;
2316 }
2317 
2318 static bool
2319 bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
2320 {
2321 	if (bdev_io->bdev->max_copy != 0 &&
2322 	    bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
2323 		return true;
2324 	}
2325 
2326 	return false;
2327 }
2328 
2329 static bool
2330 bdev_io_should_split(struct spdk_bdev_io *bdev_io)
2331 {
2332 	switch (bdev_io->type) {
2333 	case SPDK_BDEV_IO_TYPE_READ:
2334 	case SPDK_BDEV_IO_TYPE_WRITE:
2335 		return bdev_rw_should_split(bdev_io);
2336 	case SPDK_BDEV_IO_TYPE_UNMAP:
2337 		return bdev_unmap_should_split(bdev_io);
2338 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2339 		return bdev_write_zeroes_should_split(bdev_io);
2340 	case SPDK_BDEV_IO_TYPE_COPY:
2341 		return bdev_copy_should_split(bdev_io);
2342 	default:
2343 		return false;
2344 	}
2345 }
2346 
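/* Number of blocks from offset up to the next multiple of boundary,
 * e.g. offset 5 with boundary 8 yields 3.
 */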
2347 static uint32_t
2348 _to_next_boundary(uint64_t offset, uint32_t boundary)
2349 {
2350 	return (boundary - (offset % boundary));
2351 }
2352 
2353 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
2354 
2355 static void _bdev_rw_split(void *_bdev_io);
2356 
2357 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
2358 
2359 static void
2360 _bdev_unmap_split(void *_bdev_io)
2361 {
2362 	bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
2363 }
2364 
2365 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
2366 
2367 static void
2368 _bdev_write_zeroes_split(void *_bdev_io)
2369 {
2370 	bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
2371 }
2372 
2373 static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
2374 
2375 static void
2376 _bdev_copy_split(void *_bdev_io)
2377 {
2378 	bdev_copy_split((struct spdk_bdev_io *)_bdev_io);
2379 }
2380 
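/*
 * Submit one child I/O covering num_blocks starting at *offset and advance the
 * parent's split bookkeeping on success. On -ENOMEM with no children in flight,
 * queue a retry via the io_wait mechanism; on any other error, mark the parent
 * failed and complete it once no children remain outstanding.
 */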
2381 static int
2382 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
2383 		     uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
2384 {
2385 	int rc;
2386 	uint64_t current_offset, current_remaining, current_src_offset;
2387 	spdk_bdev_io_wait_cb io_wait_fn;
2388 
2389 	current_offset = *offset;
2390 	current_remaining = *remaining;
2391 
2392 	bdev_io->u.bdev.split_outstanding++;
2393 
2394 	io_wait_fn = _bdev_rw_split;
2395 	switch (bdev_io->type) {
2396 	case SPDK_BDEV_IO_TYPE_READ:
2397 		rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
2398 					       spdk_io_channel_from_ctx(bdev_io->internal.ch),
2399 					       iov, iovcnt, md_buf, current_offset,
2400 					       num_blocks,
2401 					       bdev_io_split_done, bdev_io,
2402 					       bdev_io->internal.ext_opts, true);
2403 		break;
2404 	case SPDK_BDEV_IO_TYPE_WRITE:
2405 		rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
2406 						spdk_io_channel_from_ctx(bdev_io->internal.ch),
2407 						iov, iovcnt, md_buf, current_offset,
2408 						num_blocks,
2409 						bdev_io_split_done, bdev_io,
2410 						bdev_io->internal.ext_opts, true);
2411 		break;
2412 	case SPDK_BDEV_IO_TYPE_UNMAP:
2413 		io_wait_fn = _bdev_unmap_split;
2414 		rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
2415 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
2416 					    current_offset, num_blocks,
2417 					    bdev_io_split_done, bdev_io);
2418 		break;
2419 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2420 		io_wait_fn = _bdev_write_zeroes_split;
2421 		rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
2422 						   spdk_io_channel_from_ctx(bdev_io->internal.ch),
2423 						   current_offset, num_blocks,
2424 						   bdev_io_split_done, bdev_io);
2425 		break;
2426 	case SPDK_BDEV_IO_TYPE_COPY:
2427 		io_wait_fn = _bdev_copy_split;
2428 		current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
2429 				     (current_offset - bdev_io->u.bdev.offset_blocks);
2430 		rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
2431 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
2432 					   current_offset, current_src_offset, num_blocks,
2433 					   bdev_io_split_done, bdev_io);
2434 		break;
2435 	default:
2436 		assert(false);
2437 		rc = -EINVAL;
2438 		break;
2439 	}
2440 
2441 	if (rc == 0) {
2442 		current_offset += num_blocks;
2443 		current_remaining -= num_blocks;
2444 		bdev_io->u.bdev.split_current_offset_blocks = current_offset;
2445 		bdev_io->u.bdev.split_remaining_num_blocks = current_remaining;
2446 		*offset = current_offset;
2447 		*remaining = current_remaining;
2448 	} else {
2449 		bdev_io->u.bdev.split_outstanding--;
2450 		if (rc == -ENOMEM) {
2451 			if (bdev_io->u.bdev.split_outstanding == 0) {
2452 				/* No child I/O is outstanding, so queue the parent to wait and resume splitting later. */
2453 				bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
2454 			}
2455 		} else {
2456 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2457 			if (bdev_io->u.bdev.split_outstanding == 0) {
2458 				spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2459 				TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2460 				bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2461 			}
2462 		}
2463 	}
2464 
2465 	return rc;
2466 }
2467 
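/*
 * Build and submit child read/write I/O for the parent, packing the parent's iovecs
 * into bdev_io->child_iov while honoring the I/O boundary, max_segment_size and
 * max_num_segments constraints. A child that cannot be filled to a block boundary is
 * trimmed back to one, and splitting stops early if a child submission fails or has
 * to wait for resources.
 */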
2468 static void
2469 _bdev_rw_split(void *_bdev_io)
2470 {
2471 	struct iovec *parent_iov, *iov;
2472 	struct spdk_bdev_io *bdev_io = _bdev_io;
2473 	struct spdk_bdev *bdev = bdev_io->bdev;
2474 	uint64_t parent_offset, current_offset, remaining;
2475 	uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
2476 	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
2477 	uint32_t iovcnt, iov_len, child_iovsize;
2478 	uint32_t blocklen = bdev->blocklen;
2479 	uint32_t io_boundary;
2480 	uint32_t max_segment_size = bdev->max_segment_size;
2481 	uint32_t max_child_iovcnt = bdev->max_num_segments;
2482 	void *md_buf = NULL;
2483 	int rc;
2484 
2485 	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
2486 	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
2487 			   SPDK_BDEV_IO_NUM_CHILD_IOV;
2488 
2489 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
2490 		io_boundary = bdev->write_unit_size;
2491 	} else if (bdev->split_on_optimal_io_boundary) {
2492 		io_boundary = bdev->optimal_io_boundary;
2493 	} else {
2494 		io_boundary = UINT32_MAX;
2495 	}
2496 
2497 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2498 	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
2499 	parent_offset = bdev_io->u.bdev.offset_blocks;
2500 	parent_iov_offset = (current_offset - parent_offset) * blocklen;
2501 	parent_iovcnt = bdev_io->u.bdev.iovcnt;
2502 
2503 	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
2504 		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
2505 		if (parent_iov_offset < parent_iov->iov_len) {
2506 			break;
2507 		}
2508 		parent_iov_offset -= parent_iov->iov_len;
2509 	}
2510 
2511 	child_iovcnt = 0;
2512 	while (remaining > 0 && parent_iovpos < parent_iovcnt &&
2513 	       child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
2514 		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
2515 		to_next_boundary = spdk_min(remaining, to_next_boundary);
2516 		to_next_boundary_bytes = to_next_boundary * blocklen;
2517 
2518 		iov = &bdev_io->child_iov[child_iovcnt];
2519 		iovcnt = 0;
2520 
2521 		if (bdev_io->u.bdev.md_buf) {
2522 			md_buf = (char *)bdev_io->u.bdev.md_buf +
2523 				 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
2524 		}
2525 
2526 		child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
2527 		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
2528 		       iovcnt < child_iovsize) {
2529 			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
2530 			iov_len = parent_iov->iov_len - parent_iov_offset;
2531 
2532 			iov_len = spdk_min(iov_len, max_segment_size);
2533 			iov_len = spdk_min(iov_len, to_next_boundary_bytes);
2534 			to_next_boundary_bytes -= iov_len;
2535 
2536 			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
2537 			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
2538 
2539 			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
2540 				parent_iov_offset += iov_len;
2541 			} else {
2542 				parent_iovpos++;
2543 				parent_iov_offset = 0;
2544 			}
2545 			child_iovcnt++;
2546 			iovcnt++;
2547 		}
2548 
2549 		if (to_next_boundary_bytes > 0) {
2550 			/* We had to stop this child I/O early because we ran out of
2551 			 * child_iov space or were limited by max_num_segments.
2552 			 * Ensure the iovs are aligned to the block size and
2553 			 * then adjust to_next_boundary before starting the
2554 			 * child I/O.
2555 			 */
2556 			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
2557 			       iovcnt == child_iovsize);
2558 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
2559 			if (to_last_block_bytes != 0) {
2560 				uint32_t child_iovpos = child_iovcnt - 1;
2561 				/* Don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
2562 				 * so the outer loop will naturally end.
2563 				 */
2564 
2565 				to_last_block_bytes = blocklen - to_last_block_bytes;
2566 				to_next_boundary_bytes += to_last_block_bytes;
2567 				while (to_last_block_bytes > 0 && iovcnt > 0) {
2568 					iov_len = spdk_min(to_last_block_bytes,
2569 							   bdev_io->child_iov[child_iovpos].iov_len);
2570 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
2571 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
2572 						child_iovpos--;
2573 						if (--iovcnt == 0) {
2574 							/* If the child I/O is smaller than a block size, just return.
2575 							 * If the first child I/O of any split round is smaller than
2576 							 * a block size, fail the parent with an error and exit.
2577 							 */
2578 							if (bdev_io->u.bdev.split_outstanding == 0) {
2579 								SPDK_ERRLOG("The first child io was less than a block size\n");
2580 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2581 								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2582 								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2583 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2584 							}
2585 
2586 							return;
2587 						}
2588 					}
2589 
2590 					to_last_block_bytes -= iov_len;
2591 
2592 					if (parent_iov_offset == 0) {
2593 						parent_iovpos--;
2594 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
2595 					}
2596 					parent_iov_offset -= iov_len;
2597 				}
2598 
2599 				assert(to_last_block_bytes == 0);
2600 			}
2601 			to_next_boundary -= to_next_boundary_bytes / blocklen;
2602 		}
2603 
2604 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
2605 					  &current_offset, &remaining);
2606 		if (spdk_unlikely(rc)) {
2607 			return;
2608 		}
2609 	}
2610 }
2611 
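/*
 * Split an unmap into children of at most max_unmap * max_unmap_segments blocks,
 * issuing up to SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS children per call;
 * bdev_io_split_done() re-invokes this function until all blocks are consumed.
 */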
2612 static void
2613 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
2614 {
2615 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
2616 	uint32_t num_children_reqs = 0;
2617 	int rc;
2618 
2619 	offset = bdev_io->u.bdev.split_current_offset_blocks;
2620 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2621 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
2622 
2623 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2624 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
2625 
2626 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
2627 					  &offset, &remaining);
2628 		if (spdk_likely(rc == 0)) {
2629 			num_children_reqs++;
2630 		} else {
2631 			return;
2632 		}
2633 	}
2634 }
2635 
2636 static void
2637 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
2638 {
2639 	uint64_t offset, write_zeroes_blocks, remaining;
2640 	uint32_t num_children_reqs = 0;
2641 	int rc;
2642 
2643 	offset = bdev_io->u.bdev.split_current_offset_blocks;
2644 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2645 
2646 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2647 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
2648 
2649 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
2650 					  &offset, &remaining);
2651 		if (spdk_likely(rc == 0)) {
2652 			num_children_reqs++;
2653 		} else {
2654 			return;
2655 		}
2656 	}
2657 }
2658 
2659 static void
2660 bdev_copy_split(struct spdk_bdev_io *bdev_io)
2661 {
2662 	uint64_t offset, copy_blocks, remaining;
2663 	uint32_t num_children_reqs = 0;
2664 	int rc;
2665 
2666 	offset = bdev_io->u.bdev.split_current_offset_blocks;
2667 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2668 
2669 	assert(bdev_io->bdev->max_copy != 0);
2670 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
2671 		copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
2672 
2673 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
2674 					  &offset, &remaining);
2675 		if (spdk_likely(rc == 0)) {
2676 			num_children_reqs++;
2677 		} else {
2678 			return;
2679 		}
2680 	}
2681 }
2682 
2683 static void
2684 parent_bdev_io_complete(void *ctx, int rc)
2685 {
2686 	struct spdk_bdev_io *parent_io = ctx;
2687 
2688 	if (rc) {
2689 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2690 	}
2691 
2692 	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
2693 			       parent_io->internal.caller_ctx);
2694 }
2695 
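/*
 * Completion callback shared by all child I/O created while splitting. When the last
 * outstanding child finishes, either complete the parent (all blocks consumed, or a
 * child failed) or continue splitting the remaining blocks.
 */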
2696 static void
2697 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
2698 {
2699 	struct spdk_bdev_io *parent_io = cb_arg;
2700 
2701 	spdk_bdev_free_io(bdev_io);
2702 
2703 	if (!success) {
2704 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2705 		/* If any child I/O failed, stop the splitting process. */
2706 		parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
2707 		parent_io->u.bdev.split_remaining_num_blocks = 0;
2708 	}
2709 	parent_io->u.bdev.split_outstanding--;
2710 	if (parent_io->u.bdev.split_outstanding != 0) {
2711 		return;
2712 	}
2713 
2714 	/*
2715 	 * Parent I/O finishes when all blocks are consumed.
2716 	 */
2717 	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
2718 		assert(parent_io->internal.cb != bdev_io_split_done);
2719 		spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, parent_io->internal.caller_ctx);
2720 		TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link);
2721 
2722 		if (parent_io->internal.orig_iovcnt != 0) {
2723 			_bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
2724 			/* bdev IO will be completed in the callback */
2725 		} else {
2726 			parent_bdev_io_complete(parent_io, 0);
2727 		}
2728 		return;
2729 	}
2730 
2731 	/*
2732 	 * Continue with the splitting process.  This function will complete the parent I/O if the
2733 	 * splitting is done.
2734 	 */
2735 	switch (parent_io->type) {
2736 	case SPDK_BDEV_IO_TYPE_READ:
2737 	case SPDK_BDEV_IO_TYPE_WRITE:
2738 		_bdev_rw_split(parent_io);
2739 		break;
2740 	case SPDK_BDEV_IO_TYPE_UNMAP:
2741 		bdev_unmap_split(parent_io);
2742 		break;
2743 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2744 		bdev_write_zeroes_split(parent_io);
2745 		break;
2746 	case SPDK_BDEV_IO_TYPE_COPY:
2747 		bdev_copy_split(parent_io);
2748 		break;
2749 	default:
2750 		assert(false);
2751 		break;
2752 	}
2753 }
2754 
2755 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
2756 				     bool success);
2757 
2758 static void
2759 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
2760 {
2761 	bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
2762 	bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
2763 	bdev_io->u.bdev.split_outstanding = 0;
2764 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
2765 
2766 	switch (bdev_io->type) {
2767 	case SPDK_BDEV_IO_TYPE_READ:
2768 	case SPDK_BDEV_IO_TYPE_WRITE:
2769 		if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
2770 			_bdev_rw_split(bdev_io);
2771 		} else {
2772 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
2773 			spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
2774 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
2775 		}
2776 		break;
2777 	case SPDK_BDEV_IO_TYPE_UNMAP:
2778 		bdev_unmap_split(bdev_io);
2779 		break;
2780 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2781 		bdev_write_zeroes_split(bdev_io);
2782 		break;
2783 	case SPDK_BDEV_IO_TYPE_COPY:
2784 		bdev_copy_split(bdev_io);
2785 		break;
2786 	default:
2787 		assert(false);
2788 		break;
2789 	}
2790 }
2791 
2792 static void
2793 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
2794 {
2795 	if (!success) {
2796 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2797 		return;
2798 	}
2799 
2800 	_bdev_rw_split(bdev_io);
2801 }
2802 
2803 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't
2804  *  be inlined, at least on some compilers.
2805  */
2806 static inline void
2807 _bdev_io_submit(void *ctx)
2808 {
2809 	struct spdk_bdev_io *bdev_io = ctx;
2810 	struct spdk_bdev *bdev = bdev_io->bdev;
2811 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
2812 
2813 	if (spdk_likely(bdev_ch->flags == 0)) {
2814 		bdev_io_do_submit(bdev_ch, bdev_io);
2815 		return;
2816 	}
2817 
2818 	if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
2819 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
2820 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
2821 		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
2822 		    bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) {
2823 			_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
2824 		} else {
2825 			TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
2826 			bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
2827 		}
2828 	} else {
2829 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
2830 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2831 	}
2832 }
2833 
2834 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
2835 
2836 bool
2837 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
2838 {
2839 	if (range1->length == 0 || range2->length == 0) {
2840 		return false;
2841 	}
2842 
2843 	if (range1->offset + range1->length <= range2->offset) {
2844 		return false;
2845 	}
2846 
2847 	if (range2->offset + range2->length <= range1->offset) {
2848 		return false;
2849 	}
2850 
2851 	return true;
2852 }
2853 
2854 static bool
2855 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
2856 {
2857 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2858 	struct lba_range r;
2859 
2860 	switch (bdev_io->type) {
2861 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2862 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2863 		/* Don't try to decode the NVMe command - just assume worst-case and that
2864 		 * it overlaps a locked range.
2865 		 */
2866 		return true;
2867 	case SPDK_BDEV_IO_TYPE_WRITE:
2868 	case SPDK_BDEV_IO_TYPE_UNMAP:
2869 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2870 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2871 	case SPDK_BDEV_IO_TYPE_COPY:
2872 		r.offset = bdev_io->u.bdev.offset_blocks;
2873 		r.length = bdev_io->u.bdev.num_blocks;
2874 		if (!bdev_lba_range_overlapped(range, &r)) {
2875 			/* This I/O doesn't overlap the specified LBA range. */
2876 			return false;
2877 		} else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
2878 			/* This I/O overlaps, but the I/O is on the same channel that locked this
2879 			 * range, and the caller_ctx is the same as the locked_ctx.  This means
2880 			 * that this I/O is associated with the lock, and is allowed to execute.
2881 			 */
2882 			return false;
2883 		} else {
2884 			return true;
2885 		}
2886 	default:
2887 		return false;
2888 	}
2889 }
2890 
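/*
 * Main submission entry point for a fully initialized bdev_io. I/O overlapping a
 * locked LBA range are parked on the channel's io_locked queue; otherwise the I/O is
 * traced, split if required by the bdev's limits, and then submitted either directly
 * or via the QoS thread when rate limiting is enabled.
 */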
2891 void
2892 bdev_io_submit(struct spdk_bdev_io *bdev_io)
2893 {
2894 	struct spdk_bdev *bdev = bdev_io->bdev;
2895 	struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io);
2896 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2897 
2898 	assert(thread != NULL);
2899 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
2900 
2901 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
2902 		struct lba_range *range;
2903 
2904 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
2905 			if (bdev_io_range_is_locked(bdev_io, range)) {
2906 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
2907 				return;
2908 			}
2909 		}
2910 	}
2911 
2912 	TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link);
2913 
2914 	bdev_io->internal.submit_tsc = spdk_get_ticks();
2915 	spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0,
2916 			      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
2917 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
2918 			      spdk_bdev_get_name(bdev));
2919 
2920 	if (bdev_io_should_split(bdev_io)) {
2921 		bdev_io_split(NULL, bdev_io);
2922 		return;
2923 	}
2924 
2925 	if (ch->flags & BDEV_CH_QOS_ENABLED) {
2926 		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
2927 			_bdev_io_submit(bdev_io);
2928 		} else {
2929 			bdev_io->internal.io_submit_ch = ch;
2930 			bdev_io->internal.ch = bdev->internal.qos->ch;
2931 			spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io);
2932 		}
2933 	} else {
2934 		_bdev_io_submit(bdev_io);
2935 	}
2936 }
2937 
2938 static inline void
2939 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
2940 {
2941 	struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy;
2942 
2943 	/* Zero out the tail of the copy that the caller's (smaller) opts struct does not cover */
2944 	memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size);
2945 	memcpy(opts_copy, opts, opts->size);
2946 	opts_copy->size = sizeof(*opts_copy);
2947 	opts_copy->metadata = bdev_io->u.bdev.md_buf;
2948 	/* Save pointer to the copied ext_opts which will be used by bdev modules */
2949 	bdev_io->u.bdev.ext_opts = opts_copy;
2950 }
2951 
2952 static inline void
2953 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
2954 {
2955 	/* The bdev doesn't support memory domains, so the buffers in this I/O request can't
2956 	 * be accessed directly; bounce buffers must be allocated before issuing the I/O.
2957 	 * For a write operation, we pull data out of the memory domain before submitting the I/O.
2958 	 * Once a read operation completes, we push the data back into the original memory
2959 	 * domain buffer.
2960 	 * This I/O request then goes through the regular I/O flow, so clear the memory domain
2961 	 * pointers in the copied ext_opts. */
2962 	bdev_io->internal.ext_opts_copy.memory_domain = NULL;
2963 	bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL;
2964 	_bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb,
2965 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
2966 }
2967 
2968 static inline void
2969 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io,
2970 		    struct spdk_bdev_ext_io_opts *opts, bool copy_opts)
2971 {
2972 	if (opts) {
2973 		bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported;
2974 		assert(opts->size <= sizeof(*opts));
2975 		/*
2976 		 * Copy the opts if the caller's size is smaller than our opts struct, to avoid
2977 		 * having to check the size on every access to bdev_io->u.bdev.ext_opts.
2978 		 */
2979 		if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) {
2980 			_bdev_io_copy_ext_opts(bdev_io, opts);
2981 			if (use_pull_push) {
2982 				_bdev_io_ext_use_bounce_buffer(bdev_io);
2983 				return;
2984 			}
2985 		}
2986 	}
2987 	bdev_io_submit(bdev_io);
2988 }
2989 
2990 static void
2991 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
2992 {
2993 	struct spdk_bdev *bdev = bdev_io->bdev;
2994 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
2995 	struct spdk_io_channel *ch = bdev_ch->channel;
2996 
2997 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
2998 
2999 	bdev_io->internal.in_submit_request = true;
3000 	bdev->fn_table->submit_request(ch, bdev_io);
3001 	bdev_io->internal.in_submit_request = false;
3002 }
3003 
3004 void
3005 bdev_io_init(struct spdk_bdev_io *bdev_io,
3006 	     struct spdk_bdev *bdev, void *cb_arg,
3007 	     spdk_bdev_io_completion_cb cb)
3008 {
3009 	bdev_io->bdev = bdev;
3010 	bdev_io->internal.caller_ctx = cb_arg;
3011 	bdev_io->internal.cb = cb;
3012 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3013 	bdev_io->internal.in_submit_request = false;
3014 	bdev_io->internal.buf = NULL;
3015 	bdev_io->internal.io_submit_ch = NULL;
3016 	bdev_io->internal.orig_iovs = NULL;
3017 	bdev_io->internal.orig_iovcnt = 0;
3018 	bdev_io->internal.orig_md_iov.iov_base = NULL;
3019 	bdev_io->internal.error.nvme.cdw0 = 0;
3020 	bdev_io->num_retries = 0;
3021 	bdev_io->internal.get_buf_cb = NULL;
3022 	bdev_io->internal.get_aux_buf_cb = NULL;
3023 	bdev_io->internal.ext_opts = NULL;
3024 	bdev_io->internal.data_transfer_cpl = NULL;
3025 }
3026 
3027 static bool
3028 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3029 {
3030 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
3031 }
3032 
3033 bool
3034 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3035 {
3036 	bool supported;
3037 
3038 	supported = bdev_io_type_supported(bdev, io_type);
3039 
3040 	if (!supported) {
3041 		switch (io_type) {
3042 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3043 			/* The bdev layer will emulate write zeroes as long as write is supported. */
3044 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3045 			break;
3046 		default:
3047 			break;
3048 		}
3049 	}
3050 
3051 	return supported;
3052 }
3053 
3054 uint64_t
3055 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3056 {
3057 	return bdev_io->internal.submit_tsc;
3058 }
3059 
3060 int
3061 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3062 {
3063 	if (bdev->fn_table->dump_info_json) {
3064 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3065 	}
3066 
3067 	return 0;
3068 }
3069 
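/*
 * Recompute the per-timeslice budget for each enabled limit:
 * max_per_timeslice = limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC,
 * clamped to the per-type minimum. For example, a 104857600 byte/s (100 MiB/s) limit
 * with the default 1000 usec timeslice allows 104857 bytes per timeslice.
 */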
3070 static void
3071 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3072 {
3073 	uint32_t max_per_timeslice = 0;
3074 	int i;
3075 
3076 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3077 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3078 			qos->rate_limits[i].max_per_timeslice = 0;
3079 			continue;
3080 		}
3081 
3082 		max_per_timeslice = qos->rate_limits[i].limit *
3083 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3084 
3085 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3086 							qos->rate_limits[i].min_per_timeslice);
3087 
3088 		qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice;
3089 	}
3090 
3091 	bdev_qos_set_ops(qos);
3092 }
3093 
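/*
 * QoS poller, run on the QoS thread once per timeslice. Unused quota is discarded,
 * negative (overrun) quota is carried into the new timeslice(s), remaining quota is
 * refilled from max_per_timeslice, and any queued I/O that now fits is submitted.
 */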
3094 static int
3095 bdev_channel_poll_qos(void *arg)
3096 {
3097 	struct spdk_bdev_qos *qos = arg;
3098 	uint64_t now = spdk_get_ticks();
3099 	int i;
3100 
3101 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3102 		/* We received our callback earlier than expected - return
3103 		 *  immediately and wait to do accounting until at least one
3104 		 *  timeslice has actually expired.  This should never happen
3105 		 *  with a well-behaved timer implementation.
3106 		 */
3107 		return SPDK_POLLER_IDLE;
3108 	}
3109 
3110 	/* Reset for next round of rate limiting */
3111 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3112 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3113 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3114 		 * here, we'll account for the overrun so that the next timeslice will
3115 		 * be appropriately reduced.
3116 		 */
3117 		if (qos->rate_limits[i].remaining_this_timeslice > 0) {
3118 			qos->rate_limits[i].remaining_this_timeslice = 0;
3119 		}
3120 	}
3121 
3122 	while (now >= (qos->last_timeslice + qos->timeslice_size)) {
3123 		qos->last_timeslice += qos->timeslice_size;
3124 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3125 			qos->rate_limits[i].remaining_this_timeslice +=
3126 				qos->rate_limits[i].max_per_timeslice;
3127 		}
3128 	}
3129 
3130 	return bdev_qos_io_submit(qos->ch, qos);
3131 }
3132 
3133 static void
3134 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
3135 {
3136 	struct spdk_bdev_shared_resource *shared_resource;
3137 	struct lba_range *range;
3138 
3139 	bdev_io_stat_free(ch->stat);
3140 #ifdef SPDK_CONFIG_VTUNE
3141 	bdev_io_stat_free(ch->prev_stat);
3142 #endif
3143 
3144 	while (!TAILQ_EMPTY(&ch->locked_ranges)) {
3145 		range = TAILQ_FIRST(&ch->locked_ranges);
3146 		TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
3147 		free(range);
3148 	}
3149 
3150 	spdk_put_io_channel(ch->channel);
3151 
3152 	shared_resource = ch->shared_resource;
3153 
3154 	assert(TAILQ_EMPTY(&ch->io_locked));
3155 	assert(TAILQ_EMPTY(&ch->io_submitted));
3156 	assert(ch->io_outstanding == 0);
3157 	assert(shared_resource->ref > 0);
3158 	shared_resource->ref--;
3159 	if (shared_resource->ref == 0) {
3160 		assert(shared_resource->io_outstanding == 0);
3161 		TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
3162 		spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
3163 		free(shared_resource);
3164 	}
3165 }
3166 
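/*
 * Called with bdev->internal.spinlock held when a new channel is created. If QoS is
 * configured and no channel owns it yet, this channel becomes the QoS channel: the
 * per-timeslice quotas are initialized and the QoS poller is registered on this
 * channel's thread. Every channel of a QoS-enabled bdev gets the BDEV_CH_QOS_ENABLED
 * flag so that its I/O are routed through QoS.
 */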
3167 static void
3168 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
3169 {
3170 	struct spdk_bdev_qos	*qos = bdev->internal.qos;
3171 	int			i;
3172 
3173 	assert(spdk_spin_held(&bdev->internal.spinlock));
3174 
3175 	/* Rate limiting is enabled on this bdev */
3176 	if (qos) {
3177 		if (qos->ch == NULL) {
3178 			struct spdk_io_channel *io_ch;
3179 
3180 			SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
3181 				      bdev->name, spdk_get_thread());
3182 
3183 			/* No qos channel has been selected, so set one up */
3184 
3185 			/* Take another reference to ch */
3186 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
3187 			assert(io_ch != NULL);
3188 			qos->ch = ch;
3189 
3190 			qos->thread = spdk_io_channel_get_thread(io_ch);
3191 
3192 			TAILQ_INIT(&qos->queued);
3193 
3194 			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3195 				if (bdev_qos_is_iops_rate_limit(i) == true) {
3196 					qos->rate_limits[i].min_per_timeslice =
3197 						SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
3198 				} else {
3199 					qos->rate_limits[i].min_per_timeslice =
3200 						SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
3201 				}
3202 
3203 				if (qos->rate_limits[i].limit == 0) {
3204 					qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
3205 				}
3206 			}
3207 			bdev_qos_update_max_quota_per_timeslice(qos);
3208 			qos->timeslice_size =
3209 				SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
3210 			qos->last_timeslice = spdk_get_ticks();
3211 			qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
3212 							   qos,
3213 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
3214 		}
3215 
3216 		ch->flags |= BDEV_CH_QOS_ENABLED;
3217 	}
3218 }
3219 
3220 struct poll_timeout_ctx {
3221 	struct spdk_bdev_desc	*desc;
3222 	uint64_t		timeout_in_sec;
3223 	spdk_bdev_io_timeout_cb	cb_fn;
3224 	void			*cb_arg;
3225 };
3226 
3227 static void
3228 bdev_desc_free(struct spdk_bdev_desc *desc)
3229 {
3230 	spdk_spin_destroy(&desc->spinlock);
3231 	free(desc->media_events_buffer);
3232 	free(desc);
3233 }
3234 
3235 static void
3236 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
3237 {
3238 	struct poll_timeout_ctx *ctx  = _ctx;
3239 	struct spdk_bdev_desc *desc = ctx->desc;
3240 
3241 	free(ctx);
3242 
3243 	spdk_spin_lock(&desc->spinlock);
3244 	desc->refs--;
3245 	if (desc->closed == true && desc->refs == 0) {
3246 		spdk_spin_unlock(&desc->spinlock);
3247 		bdev_desc_free(desc);
3248 		return;
3249 	}
3250 	spdk_spin_unlock(&desc->spinlock);
3251 }
3252 
3253 static void
3254 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3255 			     struct spdk_io_channel *io_ch, void *_ctx)
3256 {
3257 	struct poll_timeout_ctx *ctx  = _ctx;
3258 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3259 	struct spdk_bdev_desc *desc = ctx->desc;
3260 	struct spdk_bdev_io *bdev_io;
3261 	uint64_t now;
3262 
3263 	spdk_spin_lock(&desc->spinlock);
3264 	if (desc->closed == true) {
3265 		spdk_spin_unlock(&desc->spinlock);
3266 		spdk_bdev_for_each_channel_continue(i, -1);
3267 		return;
3268 	}
3269 	spdk_spin_unlock(&desc->spinlock);
3270 
3271 	now = spdk_get_ticks();
3272 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
3273 		/* Exclude any I/O that are generated via splitting. */
3274 		if (bdev_io->internal.cb == bdev_io_split_done) {
3275 			continue;
3276 		}
3277 
3278 		/* Once we find an I/O that has not timed out, we can immediately
3279 		 * exit the loop.
3280 		 */
3281 		if (now < (bdev_io->internal.submit_tsc +
3282 			   ctx->timeout_in_sec * spdk_get_ticks_hz())) {
3283 			goto end;
3284 		}
3285 
3286 		if (bdev_io->internal.desc == desc) {
3287 			ctx->cb_fn(ctx->cb_arg, bdev_io);
3288 		}
3289 	}
3290 
3291 end:
3292 	spdk_bdev_for_each_channel_continue(i, 0);
3293 }
3294 
3295 static int
3296 bdev_poll_timeout_io(void *arg)
3297 {
3298 	struct spdk_bdev_desc *desc = arg;
3299 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
3300 	struct poll_timeout_ctx *ctx;
3301 
3302 	ctx = calloc(1, sizeof(struct poll_timeout_ctx));
3303 	if (!ctx) {
3304 		SPDK_ERRLOG("failed to allocate memory\n");
3305 		return SPDK_POLLER_BUSY;
3306 	}
3307 	ctx->desc = desc;
3308 	ctx->cb_arg = desc->cb_arg;
3309 	ctx->cb_fn = desc->cb_fn;
3310 	ctx->timeout_in_sec = desc->timeout_in_sec;
3311 
3312 	/* Take a ref on the descriptor in case it gets closed while we are checking
3313 	 * all of the channels.
3314 	 */
3315 	spdk_spin_lock(&desc->spinlock);
3316 	desc->refs++;
3317 	spdk_spin_unlock(&desc->spinlock);
3318 
3319 	spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
3320 				   bdev_channel_poll_timeout_io_done);
3321 
3322 	return SPDK_POLLER_BUSY;
3323 }
3324 
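/*
 * Enable (timeout_in_sec > 0) or disable (timeout_in_sec == 0) I/O timeout detection
 * on a descriptor. The registered poller periodically walks every channel's submitted
 * I/O and invokes cb_fn for each I/O on this descriptor that has been outstanding
 * longer than timeout_in_sec. Illustrative sketch only; the desc/timeout_cb names
 * below are hypothetical and not part of this file:
 *
 *	static void timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io) {
 *		SPDK_NOTICELOG("bdev I/O %p timed out\n", bdev_io);
 *	}
 *	...
 *	spdk_bdev_set_timeout(desc, 30, timeout_cb, NULL);
 */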
3325 int
3326 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
3327 		      spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
3328 {
3329 	assert(desc->thread == spdk_get_thread());
3330 
3331 	spdk_poller_unregister(&desc->io_timeout_poller);
3332 
3333 	if (timeout_in_sec) {
3334 		assert(cb_fn != NULL);
3335 		desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
3336 					  desc,
3337 					  SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
3338 					  1000);
3339 		if (desc->io_timeout_poller == NULL) {
3340 			SPDK_ERRLOG("cannot register the desc timeout IO poller\n");
3341 			return -1;
3342 		}
3343 	}
3344 
3345 	desc->cb_fn = cb_fn;
3346 	desc->cb_arg = cb_arg;
3347 	desc->timeout_in_sec = timeout_in_sec;
3348 
3349 	return 0;
3350 }
3351 
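/*
 * io_device create callback for a bdev channel. Obtains the module's I/O channel,
 * attaches to (or allocates) the shared_resource grouping channels that use the same
 * underlying module channel, enables QoS if configured, and copies the bdev's
 * currently locked LBA ranges into the new channel.
 */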
3352 static int
3353 bdev_channel_create(void *io_device, void *ctx_buf)
3354 {
3355 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
3356 	struct spdk_bdev_channel	*ch = ctx_buf;
3357 	struct spdk_io_channel		*mgmt_io_ch;
3358 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
3359 	struct spdk_bdev_shared_resource *shared_resource;
3360 	struct lba_range		*range;
3361 
3362 	ch->bdev = bdev;
3363 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
3364 	if (!ch->channel) {
3365 		return -1;
3366 	}
3367 
3368 	spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name,
3369 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
3370 
3371 	assert(ch->histogram == NULL);
3372 	if (bdev->internal.histogram_enabled) {
3373 		ch->histogram = spdk_histogram_data_alloc();
3374 		if (ch->histogram == NULL) {
3375 			SPDK_ERRLOG("Could not allocate histogram\n");
3376 		}
3377 	}
3378 
3379 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
3380 	if (!mgmt_io_ch) {
3381 		spdk_put_io_channel(ch->channel);
3382 		return -1;
3383 	}
3384 
3385 	mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
3386 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
3387 		if (shared_resource->shared_ch == ch->channel) {
3388 			spdk_put_io_channel(mgmt_io_ch);
3389 			shared_resource->ref++;
3390 			break;
3391 		}
3392 	}
3393 
3394 	if (shared_resource == NULL) {
3395 		shared_resource = calloc(1, sizeof(*shared_resource));
3396 		if (shared_resource == NULL) {
3397 			spdk_put_io_channel(ch->channel);
3398 			spdk_put_io_channel(mgmt_io_ch);
3399 			return -1;
3400 		}
3401 
3402 		shared_resource->mgmt_ch = mgmt_ch;
3403 		shared_resource->io_outstanding = 0;
3404 		TAILQ_INIT(&shared_resource->nomem_io);
3405 		shared_resource->nomem_threshold = 0;
3406 		shared_resource->shared_ch = ch->channel;
3407 		shared_resource->ref = 1;
3408 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
3409 	}
3410 
3411 	ch->io_outstanding = 0;
3412 	TAILQ_INIT(&ch->queued_resets);
3413 	TAILQ_INIT(&ch->locked_ranges);
3414 	ch->flags = 0;
3415 	ch->shared_resource = shared_resource;
3416 
3417 	TAILQ_INIT(&ch->io_submitted);
3418 	TAILQ_INIT(&ch->io_locked);
3419 
3420 	ch->stat = bdev_io_stat_alloc();
3421 	if (ch->stat == NULL) {
3422 		bdev_channel_destroy_resource(ch);
3423 		return -1;
3424 	}
3425 
3426 	ch->stat->ticks_rate = spdk_get_ticks_hz();
3427 
3428 #ifdef SPDK_CONFIG_VTUNE
3429 	{
3430 		char *name;
3431 		__itt_init_ittlib(NULL, 0);
3432 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
3433 		if (!name) {
3434 			bdev_channel_destroy_resource(ch);
3435 			return -1;
3436 		}
3437 		ch->handle = __itt_string_handle_create(name);
3438 		free(name);
3439 		ch->start_tsc = spdk_get_ticks();
3440 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
3441 		ch->prev_stat = bdev_io_stat_alloc();
3442 		if (ch->prev_stat == NULL) {
3443 			bdev_channel_destroy_resource(ch);
3444 			return -1;
3445 		}
3446 	}
3447 #endif
3448 
3449 	spdk_spin_lock(&bdev->internal.spinlock);
3450 	bdev_enable_qos(bdev, ch);
3451 
3452 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
3453 		struct lba_range *new_range;
3454 
3455 		new_range = calloc(1, sizeof(*new_range));
3456 		if (new_range == NULL) {
3457 			spdk_spin_unlock(&bdev->internal.spinlock);
3458 			bdev_channel_destroy_resource(ch);
3459 			return -1;
3460 		}
3461 		new_range->length = range->length;
3462 		new_range->offset = range->offset;
3463 		new_range->locked_ctx = range->locked_ctx;
3464 		TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
3465 	}
3466 
3467 	spdk_spin_unlock(&bdev->internal.spinlock);
3468 
3469 	return 0;
3470 }
3471 
3472 static int
3473 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
3474 			 void *cb_ctx)
3475 {
3476 	struct spdk_bdev_channel *bdev_ch = cb_ctx;
3477 	struct spdk_bdev_io *bdev_io;
3478 	uint64_t buf_len;
3479 
3480 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
3481 	if (bdev_io->internal.ch == bdev_ch) {
3482 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
3483 		spdk_iobuf_entry_abort(ch, entry, buf_len);
3484 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3485 	}
3486 
3487 	return 0;
3488 }
3489 
3490 /*
3491  * Abort I/O that are waiting on a data buffer.
3492  */
3493 static void
3494 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
3495 {
3496 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
3497 				  bdev_abort_all_buf_io_cb, ch);
3498 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
3499 				  bdev_abort_all_buf_io_cb, ch);
3500 }
3501 
3502 /*
3503  * Abort I/O that are queued waiting for submission.  These types of I/O are
3504  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
3505  */
3506 static void
3507 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
3508 {
3509 	struct spdk_bdev_io *bdev_io, *tmp;
3510 
3511 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
3512 		if (bdev_io->internal.ch == ch) {
3513 			TAILQ_REMOVE(queue, bdev_io, internal.link);
3514 			/*
3515 			 * spdk_bdev_io_complete() assumes that the completed I/O had
3516 			 *  been submitted to the bdev module.  Since in this case it
3517 			 *  hadn't, bump io_outstanding to account for the decrement
3518 			 *  that spdk_bdev_io_complete() will do.
3519 			 */
3520 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
3521 				ch->io_outstanding++;
3522 				ch->shared_resource->io_outstanding++;
3523 			}
3524 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3525 		}
3526 	}
3527 }
3528 
3529 static bool
3530 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
3531 {
3532 	struct spdk_bdev_io *bdev_io;
3533 
3534 	TAILQ_FOREACH(bdev_io, queue, internal.link) {
3535 		if (bdev_io == bio_to_abort) {
3536 			TAILQ_REMOVE(queue, bio_to_abort, internal.link);
3537 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
3538 			return true;
3539 		}
3540 	}
3541 
3542 	return false;
3543 }
3544 
3545 static int
3546 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
3547 {
3548 	struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
3549 	uint64_t buf_len;
3550 
3551 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
3552 	if (bdev_io == bio_to_abort) {
3553 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
3554 		spdk_iobuf_entry_abort(ch, entry, buf_len);
3555 		spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
3556 		return 1;
3557 	}
3558 
3559 	return 0;
3560 }
3561 
3562 static bool
3563 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
3564 {
3565 	int rc;
3566 
3567 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
3568 				       bdev_abort_buf_io_cb, bio_to_abort);
3569 	if (rc == 1) {
3570 		return true;
3571 	}
3572 
3573 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
3574 				       bdev_abort_buf_io_cb, bio_to_abort);
3575 	return rc == 1;
3576 }
3577 
3578 static void
3579 bdev_qos_channel_destroy(void *cb_arg)
3580 {
3581 	struct spdk_bdev_qos *qos = cb_arg;
3582 
3583 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
3584 	spdk_poller_unregister(&qos->poller);
3585 
3586 	SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
3587 
3588 	free(qos);
3589 }
3590 
3591 static int
3592 bdev_qos_destroy(struct spdk_bdev *bdev)
3593 {
3594 	int i;
3595 
3596 	/*
3597 	 * Cleanly shutting down the QoS poller is tricky, because
3598 	 * during the asynchronous operation the user could open
3599 	 * a new descriptor and create a new channel, spawning
3600 	 * a new QoS poller.
3601 	 *
3602 	 * The strategy is to create a new QoS structure here and swap it
3603 	 * in. The shutdown path then continues to refer to the old one
3604 	 * until it completes and then releases it.
3605 	 */
3606 	struct spdk_bdev_qos *new_qos, *old_qos;
3607 
3608 	old_qos = bdev->internal.qos;
3609 
3610 	new_qos = calloc(1, sizeof(*new_qos));
3611 	if (!new_qos) {
3612 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
3613 		return -ENOMEM;
3614 	}
3615 
3616 	/* Copy the old QoS data into the newly allocated structure */
3617 	memcpy(new_qos, old_qos, sizeof(*new_qos));
3618 
3619 	/* Zero out the key parts of the QoS structure */
3620 	new_qos->ch = NULL;
3621 	new_qos->thread = NULL;
3622 	new_qos->poller = NULL;
3623 	TAILQ_INIT(&new_qos->queued);
3624 	/*
3625 	 * The limit member of spdk_bdev_qos_limit structure is not zeroed.
3626 	 * It will be used later for the new QoS structure.
3627 	 */
3628 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3629 		new_qos->rate_limits[i].remaining_this_timeslice = 0;
3630 		new_qos->rate_limits[i].min_per_timeslice = 0;
3631 		new_qos->rate_limits[i].max_per_timeslice = 0;
3632 	}
3633 
3634 	bdev->internal.qos = new_qos;
3635 
3636 	if (old_qos->thread == NULL) {
3637 		free(old_qos);
3638 	} else {
3639 		spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
3640 	}
3641 
3642 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
3643 	 * been destroyed yet. The destruction path will end up waiting for the final
3644 	 * channel to be put before it releases resources. */
3645 
3646 	return 0;
3647 }
3648 
3649 static void
3650 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
3651 {
3652 	total->bytes_read += add->bytes_read;
3653 	total->num_read_ops += add->num_read_ops;
3654 	total->bytes_written += add->bytes_written;
3655 	total->num_write_ops += add->num_write_ops;
3656 	total->bytes_unmapped += add->bytes_unmapped;
3657 	total->num_unmap_ops += add->num_unmap_ops;
3658 	total->bytes_copied += add->bytes_copied;
3659 	total->num_copy_ops += add->num_copy_ops;
3660 	total->read_latency_ticks += add->read_latency_ticks;
3661 	total->write_latency_ticks += add->write_latency_ticks;
3662 	total->unmap_latency_ticks += add->unmap_latency_ticks;
3663 	total->copy_latency_ticks += add->copy_latency_ticks;
3664 	if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
3665 		total->max_read_latency_ticks = add->max_read_latency_ticks;
3666 	}
3667 	if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
3668 		total->min_read_latency_ticks = add->min_read_latency_ticks;
3669 	}
3670 	if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
3671 		total->max_write_latency_ticks = add->max_write_latency_ticks;
3672 	}
3673 	if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
3674 		total->min_write_latency_ticks = add->min_write_latency_ticks;
3675 	}
3676 	if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
3677 		total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
3678 	}
3679 	if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
3680 		total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
3681 	}
3682 	if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
3683 		total->max_copy_latency_ticks = add->max_copy_latency_ticks;
3684 	}
3685 	if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
3686 		total->min_copy_latency_ticks = add->min_copy_latency_ticks;
3687 	}
3688 }
3689 
3690 static void
3691 bdev_io_stat_get(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
3692 {
3693 	memcpy(to_stat, from_stat, sizeof(struct spdk_bdev_io_stat));
3694 }
3695 
3696 static void
3697 bdev_io_stat_reset(struct spdk_bdev_io_stat *stat, enum bdev_reset_stat_mode mode)
3698 {
3699 	stat->max_read_latency_ticks = 0;
3700 	stat->min_read_latency_ticks = UINT64_MAX;
3701 	stat->max_write_latency_ticks = 0;
3702 	stat->min_write_latency_ticks = UINT64_MAX;
3703 	stat->max_unmap_latency_ticks = 0;
3704 	stat->min_unmap_latency_ticks = UINT64_MAX;
3705 	stat->max_copy_latency_ticks = 0;
3706 	stat->min_copy_latency_ticks = UINT64_MAX;
3707 
3708 	if (mode != BDEV_RESET_STAT_ALL) {
3709 		return;
3710 	}
3711 
3712 	stat->bytes_read = 0;
3713 	stat->num_read_ops = 0;
3714 	stat->bytes_written = 0;
3715 	stat->num_write_ops = 0;
3716 	stat->bytes_unmapped = 0;
3717 	stat->num_unmap_ops = 0;
3718 	stat->read_latency_ticks = 0;
3719 	stat->write_latency_ticks = 0;
3720 	stat->unmap_latency_ticks = 0;
	stat->bytes_copied = 0;
	stat->num_copy_ops = 0;
	stat->copy_latency_ticks = 0;
3721 }
3722 
3723 struct spdk_bdev_io_stat *
3724 bdev_io_stat_alloc(void)
3725 {
3726 	struct spdk_bdev_io_stat *stat;
3727 
3728 	stat = malloc(sizeof(struct spdk_bdev_io_stat));
3729 	if (stat != NULL) {
3730 		bdev_io_stat_reset(stat, BDEV_RESET_STAT_ALL);
3731 	}
3732 
3733 	return stat;
3734 }
3735 
3736 void
3737 bdev_io_stat_free(struct spdk_bdev_io_stat *stat)
3738 {
3739 	free(stat);
3740 }
3741 
3742 void
3743 bdev_io_stat_dump_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
3744 {
3745 	spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
3746 	spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
3747 	spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
3748 	spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
3749 	spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
3750 	spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
3751 	spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
3752 	spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
3753 	spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
3754 	spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
3755 	spdk_json_write_named_uint64(w, "min_read_latency_ticks",
3756 				     stat->min_read_latency_ticks != UINT64_MAX ?
3757 				     stat->min_read_latency_ticks : 0);
3758 	spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
3759 	spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
3760 	spdk_json_write_named_uint64(w, "min_write_latency_ticks",
3761 				     stat->min_write_latency_ticks != UINT64_MAX ?
3762 				     stat->min_write_latency_ticks : 0);
3763 	spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
3764 	spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
3765 	spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
3766 				     stat->min_unmap_latency_ticks != UINT64_MAX ?
3767 				     stat->min_unmap_latency_ticks : 0);
3768 	spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
3769 	spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
3770 	spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
3771 				     stat->min_copy_latency_ticks != UINT64_MAX ?
3772 				     stat->min_copy_latency_ticks : 0);
3773 }
3774 
3775 static void
3776 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
3777 {
3778 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
3779 	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
3780 
3781 	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
3782 	bdev_abort_all_buf_io(mgmt_ch, ch);
3784 }
3785 
3786 static void
3787 bdev_channel_destroy(void *io_device, void *ctx_buf)
3788 {
3789 	struct spdk_bdev_channel *ch = ctx_buf;
3790 
3791 	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
3792 		      spdk_get_thread());
3793 
3794 	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
3795 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
3796 
3797 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
3798 	spdk_spin_lock(&ch->bdev->internal.spinlock);
3799 	bdev_io_stat_add(ch->bdev->internal.stat, ch->stat);
3800 	spdk_spin_unlock(&ch->bdev->internal.spinlock);
3801 
3802 	bdev_abort_all_queued_io(&ch->queued_resets, ch);
3803 
3804 	bdev_channel_abort_queued_ios(ch);
3805 
3806 	if (ch->histogram) {
3807 		spdk_histogram_data_free(ch->histogram);
3808 	}
3809 
3810 	bdev_channel_destroy_resource(ch);
3811 }
3812 
3813 /*
3814  * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
3815  * to it. Hence we do not have to call bdev_get_by_name() when using this function.
3816  */
3817 static int
3818 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
3819 {
3820 	struct spdk_bdev_name *tmp;
3821 
3822 	bdev_name->name = strdup(name);
3823 	if (bdev_name->name == NULL) {
3824 		SPDK_ERRLOG("Unable to allocate bdev name\n");
3825 		return -ENOMEM;
3826 	}
3827 
3828 	bdev_name->bdev = bdev;
3829 
3830 	spdk_spin_lock(&g_bdev_mgr.spinlock);
3831 	tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
3832 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
3833 
3834 	if (tmp != NULL) {
3835 		SPDK_ERRLOG("Bdev name %s already exists\n", name);
3836 		free(bdev_name->name);
3837 		return -EEXIST;
3838 	}
3839 
3840 	return 0;
3841 }
3842 
3843 static void
3844 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
3845 {
3846 	RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
3847 	free(bdev_name->name);
3848 }
3849 
3850 static void
3851 bdev_name_del(struct spdk_bdev_name *bdev_name)
3852 {
3853 	spdk_spin_lock(&g_bdev_mgr.spinlock);
3854 	bdev_name_del_unsafe(bdev_name);
3855 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
3856 }
3857 
3858 int
3859 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
3860 {
3861 	struct spdk_bdev_alias *tmp;
3862 	int ret;
3863 
3864 	if (alias == NULL) {
3865 		SPDK_ERRLOG("Empty alias passed\n");
3866 		return -EINVAL;
3867 	}
3868 
3869 	tmp = calloc(1, sizeof(*tmp));
3870 	if (tmp == NULL) {
3871 		SPDK_ERRLOG("Unable to allocate alias\n");
3872 		return -ENOMEM;
3873 	}
3874 
3875 	ret = bdev_name_add(&tmp->alias, bdev, alias);
3876 	if (ret != 0) {
3877 		free(tmp);
3878 		return ret;
3879 	}
3880 
3881 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
3882 
3883 	return 0;
3884 }
3885 
3886 static int
3887 bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
3888 	       void (*alias_del_fn)(struct spdk_bdev_name *n))
3889 {
3890 	struct spdk_bdev_alias *tmp;
3891 
3892 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
3893 		if (strcmp(alias, tmp->alias.name) == 0) {
3894 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
3895 			alias_del_fn(&tmp->alias);
3896 			free(tmp);
3897 			return 0;
3898 		}
3899 	}
3900 
3901 	return -ENOENT;
3902 }
3903 
3904 int
3905 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
3906 {
3907 	int rc;
3908 
3909 	rc = bdev_alias_del(bdev, alias, bdev_name_del);
3910 	if (rc == -ENOENT) {
3911 		SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
3912 	}
3913 
3914 	return rc;
3915 }
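
/*
 * Illustrative usage sketch (not part of this file's logic): registering an
 * extra name for a bdev and removing it again.  The bdev pointer is assumed to
 * come from the caller and error handling is abbreviated.
 *
 *	int rc;
 *
 *	rc = spdk_bdev_alias_add(bdev, "my_alias");
 *	if (rc == -EEXIST) {
 *		SPDK_ERRLOG("Name my_alias is already in use\n");
 *	}
 *
 *	rc = spdk_bdev_alias_del(bdev, "my_alias");
 */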
3916 
3917 void
3918 spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
3919 {
3920 	struct spdk_bdev_alias *p, *tmp;
3921 
3922 	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
3923 		TAILQ_REMOVE(&bdev->aliases, p, tailq);
3924 		bdev_name_del(&p->alias);
3925 		free(p);
3926 	}
3927 }
3928 
3929 struct spdk_io_channel *
3930 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
3931 {
3932 	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
3933 }
3934 
3935 void *
3936 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
3937 {
3938 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
3939 	void *ctx = NULL;
3940 
3941 	if (bdev->fn_table->get_module_ctx) {
3942 		ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
3943 	}
3944 
3945 	return ctx;
3946 }
3947 
3948 const char *
3949 spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
3950 {
3951 	return bdev->module->name;
3952 }
3953 
3954 const char *
3955 spdk_bdev_get_name(const struct spdk_bdev *bdev)
3956 {
3957 	return bdev->name;
3958 }
3959 
3960 const char *
3961 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
3962 {
3963 	return bdev->product_name;
3964 }
3965 
3966 const struct spdk_bdev_aliases_list *
3967 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
3968 {
3969 	return &bdev->aliases;
3970 }
3971 
3972 uint32_t
3973 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
3974 {
3975 	return bdev->blocklen;
3976 }
3977 
3978 uint32_t
3979 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
3980 {
3981 	return bdev->write_unit_size;
3982 }
3983 
3984 uint64_t
3985 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
3986 {
3987 	return bdev->blockcnt;
3988 }
3989 
3990 const char *
3991 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
3992 {
3993 	return qos_rpc_type[type];
3994 }
3995 
3996 void
3997 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
3998 {
3999 	int i;
4000 
4001 	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
4002 
4003 	spdk_spin_lock(&bdev->internal.spinlock);
4004 	if (bdev->internal.qos) {
4005 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4006 			if (bdev->internal.qos->rate_limits[i].limit !=
4007 			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4008 				limits[i] = bdev->internal.qos->rate_limits[i].limit;
4009 				if (bdev_qos_is_iops_rate_limit(i) == false) {
4010 					/* Change from Byte to Megabyte which is user visible. */
4011 					/* Convert from bytes to megabytes, which is the user-visible unit. */
4012 				}
4013 			}
4014 		}
4015 	}
4016 	spdk_spin_unlock(&bdev->internal.spinlock);
4017 }
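
/*
 * Illustrative usage sketch (not part of this file's logic): reading back the
 * configured QoS limits.  IOPS limits are reported as-is and bandwidth limits
 * in MB/s, matching the conversion above; 0 means the limit is not defined.
 *
 *	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *	int i;
 *
 *	spdk_bdev_get_qos_rate_limits(bdev, limits);
 *	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *		printf("%s: %" PRIu64 "\n", spdk_bdev_get_qos_rpc_type(i), limits[i]);
 *	}
 */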
4018 
4019 size_t
4020 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
4021 {
4022 	return 1 << bdev->required_alignment;
4023 }
4024 
4025 uint32_t
4026 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
4027 {
4028 	return bdev->optimal_io_boundary;
4029 }
4030 
4031 bool
4032 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
4033 {
4034 	return bdev->write_cache;
4035 }
4036 
4037 const struct spdk_uuid *
4038 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
4039 {
4040 	return &bdev->uuid;
4041 }
4042 
4043 uint16_t
4044 spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
4045 {
4046 	return bdev->acwu;
4047 }
4048 
4049 uint32_t
4050 spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
4051 {
4052 	return bdev->md_len;
4053 }
4054 
4055 bool
4056 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
4057 {
4058 	return (bdev->md_len != 0) && bdev->md_interleave;
4059 }
4060 
4061 bool
4062 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
4063 {
4064 	return (bdev->md_len != 0) && !bdev->md_interleave;
4065 }
4066 
4067 bool
4068 spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
4069 {
4070 	return bdev->zoned;
4071 }
4072 
4073 uint32_t
4074 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
4075 {
4076 	if (spdk_bdev_is_md_interleaved(bdev)) {
4077 		return bdev->blocklen - bdev->md_len;
4078 	} else {
4079 		return bdev->blocklen;
4080 	}
4081 }
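
/*
 * Worked example (illustrative): an interleaved-metadata bdev with blocklen 520
 * and md_len 8 reports a data block size of 512, while a separate-metadata bdev
 * with blocklen 512 reports 512 unchanged.
 */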
4082 
4083 uint32_t
4084 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
4085 {
4086 	return bdev->phys_blocklen;
4087 }
4088 
4089 static uint32_t
4090 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
4091 {
4092 	if (!spdk_bdev_is_md_interleaved(bdev)) {
4093 		return bdev->blocklen + bdev->md_len;
4094 	} else {
4095 		return bdev->blocklen;
4096 	}
4097 }
4098 
4099 /* We have to use the typedef in the function declaration to appease astyle. */
4100 typedef enum spdk_dif_type spdk_dif_type_t;
4101 
4102 spdk_dif_type_t
4103 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
4104 {
4105 	if (bdev->md_len != 0) {
4106 		return bdev->dif_type;
4107 	} else {
4108 		return SPDK_DIF_DISABLE;
4109 	}
4110 }
4111 
4112 bool
4113 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
4114 {
4115 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
4116 		return bdev->dif_is_head_of_md;
4117 	} else {
4118 		return false;
4119 	}
4120 }
4121 
4122 bool
4123 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
4124 			       enum spdk_dif_check_type check_type)
4125 {
4126 	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
4127 		return false;
4128 	}
4129 
4130 	switch (check_type) {
4131 	case SPDK_DIF_CHECK_TYPE_REFTAG:
4132 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
4133 	case SPDK_DIF_CHECK_TYPE_APPTAG:
4134 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
4135 	case SPDK_DIF_CHECK_TYPE_GUARD:
4136 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
4137 	default:
4138 		return false;
4139 	}
4140 }
4141 
4142 uint32_t
4143 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
4144 {
4145 	return bdev->max_copy;
4146 }
4147 
4148 uint64_t
4149 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
4150 {
4151 	return bdev->internal.measured_queue_depth;
4152 }
4153 
4154 uint64_t
4155 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
4156 {
4157 	return bdev->internal.period;
4158 }
4159 
4160 uint64_t
4161 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
4162 {
4163 	return bdev->internal.weighted_io_time;
4164 }
4165 
4166 uint64_t
4167 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
4168 {
4169 	return bdev->internal.io_time;
4170 }
4171 
4172 static void bdev_update_qd_sampling_period(void *ctx);
4173 
4174 static void
4175 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
4176 {
4177 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
4178 
4179 	if (bdev->internal.measured_queue_depth) {
4180 		bdev->internal.io_time += bdev->internal.period;
4181 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
4182 	}
4183 
4184 	bdev->internal.qd_poll_in_progress = false;
4185 
4186 	bdev_update_qd_sampling_period(bdev);
4187 }
4188 
4189 static void
4190 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4191 		       struct spdk_io_channel *io_ch, void *_ctx)
4192 {
4193 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
4194 
4195 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
4196 	spdk_bdev_for_each_channel_continue(i, 0);
4197 }
4198 
4199 static int
4200 bdev_calculate_measured_queue_depth(void *ctx)
4201 {
4202 	struct spdk_bdev *bdev = ctx;
4203 
4204 	bdev->internal.qd_poll_in_progress = true;
4205 	bdev->internal.temporary_queue_depth = 0;
4206 	spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
4207 	return SPDK_POLLER_BUSY;
4208 }
4209 
4210 static void
4211 bdev_update_qd_sampling_period(void *ctx)
4212 {
4213 	struct spdk_bdev *bdev = ctx;
4214 
4215 	if (bdev->internal.period == bdev->internal.new_period) {
4216 		return;
4217 	}
4218 
4219 	if (bdev->internal.qd_poll_in_progress) {
4220 		return;
4221 	}
4222 
4223 	bdev->internal.period = bdev->internal.new_period;
4224 
4225 	spdk_poller_unregister(&bdev->internal.qd_poller);
4226 	if (bdev->internal.period != 0) {
4227 		bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4228 					   bdev, bdev->internal.period);
4229 	} else {
4230 		spdk_bdev_close(bdev->internal.qd_desc);
4231 		bdev->internal.qd_desc = NULL;
4232 	}
4233 }
4234 
4235 static void
4236 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
4237 {
4238 	SPDK_NOTICELOG("Unexpected event type: %d\n", type);
4239 }
4240 
4241 void
4242 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
4243 {
4244 	int rc;
4245 
4246 	if (bdev->internal.new_period == period) {
4247 		return;
4248 	}
4249 
4250 	bdev->internal.new_period = period;
4251 
4252 	if (bdev->internal.qd_desc != NULL) {
4253 		assert(bdev->internal.period != 0);
4254 
4255 		spdk_thread_send_msg(bdev->internal.qd_desc->thread,
4256 				     bdev_update_qd_sampling_period, bdev);
4257 		return;
4258 	}
4259 
4260 	assert(bdev->internal.period == 0);
4261 
4262 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
4263 				NULL, &bdev->internal.qd_desc);
4264 	if (rc != 0) {
4265 		return;
4266 	}
4267 
4268 	bdev->internal.period = period;
4269 	bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4270 				   bdev, period);
4271 }
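
/*
 * Illustrative usage sketch (not part of this file's logic): the period is in
 * microseconds, matching the poller registration above.
 *
 *	// sample the queue depth once every millisecond
 *	spdk_bdev_set_qd_sampling_period(bdev, 1000);
 *
 *	// disable sampling again and release the internal descriptor
 *	spdk_bdev_set_qd_sampling_period(bdev, 0);
 */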
4272 
4273 struct bdev_get_current_qd_ctx {
4274 	uint64_t current_qd;
4275 	spdk_bdev_get_current_qd_cb cb_fn;
4276 	void *cb_arg;
4277 };
4278 
4279 static void
4280 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
4281 {
4282 	struct bdev_get_current_qd_ctx *ctx = _ctx;
4283 
4284 	ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
4285 
4286 	free(ctx);
4287 }
4288 
4289 static void
4290 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4291 		    struct spdk_io_channel *io_ch, void *_ctx)
4292 {
4293 	struct bdev_get_current_qd_ctx *ctx = _ctx;
4294 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4295 
4296 	ctx->current_qd += bdev_ch->io_outstanding;
4297 
4298 	spdk_bdev_for_each_channel_continue(i, 0);
4299 }
4300 
4301 void
4302 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
4303 			 void *cb_arg)
4304 {
4305 	struct bdev_get_current_qd_ctx *ctx;
4306 
4307 	assert(cb_fn != NULL);
4308 
4309 	ctx = calloc(1, sizeof(*ctx));
4310 	if (ctx == NULL) {
4311 		cb_fn(bdev, 0, cb_arg, -ENOMEM);
4312 		return;
4313 	}
4314 
4315 	ctx->cb_fn = cb_fn;
4316 	ctx->cb_arg = cb_arg;
4317 
4318 	spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
4319 }
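
/*
 * Illustrative usage sketch (not part of this file's logic): querying the
 * current queue depth across all channels.  The callback name is hypothetical.
 *
 *	static void
 *	my_qd_cb(struct spdk_bdev *bdev, uint64_t qd, void *cb_arg, int rc)
 *	{
 *		if (rc == 0) {
 *			printf("%s: %" PRIu64 " I/O outstanding\n",
 *			       spdk_bdev_get_name(bdev), qd);
 *		}
 *	}
 *
 *	spdk_bdev_get_current_qd(bdev, my_qd_cb, NULL);
 */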
4320 
4321 static void
4322 _resize_notify(void *arg)
4323 {
4324 	struct spdk_bdev_desc *desc = arg;
4325 
4326 	spdk_spin_lock(&desc->spinlock);
4327 	desc->refs--;
4328 	if (!desc->closed) {
4329 		spdk_spin_unlock(&desc->spinlock);
4330 		desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE,
4331 					desc->bdev,
4332 					desc->callback.ctx);
4333 		return;
4334 	} else if (0 == desc->refs) {
4335 		/* This descriptor was closed after this resize_notify message was sent.
4336 		 * spdk_bdev_close() could not free the descriptor since this message was
4337 		 * in flight, so we free it now using bdev_desc_free().
4338 		 */
4339 		spdk_spin_unlock(&desc->spinlock);
4340 		bdev_desc_free(desc);
4341 		return;
4342 	}
4343 	spdk_spin_unlock(&desc->spinlock);
4344 }
4345 
4346 int
4347 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
4348 {
4349 	struct spdk_bdev_desc *desc;
4350 	int ret;
4351 
4352 	if (size == bdev->blockcnt) {
4353 		return 0;
4354 	}
4355 
4356 	spdk_spin_lock(&bdev->internal.spinlock);
4357 
4358 	/* bdev has open descriptors */
4359 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
4360 	    bdev->blockcnt > size) {
4361 		ret = -EBUSY;
4362 	} else {
4363 		bdev->blockcnt = size;
4364 		TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
4365 			spdk_spin_lock(&desc->spinlock);
4366 			if (!desc->closed) {
4367 				desc->refs++;
4368 				spdk_thread_send_msg(desc->thread, _resize_notify, desc);
4369 			}
4370 			spdk_spin_unlock(&desc->spinlock);
4371 		}
4372 		ret = 0;
4373 	}
4374 
4375 	spdk_spin_unlock(&bdev->internal.spinlock);
4376 
4377 	return ret;
4378 }
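
/*
 * Illustrative usage note (not part of this file's logic): bdev modules call
 * spdk_bdev_notify_blockcnt_change() after the backing device grows or shrinks,
 * e.g.
 *
 *	rc = spdk_bdev_notify_blockcnt_change(bdev, new_block_count);
 *
 * Shrinking fails with -EBUSY while descriptors are open; otherwise each open
 * descriptor is notified with SPDK_BDEV_EVENT_RESIZE on its own thread.
 */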
4379 
4380 /*
4381  * Convert I/O offset and length from bytes to blocks.
4382  *
4383  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
4384  */
4385 static uint64_t
4386 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
4387 		     uint64_t num_bytes, uint64_t *num_blocks)
4388 {
4389 	uint32_t block_size = bdev->blocklen;
4390 	uint8_t shift_cnt;
4391 
4392 	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
4393 	if (spdk_likely(spdk_u32_is_pow2(block_size))) {
4394 		shift_cnt = spdk_u32log2(block_size);
4395 		*offset_blocks = offset_bytes >> shift_cnt;
4396 		*num_blocks = num_bytes >> shift_cnt;
4397 		return (offset_bytes - (*offset_blocks << shift_cnt)) |
4398 		       (num_bytes - (*num_blocks << shift_cnt));
4399 	} else {
4400 		*offset_blocks = offset_bytes / block_size;
4401 		*num_blocks = num_bytes / block_size;
4402 		return (offset_bytes % block_size) | (num_bytes % block_size);
4403 	}
4404 }
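
/*
 * Worked example (illustrative): with a 512-byte block size, offset_bytes 4096
 * and num_bytes 8192 yield offset_blocks 8, num_blocks 16 and a return value of
 * 0, whereas num_bytes 8191 leaves a non-zero remainder and the byte-based
 * wrappers below reject the request with -EINVAL.
 */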
4405 
4406 static bool
4407 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
4408 {
4409 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
4410 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; this
4411 	 * indicates an overflow, i.e. the offset has wrapped around. */
4412 		return false;
4413 	}
4414 
4415 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
4416 	if (offset_blocks + num_blocks > bdev->blockcnt) {
4417 		return false;
4418 	}
4419 
4420 	return true;
4421 }
4422 
4423 static void
4424 bdev_seek_complete_cb(void *ctx)
4425 {
4426 	struct spdk_bdev_io *bdev_io = ctx;
4427 
4428 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
4429 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
4430 }
4431 
4432 static int
4433 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4434 	  uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
4435 	  spdk_bdev_io_completion_cb cb, void *cb_arg)
4436 {
4437 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4438 	struct spdk_bdev_io *bdev_io;
4439 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
4440 
4441 	assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
4442 
4443 	/* Check if offset_blocks is valid looking at the validity of one block */
4444 	/* Check that offset_blocks is valid by validating one block at that offset */
4445 		return -EINVAL;
4446 	}
4447 
4448 	bdev_io = bdev_channel_get_io(channel);
4449 	if (!bdev_io) {
4450 		return -ENOMEM;
4451 	}
4452 
4453 	bdev_io->internal.ch = channel;
4454 	bdev_io->internal.desc = desc;
4455 	bdev_io->type = io_type;
4456 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4457 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4458 
4459 	if (!spdk_bdev_io_type_supported(bdev, io_type)) {
4460 		/* In case bdev doesn't support seek to next data/hole offset,
4461 		 * it is assumed that only data and no holes are present */
4462 		if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
4463 			bdev_io->u.bdev.seek.offset = offset_blocks;
4464 		} else {
4465 			bdev_io->u.bdev.seek.offset = UINT64_MAX;
4466 		}
4467 
4468 		spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
4469 		return 0;
4470 	}
4471 
4472 	bdev_io_submit(bdev_io);
4473 	return 0;
4474 }
4475 
4476 int
4477 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4478 		    uint64_t offset_blocks,
4479 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
4480 {
4481 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
4482 }
4483 
4484 int
4485 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4486 		    uint64_t offset_blocks,
4487 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
4488 {
4489 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
4490 }
4491 
4492 uint64_t
4493 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
4494 {
4495 	return bdev_io->u.bdev.seek.offset;
4496 }
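
/*
 * Illustrative usage sketch (not part of this file's logic): seeking to the
 * next data region and reading the result in the completion callback.  The
 * callback name is hypothetical; UINT64_MAX is assumed to mean nothing was
 * found.
 *
 *	static void
 *	my_seek_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		if (success) {
 *			printf("next data at block %" PRIu64 "\n",
 *			       spdk_bdev_io_get_seek_offset(bdev_io));
 *		}
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	spdk_bdev_seek_data(desc, ch, 0, my_seek_cb, NULL);
 */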
4497 
4498 static int
4499 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
4500 			 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4501 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
4502 {
4503 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4504 	struct spdk_bdev_io *bdev_io;
4505 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
4506 
4507 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4508 		return -EINVAL;
4509 	}
4510 
4511 	bdev_io = bdev_channel_get_io(channel);
4512 	if (!bdev_io) {
4513 		return -ENOMEM;
4514 	}
4515 
4516 	bdev_io->internal.ch = channel;
4517 	bdev_io->internal.desc = desc;
4518 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
4519 	bdev_io->u.bdev.iovs = &bdev_io->iov;
4520 	bdev_io->u.bdev.iovs[0].iov_base = buf;
4521 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
4522 	bdev_io->u.bdev.iovcnt = 1;
4523 	bdev_io->u.bdev.md_buf = md_buf;
4524 	bdev_io->u.bdev.num_blocks = num_blocks;
4525 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4526 	bdev_io->u.bdev.ext_opts = NULL;
4527 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4528 
4529 	bdev_io_submit(bdev_io);
4530 	return 0;
4531 }
4532 
4533 int
4534 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4535 	       void *buf, uint64_t offset, uint64_t nbytes,
4536 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
4537 {
4538 	uint64_t offset_blocks, num_blocks;
4539 
4540 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4541 				 nbytes, &num_blocks) != 0) {
4542 		return -EINVAL;
4543 	}
4544 
4545 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
4546 }
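
/*
 * Illustrative usage sketch (not part of this file's logic): a byte-based read
 * into a DMA-safe buffer.  The descriptor and channel are assumed to come from
 * spdk_bdev_open_ext() and spdk_bdev_get_io_channel(), the length is assumed to
 * be a multiple of the block size, and the callback name is hypothetical.
 *
 *	void *buf = spdk_dma_zmalloc(4096, spdk_bdev_get_buf_align(bdev), NULL);
 *	int rc;
 *
 *	rc = spdk_bdev_read(desc, ch, buf, 0, 4096, my_read_cb, NULL);
 *	if (rc == -ENOMEM) {
 *		// retry later, e.g. via spdk_bdev_queue_io_wait()
 *	}
 */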
4547 
4548 int
4549 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4550 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
4551 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
4552 {
4553 	return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
4554 }
4555 
4556 int
4557 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4558 			      void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4559 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
4560 {
4561 	struct iovec iov = {
4562 		.iov_base = buf,
4563 	};
4564 
4565 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4566 		return -EINVAL;
4567 	}
4568 
4569 	if (md_buf && !_is_buf_allocated(&iov)) {
4570 		return -EINVAL;
4571 	}
4572 
4573 	return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
4574 					cb, cb_arg);
4575 }
4576 
4577 int
4578 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4579 		struct iovec *iov, int iovcnt,
4580 		uint64_t offset, uint64_t nbytes,
4581 		spdk_bdev_io_completion_cb cb, void *cb_arg)
4582 {
4583 	uint64_t offset_blocks, num_blocks;
4584 
4585 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4586 				 nbytes, &num_blocks) != 0) {
4587 		return -EINVAL;
4588 	}
4589 
4590 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
4591 }
4592 
4593 static int
4594 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4595 			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
4596 			  uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
4597 			  struct spdk_bdev_ext_io_opts *opts, bool copy_opts)
4598 {
4599 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4600 	struct spdk_bdev_io *bdev_io;
4601 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
4602 
4603 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4604 		return -EINVAL;
4605 	}
4606 
4607 	bdev_io = bdev_channel_get_io(channel);
4608 	if (!bdev_io) {
4609 		return -ENOMEM;
4610 	}
4611 
4612 	bdev_io->internal.ch = channel;
4613 	bdev_io->internal.desc = desc;
4614 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
4615 	bdev_io->u.bdev.iovs = iov;
4616 	bdev_io->u.bdev.iovcnt = iovcnt;
4617 	bdev_io->u.bdev.md_buf = md_buf;
4618 	bdev_io->u.bdev.num_blocks = num_blocks;
4619 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4620 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4621 	bdev_io->internal.ext_opts = opts;
4622 	bdev_io->u.bdev.ext_opts = opts;
4623 
4624 	_bdev_io_submit_ext(desc, bdev_io, opts, copy_opts);
4625 
4626 	return 0;
4627 }
4628 
4629 int
4630 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4631 		       struct iovec *iov, int iovcnt,
4632 		       uint64_t offset_blocks, uint64_t num_blocks,
4633 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
4634 {
4635 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
4636 					 num_blocks, cb, cb_arg, NULL, false);
4637 }
4638 
4639 int
4640 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4641 			       struct iovec *iov, int iovcnt, void *md_buf,
4642 			       uint64_t offset_blocks, uint64_t num_blocks,
4643 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
4644 {
4645 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4646 		return -EINVAL;
4647 	}
4648 
4649 	if (md_buf && !_is_buf_allocated(iov)) {
4650 		return -EINVAL;
4651 	}
4652 
4653 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
4654 					 num_blocks, cb, cb_arg, NULL, false);
4655 }
4656 
4657 static inline bool
4658 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
4659 {
4660 	/*
4661 	 * We check that the opts size is at least the size it had when we first
4662 	 * introduced spdk_bdev_ext_io_opts (ac6f2bdd8d), since access to those
4663 	 * members is not otherwise checked internally.
4664 	 */
4665 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
4666 	       sizeof(opts->metadata) &&
4667 	       opts->size <= sizeof(*opts) &&
4668 	       /* When memory domain is used, the user must provide data buffers */
4669 	       (!opts->memory_domain || (iov && iov[0].iov_base));
4670 }
4671 
4672 int
4673 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4674 			   struct iovec *iov, int iovcnt,
4675 			   uint64_t offset_blocks, uint64_t num_blocks,
4676 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
4677 			   struct spdk_bdev_ext_io_opts *opts)
4678 {
4679 	void *md = NULL;
4680 
4681 	if (opts) {
4682 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
4683 			return -EINVAL;
4684 		}
4685 		md = opts->metadata;
4686 	}
4687 
4688 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4689 		return -EINVAL;
4690 	}
4691 
4692 	if (md && !_is_buf_allocated(iov)) {
4693 		return -EINVAL;
4694 	}
4695 
4696 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
4697 					 num_blocks, cb, cb_arg, opts, false);
4698 }
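
/*
 * Illustrative usage sketch (not part of this file's logic): issuing an ext
 * read with separate metadata.  The size member must be filled in so that
 * _bdev_io_check_opts() above can tell which members are valid; md_buf and the
 * callback name are caller-provided assumptions.
 *
 *	struct spdk_bdev_ext_io_opts opts = { 0 };
 *
 *	opts.size = sizeof(opts);
 *	opts.metadata = md_buf;
 *
 *	rc = spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
 *					num_blocks, my_read_cb, NULL, &opts);
 */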
4699 
4700 static int
4701 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4702 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4703 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
4704 {
4705 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4706 	struct spdk_bdev_io *bdev_io;
4707 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
4708 
4709 	if (!desc->write) {
4710 		return -EBADF;
4711 	}
4712 
4713 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4714 		return -EINVAL;
4715 	}
4716 
4717 	bdev_io = bdev_channel_get_io(channel);
4718 	if (!bdev_io) {
4719 		return -ENOMEM;
4720 	}
4721 
4722 	bdev_io->internal.ch = channel;
4723 	bdev_io->internal.desc = desc;
4724 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
4725 	bdev_io->u.bdev.iovs = &bdev_io->iov;
4726 	bdev_io->u.bdev.iovs[0].iov_base = buf;
4727 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
4728 	bdev_io->u.bdev.iovcnt = 1;
4729 	bdev_io->u.bdev.md_buf = md_buf;
4730 	bdev_io->u.bdev.num_blocks = num_blocks;
4731 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4732 	bdev_io->u.bdev.ext_opts = NULL;
4733 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4734 
4735 	bdev_io_submit(bdev_io);
4736 	return 0;
4737 }
4738 
4739 int
4740 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4741 		void *buf, uint64_t offset, uint64_t nbytes,
4742 		spdk_bdev_io_completion_cb cb, void *cb_arg)
4743 {
4744 	uint64_t offset_blocks, num_blocks;
4745 
4746 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4747 				 nbytes, &num_blocks) != 0) {
4748 		return -EINVAL;
4749 	}
4750 
4751 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
4752 }
4753 
4754 int
4755 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4756 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
4757 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
4758 {
4759 	return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
4760 					 cb, cb_arg);
4761 }
4762 
4763 int
4764 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4765 			       void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
4766 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
4767 {
4768 	struct iovec iov = {
4769 		.iov_base = buf,
4770 	};
4771 
4772 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4773 		return -EINVAL;
4774 	}
4775 
4776 	if (md_buf && !_is_buf_allocated(&iov)) {
4777 		return -EINVAL;
4778 	}
4779 
4780 	return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
4781 					 cb, cb_arg);
4782 }
4783 
4784 static int
4785 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4786 			   struct iovec *iov, int iovcnt, void *md_buf,
4787 			   uint64_t offset_blocks, uint64_t num_blocks,
4788 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
4789 			   struct spdk_bdev_ext_io_opts *opts, bool copy_opts)
4790 {
4791 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4792 	struct spdk_bdev_io *bdev_io;
4793 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
4794 
4795 	if (!desc->write) {
4796 		return -EBADF;
4797 	}
4798 
4799 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4800 		return -EINVAL;
4801 	}
4802 
4803 	bdev_io = bdev_channel_get_io(channel);
4804 	if (!bdev_io) {
4805 		return -ENOMEM;
4806 	}
4807 
4808 	bdev_io->internal.ch = channel;
4809 	bdev_io->internal.desc = desc;
4810 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
4811 	bdev_io->u.bdev.iovs = iov;
4812 	bdev_io->u.bdev.iovcnt = iovcnt;
4813 	bdev_io->u.bdev.md_buf = md_buf;
4814 	bdev_io->u.bdev.num_blocks = num_blocks;
4815 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4816 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4817 	bdev_io->internal.ext_opts = opts;
4818 	bdev_io->u.bdev.ext_opts = opts;
4819 
4820 	_bdev_io_submit_ext(desc, bdev_io, opts, copy_opts);
4821 
4822 	return 0;
4823 }
4824 
4825 int
4826 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4827 		 struct iovec *iov, int iovcnt,
4828 		 uint64_t offset, uint64_t len,
4829 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
4830 {
4831 	uint64_t offset_blocks, num_blocks;
4832 
4833 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
4834 				 len, &num_blocks) != 0) {
4835 		return -EINVAL;
4836 	}
4837 
4838 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
4839 }
4840 
4841 int
4842 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4843 			struct iovec *iov, int iovcnt,
4844 			uint64_t offset_blocks, uint64_t num_blocks,
4845 			spdk_bdev_io_completion_cb cb, void *cb_arg)
4846 {
4847 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
4848 					  num_blocks, cb, cb_arg, NULL, false);
4849 }
4850 
4851 int
4852 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4853 				struct iovec *iov, int iovcnt, void *md_buf,
4854 				uint64_t offset_blocks, uint64_t num_blocks,
4855 				spdk_bdev_io_completion_cb cb, void *cb_arg)
4856 {
4857 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4858 		return -EINVAL;
4859 	}
4860 
4861 	if (md_buf && !_is_buf_allocated(iov)) {
4862 		return -EINVAL;
4863 	}
4864 
4865 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
4866 					  num_blocks, cb, cb_arg, NULL, false);
4867 }
4868 
4869 int
4870 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4871 			    struct iovec *iov, int iovcnt,
4872 			    uint64_t offset_blocks, uint64_t num_blocks,
4873 			    spdk_bdev_io_completion_cb cb, void *cb_arg,
4874 			    struct spdk_bdev_ext_io_opts *opts)
4875 {
4876 	void *md = NULL;
4877 
4878 	if (opts) {
4879 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
4880 			return -EINVAL;
4881 		}
4882 		md = opts->metadata;
4883 	}
4884 
4885 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4886 		return -EINVAL;
4887 	}
4888 
4889 	if (md && !_is_buf_allocated(iov)) {
4890 		return -EINVAL;
4891 	}
4892 
4893 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
4894 					  num_blocks, cb, cb_arg, opts, false);
4895 }
4896 
4897 static void
4898 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
4899 {
4900 	struct spdk_bdev_io *parent_io = cb_arg;
4901 	struct spdk_bdev *bdev = parent_io->bdev;
4902 	uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
4903 	int i, rc = 0;
4904 
4905 	if (!success) {
4906 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
4907 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
4908 		spdk_bdev_free_io(bdev_io);
4909 		return;
4910 	}
4911 
4912 	for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
4913 		rc = memcmp(read_buf,
4914 			    parent_io->u.bdev.iovs[i].iov_base,
4915 			    parent_io->u.bdev.iovs[i].iov_len);
4916 		if (rc) {
4917 			break;
4918 		}
4919 		read_buf += parent_io->u.bdev.iovs[i].iov_len;
4920 	}
4921 
4922 	if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
4923 		rc = memcmp(bdev_io->u.bdev.md_buf,
4924 			    parent_io->u.bdev.md_buf,
4925 			    spdk_bdev_get_md_size(bdev));
4926 	}
4927 
4928 	spdk_bdev_free_io(bdev_io);
4929 
4930 	if (rc == 0) {
4931 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
4932 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
4933 	} else {
4934 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
4935 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
4936 	}
4937 }
4938 
4939 static void
4940 bdev_compare_do_read(void *_bdev_io)
4941 {
4942 	struct spdk_bdev_io *bdev_io = _bdev_io;
4943 	int rc;
4944 
4945 	rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
4946 				   spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
4947 				   bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
4948 				   bdev_compare_do_read_done, bdev_io);
4949 
4950 	if (rc == -ENOMEM) {
4951 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
4952 	} else if (rc != 0) {
4953 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
4954 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
4955 	}
4956 }
4957 
4958 static int
4959 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4960 			     struct iovec *iov, int iovcnt, void *md_buf,
4961 			     uint64_t offset_blocks, uint64_t num_blocks,
4962 			     spdk_bdev_io_completion_cb cb, void *cb_arg)
4963 {
4964 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4965 	struct spdk_bdev_io *bdev_io;
4966 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
4967 
4968 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4969 		return -EINVAL;
4970 	}
4971 
4972 	bdev_io = bdev_channel_get_io(channel);
4973 	if (!bdev_io) {
4974 		return -ENOMEM;
4975 	}
4976 
4977 	bdev_io->internal.ch = channel;
4978 	bdev_io->internal.desc = desc;
4979 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
4980 	bdev_io->u.bdev.iovs = iov;
4981 	bdev_io->u.bdev.iovcnt = iovcnt;
4982 	bdev_io->u.bdev.md_buf = md_buf;
4983 	bdev_io->u.bdev.num_blocks = num_blocks;
4984 	bdev_io->u.bdev.offset_blocks = offset_blocks;
4985 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
4986 	bdev_io->u.bdev.ext_opts = NULL;
4987 
4988 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
4989 		bdev_io_submit(bdev_io);
4990 		return 0;
4991 	}
4992 
4993 	bdev_compare_do_read(bdev_io);
4994 
4995 	return 0;
4996 }
4997 
4998 int
4999 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5000 			  struct iovec *iov, int iovcnt,
5001 			  uint64_t offset_blocks, uint64_t num_blocks,
5002 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5003 {
5004 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5005 					    num_blocks, cb, cb_arg);
5006 }
5007 
5008 int
5009 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5010 				  struct iovec *iov, int iovcnt, void *md_buf,
5011 				  uint64_t offset_blocks, uint64_t num_blocks,
5012 				  spdk_bdev_io_completion_cb cb, void *cb_arg)
5013 {
5014 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5015 		return -EINVAL;
5016 	}
5017 
5018 	if (md_buf && !_is_buf_allocated(iov)) {
5019 		return -EINVAL;
5020 	}
5021 
5022 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5023 					    num_blocks, cb, cb_arg);
5024 }
5025 
5026 static int
5027 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5028 			    void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5029 			    spdk_bdev_io_completion_cb cb, void *cb_arg)
5030 {
5031 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5032 	struct spdk_bdev_io *bdev_io;
5033 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5034 
5035 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5036 		return -EINVAL;
5037 	}
5038 
5039 	bdev_io = bdev_channel_get_io(channel);
5040 	if (!bdev_io) {
5041 		return -ENOMEM;
5042 	}
5043 
5044 	bdev_io->internal.ch = channel;
5045 	bdev_io->internal.desc = desc;
5046 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5047 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5048 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5049 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5050 	bdev_io->u.bdev.iovcnt = 1;
5051 	bdev_io->u.bdev.md_buf = md_buf;
5052 	bdev_io->u.bdev.num_blocks = num_blocks;
5053 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5054 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5055 	bdev_io->u.bdev.ext_opts = NULL;
5056 
5057 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5058 		bdev_io_submit(bdev_io);
5059 		return 0;
5060 	}
5061 
5062 	bdev_compare_do_read(bdev_io);
5063 
5064 	return 0;
5065 }
5066 
5067 int
5068 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5069 			 void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5070 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5071 {
5072 	return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5073 					   cb, cb_arg);
5074 }
5075 
5076 int
5077 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5078 				 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5079 				 spdk_bdev_io_completion_cb cb, void *cb_arg)
5080 {
5081 	struct iovec iov = {
5082 		.iov_base = buf,
5083 	};
5084 
5085 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5086 		return -EINVAL;
5087 	}
5088 
5089 	if (md_buf && !_is_buf_allocated(&iov)) {
5090 		return -EINVAL;
5091 	}
5092 
5093 	return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5094 					   cb, cb_arg);
5095 }
5096 
5097 static void
5098 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status)
5099 {
5100 	struct spdk_bdev_io *bdev_io = ctx;
5101 
5102 	if (unlock_status) {
5103 		SPDK_ERRLOG("LBA range unlock failed\n");
5104 	}
5105 
5106 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
5107 			     false, bdev_io->internal.caller_ctx);
5108 }
5109 
5110 static void
5111 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
5112 {
5113 	bdev_io->internal.status = status;
5114 
5115 	bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
5116 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5117 			      bdev_comparev_and_writev_blocks_unlocked, bdev_io);
5118 }
5119 
5120 static void
5121 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5122 {
5123 	struct spdk_bdev_io *parent_io = cb_arg;
5124 
5125 	if (!success) {
5126 		SPDK_ERRLOG("Compare and write operation failed\n");
5127 	}
5128 
5129 	spdk_bdev_free_io(bdev_io);
5130 
5131 	bdev_comparev_and_writev_blocks_unlock(parent_io,
5132 					       success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
5133 }
5134 
5135 static void
5136 bdev_compare_and_write_do_write(void *_bdev_io)
5137 {
5138 	struct spdk_bdev_io *bdev_io = _bdev_io;
5139 	int rc;
5140 
5141 	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
5142 				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
5143 				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
5144 				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5145 				     bdev_compare_and_write_do_write_done, bdev_io);
5146 
5147 
5148 	if (rc == -ENOMEM) {
5149 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
5150 	} else if (rc != 0) {
5151 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
5152 	}
5153 }
5154 
5155 static void
5156 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5157 {
5158 	struct spdk_bdev_io *parent_io = cb_arg;
5159 
5160 	spdk_bdev_free_io(bdev_io);
5161 
5162 	if (!success) {
5163 		bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
5164 		return;
5165 	}
5166 
5167 	bdev_compare_and_write_do_write(parent_io);
5168 }
5169 
5170 static void
5171 bdev_compare_and_write_do_compare(void *_bdev_io)
5172 {
5173 	struct spdk_bdev_io *bdev_io = _bdev_io;
5174 	int rc;
5175 
5176 	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
5177 				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
5178 				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5179 				       bdev_compare_and_write_do_compare_done, bdev_io);
5180 
5181 	if (rc == -ENOMEM) {
5182 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
5183 	} else if (rc != 0) {
5184 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
5185 	}
5186 }
5187 
5188 static void
5189 bdev_comparev_and_writev_blocks_locked(void *ctx, int status)
5190 {
5191 	struct spdk_bdev_io *bdev_io = ctx;
5192 
5193 	if (status) {
5194 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
5195 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5196 		return;
5197 	}
5198 
5199 	bdev_compare_and_write_do_compare(bdev_io);
5200 }
5201 
5202 int
5203 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5204 				     struct iovec *compare_iov, int compare_iovcnt,
5205 				     struct iovec *write_iov, int write_iovcnt,
5206 				     uint64_t offset_blocks, uint64_t num_blocks,
5207 				     spdk_bdev_io_completion_cb cb, void *cb_arg)
5208 {
5209 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5210 	struct spdk_bdev_io *bdev_io;
5211 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5212 
5213 	if (!desc->write) {
5214 		return -EBADF;
5215 	}
5216 
5217 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5218 		return -EINVAL;
5219 	}
5220 
5221 	if (num_blocks > bdev->acwu) {
5222 		return -EINVAL;
5223 	}
5224 
5225 	bdev_io = bdev_channel_get_io(channel);
5226 	if (!bdev_io) {
5227 		return -ENOMEM;
5228 	}
5229 
5230 	bdev_io->internal.ch = channel;
5231 	bdev_io->internal.desc = desc;
5232 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
5233 	bdev_io->u.bdev.iovs = compare_iov;
5234 	bdev_io->u.bdev.iovcnt = compare_iovcnt;
5235 	bdev_io->u.bdev.fused_iovs = write_iov;
5236 	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
5237 	bdev_io->u.bdev.md_buf = NULL;
5238 	bdev_io->u.bdev.num_blocks = num_blocks;
5239 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5240 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5241 	bdev_io->u.bdev.ext_opts = NULL;
5242 
5243 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
5244 		bdev_io_submit(bdev_io);
5245 		return 0;
5246 	}
5247 
5248 	return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
5249 				   bdev_comparev_and_writev_blocks_locked, bdev_io);
5250 }
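
/*
 * Illustrative usage note (not part of this file's logic): callers are expected
 * to keep num_blocks within spdk_bdev_get_acwu(bdev).  When the emulated path
 * above is taken, the LBA range stays locked for the duration of the fused
 * operation and a miscompare completes the I/O with
 * SPDK_BDEV_IO_STATUS_MISCOMPARE.
 */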
5251 
5252 int
5253 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5254 		      struct iovec *iov, int iovcnt,
5255 		      uint64_t offset_blocks, uint64_t num_blocks,
5256 		      bool populate,
5257 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5258 {
5259 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5260 	struct spdk_bdev_io *bdev_io;
5261 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5262 
5263 	if (!desc->write) {
5264 		return -EBADF;
5265 	}
5266 
5267 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5268 		return -EINVAL;
5269 	}
5270 
5271 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
5272 		return -ENOTSUP;
5273 	}
5274 
5275 	bdev_io = bdev_channel_get_io(channel);
5276 	if (!bdev_io) {
5277 		return -ENOMEM;
5278 	}
5279 
5280 	bdev_io->internal.ch = channel;
5281 	bdev_io->internal.desc = desc;
5282 	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
5283 	bdev_io->u.bdev.num_blocks = num_blocks;
5284 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5285 	bdev_io->u.bdev.iovs = iov;
5286 	bdev_io->u.bdev.iovcnt = iovcnt;
5287 	bdev_io->u.bdev.md_buf = NULL;
5288 	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
5289 	bdev_io->u.bdev.zcopy.commit = 0;
5290 	bdev_io->u.bdev.zcopy.start = 1;
5291 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5292 	bdev_io->u.bdev.ext_opts = NULL;
5293 
5294 	bdev_io_submit(bdev_io);
5295 
5296 	return 0;
5297 }
5298 
5299 int
5300 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
5301 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5302 {
5303 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
5304 		return -EINVAL;
5305 	}
5306 
5307 	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
5308 	bdev_io->u.bdev.zcopy.start = 0;
5309 	bdev_io->internal.caller_ctx = cb_arg;
5310 	bdev_io->internal.cb = cb;
5311 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
5312 
5313 	bdev_io_submit(bdev_io);
5314 
5315 	return 0;
5316 }
5317 
5318 int
5319 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5320 		       uint64_t offset, uint64_t len,
5321 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5322 {
5323 	uint64_t offset_blocks, num_blocks;
5324 
5325 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5326 				 len, &num_blocks) != 0) {
5327 		return -EINVAL;
5328 	}
5329 
5330 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5331 }
5332 
5333 int
5334 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5335 			      uint64_t offset_blocks, uint64_t num_blocks,
5336 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5337 {
5338 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5339 	struct spdk_bdev_io *bdev_io;
5340 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5341 
5342 	if (!desc->write) {
5343 		return -EBADF;
5344 	}
5345 
5346 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5347 		return -EINVAL;
5348 	}
5349 
5350 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
5351 	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
5352 		return -ENOTSUP;
5353 	}
5354 
5355 	bdev_io = bdev_channel_get_io(channel);
5356 
5357 	if (!bdev_io) {
5358 		return -ENOMEM;
5359 	}
5360 
5361 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
5362 	bdev_io->internal.ch = channel;
5363 	bdev_io->internal.desc = desc;
5364 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5365 	bdev_io->u.bdev.num_blocks = num_blocks;
5366 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5367 	bdev_io->u.bdev.ext_opts = NULL;
5368 
5369 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
5370 		bdev_io_submit(bdev_io);
5371 		return 0;
5372 	}
5373 
5374 	assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE));
5375 	assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
5376 	bdev_io->u.bdev.split_remaining_num_blocks = num_blocks;
5377 	bdev_io->u.bdev.split_current_offset_blocks = offset_blocks;
5378 	bdev_write_zero_buffer_next(bdev_io);
5379 
5380 	return 0;
5381 }
5382 
5383 int
5384 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5385 		uint64_t offset, uint64_t nbytes,
5386 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5387 {
5388 	uint64_t offset_blocks, num_blocks;
5389 
5390 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5391 				 nbytes, &num_blocks) != 0) {
5392 		return -EINVAL;
5393 	}
5394 
5395 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5396 }
5397 
5398 int
5399 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5400 		       uint64_t offset_blocks, uint64_t num_blocks,
5401 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5402 {
5403 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5404 	struct spdk_bdev_io *bdev_io;
5405 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5406 
5407 	if (!desc->write) {
5408 		return -EBADF;
5409 	}
5410 
5411 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5412 		return -EINVAL;
5413 	}
5414 
5415 	if (num_blocks == 0) {
5416 		SPDK_ERRLOG("Can't unmap 0 bytes\n");
5417 		return -EINVAL;
5418 	}
5419 
5420 	bdev_io = bdev_channel_get_io(channel);
5421 	if (!bdev_io) {
5422 		return -ENOMEM;
5423 	}
5424 
5425 	bdev_io->internal.ch = channel;
5426 	bdev_io->internal.desc = desc;
5427 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
5428 
5429 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5430 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
5431 	bdev_io->u.bdev.iovs[0].iov_len = 0;
5432 	bdev_io->u.bdev.iovcnt = 1;
5433 
5434 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5435 	bdev_io->u.bdev.num_blocks = num_blocks;
5436 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5437 	bdev_io->u.bdev.ext_opts = NULL;
5438 
5439 	bdev_io_submit(bdev_io);
5440 	return 0;
5441 }
5442 
5443 int
5444 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5445 		uint64_t offset, uint64_t length,
5446 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5447 {
5448 	uint64_t offset_blocks, num_blocks;
5449 
5450 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5451 				 length, &num_blocks) != 0) {
5452 		return -EINVAL;
5453 	}
5454 
5455 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5456 }
5457 
5458 int
5459 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5460 		       uint64_t offset_blocks, uint64_t num_blocks,
5461 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5462 {
5463 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5464 	struct spdk_bdev_io *bdev_io;
5465 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5466 
5467 	if (!desc->write) {
5468 		return -EBADF;
5469 	}
5470 
5471 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5472 		return -EINVAL;
5473 	}
5474 
5475 	bdev_io = bdev_channel_get_io(channel);
5476 	if (!bdev_io) {
5477 		return -ENOMEM;
5478 	}
5479 
5480 	bdev_io->internal.ch = channel;
5481 	bdev_io->internal.desc = desc;
5482 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
5483 	bdev_io->u.bdev.iovs = NULL;
5484 	bdev_io->u.bdev.iovcnt = 0;
5485 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5486 	bdev_io->u.bdev.num_blocks = num_blocks;
5487 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5488 
5489 	bdev_io_submit(bdev_io);
5490 	return 0;
5491 }
5492 
5493 static int bdev_reset_poll_for_outstanding_io(void *ctx);
5494 
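/* Reset handling when reset_io_drain_timeout is non-zero: after the channels are
 * frozen, the functions below poll for outstanding I/O at the interval given by
 * BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD.  If all I/O drains before the timeout, the
 * reset is skipped and completed successfully; if I/O is still outstanding once the
 * timeout expires, the reset is submitted to the underlying module.
 */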
5495 static void
5496 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
5497 {
5498 	struct spdk_bdev_channel *ch = _ctx;
5499 	struct spdk_bdev_io *bdev_io;
5500 
5501 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
5502 
5503 	if (status == -EBUSY) {
5504 		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
5505 			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
5506 							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
5507 		} else {
5508 			/* If outstanding IOs are still present and reset_io_drain_timeout seconds passed,
5509 			 * start the reset. */
5510 			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
5511 			bdev_io_submit_reset(bdev_io);
5512 		}
5513 	} else {
5514 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
5515 		SPDK_DEBUGLOG(bdev,
5516 			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
5517 			      ch->bdev->name);
5518 		/* Mark the completion status as a SUCCESS and complete the reset. */
5519 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
5520 	}
5521 }
5522 
5523 static void
5524 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5525 				struct spdk_io_channel *io_ch, void *_ctx)
5526 {
5527 	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
5528 	int status = 0;
5529 
5530 	if (cur_ch->io_outstanding > 0) {
5531 		/* If a channel has outstanding IO, set status to -EBUSY. This stops further
5532 		 * iteration over the rest of the channels and passes a non-zero status to
5533 		 * the callback function. */
5534 		status = -EBUSY;
5535 	}
5536 	spdk_bdev_for_each_channel_continue(i, status);
5537 }
5538 
5539 static int
5540 bdev_reset_poll_for_outstanding_io(void *ctx)
5541 {
5542 	struct spdk_bdev_channel *ch = ctx;
5543 	struct spdk_bdev_io *bdev_io;
5544 
5545 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
5546 
5547 	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
5548 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
5549 				   bdev_reset_check_outstanding_io_done);
5550 
5551 	return SPDK_POLLER_BUSY;
5552 }
5553 
5554 static void
5555 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
5556 {
5557 	struct spdk_bdev_channel *ch = _ctx;
5558 	struct spdk_bdev_io *bdev_io;
5559 
5560 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
5561 
5562 	if (bdev->reset_io_drain_timeout == 0) {
5563 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
5564 
5565 		bdev_io_submit_reset(bdev_io);
5566 		return;
5567 	}
5568 
5569 	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
5570 			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
5571 
5572 	/* In case bdev->reset_io_drain_timeout is not equal to zero,
5573 	 * submit the reset to the underlying module only if outstanding I/O
5574 	 * remain after reset_io_drain_timeout seconds have passed. */
5575 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
5576 				   bdev_reset_check_outstanding_io_done);
5577 }
5578 
5579 static void
5580 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5581 			  struct spdk_io_channel *ch, void *_ctx)
5582 {
5583 	struct spdk_bdev_channel	*channel;
5584 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
5585 	struct spdk_bdev_shared_resource *shared_resource;
5586 	bdev_io_tailq_t			tmp_queued;
5587 
5588 	TAILQ_INIT(&tmp_queued);
5589 
5590 	channel = __io_ch_to_bdev_ch(ch);
5591 	shared_resource = channel->shared_resource;
5592 	mgmt_channel = shared_resource->mgmt_ch;
5593 
5594 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
5595 
5596 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
5597 		/* The QoS object is always valid and readable while
5598 		 * the channel flag is set, so the lock here should not
5599 		 * be necessary. We're not in the fast path though, so
5600 		 * just take it anyway. */
5601 		spdk_spin_lock(&channel->bdev->internal.spinlock);
5602 		if (channel->bdev->internal.qos->ch == channel) {
5603 			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
5604 		}
5605 		spdk_spin_unlock(&channel->bdev->internal.spinlock);
5606 	}
5607 
5608 	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
5609 	bdev_abort_all_buf_io(mgmt_channel, channel);
5611 	bdev_abort_all_queued_io(&tmp_queued, channel);
5612 
5613 	spdk_bdev_for_each_channel_continue(i, 0);
5614 }
5615 
5616 static void
5617 bdev_start_reset(void *ctx)
5618 {
5619 	struct spdk_bdev_channel *ch = ctx;
5620 
5621 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
5622 				   bdev_reset_freeze_channel_done);
5623 }
5624 
5625 static void
5626 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
5627 {
5628 	struct spdk_bdev *bdev = ch->bdev;
5629 
5630 	assert(!TAILQ_EMPTY(&ch->queued_resets));
5631 
5632 	spdk_spin_lock(&bdev->internal.spinlock);
5633 	if (bdev->internal.reset_in_progress == NULL) {
5634 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
5635 		/*
5636 		 * Take a channel reference for the target bdev for the life of this
5637 		 *  reset.  This guards against the channel getting destroyed while
5638 		 *  spdk_bdev_for_each_channel() calls related to this reset IO are in
5639 		 *  progress.  We will release the reference when this reset is
5640 		 *  completed.
5641 		 */
5642 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
5643 		bdev_start_reset(ch);
5644 	}
5645 	spdk_spin_unlock(&bdev->internal.spinlock);
5646 }
5647 
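/* spdk_bdev_reset() queues the reset on the calling channel and
 * bdev_channel_start_reset() ensures that only one reset is outstanding per bdev at a
 * time.  While the reset is in progress every channel is frozen
 * (BDEV_CH_RESET_IN_PROGRESS) and its queued QoS, retried (nomem_io) and buffer-wait
 * I/O is aborted before the reset is forwarded to the module, subject to the
 * reset_io_drain_timeout handling above.
 */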
5648 int
5649 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5650 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5651 {
5652 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5653 	struct spdk_bdev_io *bdev_io;
5654 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5655 
5656 	bdev_io = bdev_channel_get_io(channel);
5657 	if (!bdev_io) {
5658 		return -ENOMEM;
5659 	}
5660 
5661 	bdev_io->internal.ch = channel;
5662 	bdev_io->internal.desc = desc;
5663 	bdev_io->internal.submit_tsc = spdk_get_ticks();
5664 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
5665 	bdev_io->u.reset.ch_ref = NULL;
5666 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5667 
5668 	spdk_spin_lock(&bdev->internal.spinlock);
5669 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
5670 	spdk_spin_unlock(&bdev->internal.spinlock);
5671 
5672 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
5673 			  internal.ch_link);
5674 
5675 	bdev_channel_start_reset(channel);
5676 
5677 	return 0;
5678 }
5679 
5680 void
5681 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
5682 		      struct spdk_bdev_io_stat *stat)
5683 {
5684 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5685 
5686 	bdev_io_stat_get(stat, channel->stat);
5687 }
5688 
5689 static void
5690 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
5691 {
5692 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
5693 
5694 	bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
5695 			    bdev_iostat_ctx->cb_arg, 0);
5696 	free(bdev_iostat_ctx);
5697 }
5698 
5699 static void
5700 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5701 			   struct spdk_io_channel *ch, void *_ctx)
5702 {
5703 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
5704 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5705 
5706 	bdev_io_stat_add(bdev_iostat_ctx->stat, channel->stat);
5707 	spdk_bdev_for_each_channel_continue(i, 0);
5708 }
5709 
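/* Device-wide statistics are the sum of the per-channel counters plus the counters
 * accumulated from channels that have already been destroyed; the supplied callback
 * runs once the channel iteration has finished.
 */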
5710 void
5711 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
5712 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
5713 {
5714 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
5715 
5716 	assert(bdev != NULL);
5717 	assert(stat != NULL);
5718 	assert(cb != NULL);
5719 
5720 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
5721 	if (bdev_iostat_ctx == NULL) {
5722 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
5723 		cb(bdev, stat, cb_arg, -ENOMEM);
5724 		return;
5725 	}
5726 
5727 	bdev_iostat_ctx->stat = stat;
5728 	bdev_iostat_ctx->cb = cb;
5729 	bdev_iostat_ctx->cb_arg = cb_arg;
5730 
5731 	/* Start with the statistics from previously deleted channels. */
5732 	spdk_spin_lock(&bdev->internal.spinlock);
5733 	bdev_io_stat_get(bdev_iostat_ctx->stat, bdev->internal.stat);
5734 	spdk_spin_unlock(&bdev->internal.spinlock);
5735 
5736 	/* Then iterate and add the statistics from each existing channel. */
5737 	spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
5738 				   bdev_get_device_stat_done);
5739 }
5740 
5741 struct bdev_iostat_reset_ctx {
5742 	enum bdev_reset_stat_mode mode;
5743 	bdev_reset_device_stat_cb cb;
5744 	void *cb_arg;
5745 };
5746 
5747 static void
5748 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
5749 {
5750 	struct bdev_iostat_reset_ctx *ctx = _ctx;
5751 
5752 	ctx->cb(bdev, ctx->cb_arg, 0);
5753 
5754 	free(ctx);
5755 }
5756 
5757 static void
5758 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5759 			     struct spdk_io_channel *ch, void *_ctx)
5760 {
5761 	struct bdev_iostat_reset_ctx *ctx = _ctx;
5762 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5763 
5764 	bdev_io_stat_reset(channel->stat, ctx->mode);
5765 
5766 	spdk_bdev_for_each_channel_continue(i, 0);
5767 }
5768 
5769 void
5770 bdev_reset_device_stat(struct spdk_bdev *bdev, enum bdev_reset_stat_mode mode,
5771 		       bdev_reset_device_stat_cb cb, void *cb_arg)
5772 {
5773 	struct bdev_iostat_reset_ctx *ctx;
5774 
5775 	assert(bdev != NULL);
5776 	assert(cb != NULL);
5777 
5778 	ctx = calloc(1, sizeof(*ctx));
5779 	if (ctx == NULL) {
5780 		SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
5781 		cb(bdev, cb_arg, -ENOMEM);
5782 		return;
5783 	}
5784 
5785 	ctx->mode = mode;
5786 	ctx->cb = cb;
5787 	ctx->cb_arg = cb_arg;
5788 
5789 	spdk_spin_lock(&bdev->internal.spinlock);
5790 	bdev_io_stat_reset(bdev->internal.stat, mode);
5791 	spdk_spin_unlock(&bdev->internal.spinlock);
5792 
5793 	spdk_bdev_for_each_channel(bdev,
5794 				   bdev_reset_each_channel_stat,
5795 				   ctx,
5796 				   bdev_reset_device_stat_done);
5797 }
5798 
5799 int
5800 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5801 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
5802 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5803 {
5804 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5805 	struct spdk_bdev_io *bdev_io;
5806 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5807 
5808 	if (!desc->write) {
5809 		return -EBADF;
5810 	}
5811 
5812 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
5813 		return -ENOTSUP;
5814 	}
5815 
5816 	bdev_io = bdev_channel_get_io(channel);
5817 	if (!bdev_io) {
5818 		return -ENOMEM;
5819 	}
5820 
5821 	bdev_io->internal.ch = channel;
5822 	bdev_io->internal.desc = desc;
5823 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
5824 	bdev_io->u.nvme_passthru.cmd = *cmd;
5825 	bdev_io->u.nvme_passthru.buf = buf;
5826 	bdev_io->u.nvme_passthru.nbytes = nbytes;
5827 	bdev_io->u.nvme_passthru.md_buf = NULL;
5828 	bdev_io->u.nvme_passthru.md_len = 0;
5829 
5830 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5831 
5832 	bdev_io_submit(bdev_io);
5833 	return 0;
5834 }
5835 
5836 int
5837 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5838 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
5839 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
5840 {
5841 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5842 	struct spdk_bdev_io *bdev_io;
5843 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5844 
5845 	if (!desc->write) {
5846 		/*
5847 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
5848 		 *  to easily determine if the command is a read or write, but for now just
5849 		 *  do not allow io_passthru with a read-only descriptor.
5850 		 */
5851 		return -EBADF;
5852 	}
5853 
5854 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
5855 		return -ENOTSUP;
5856 	}
5857 
5858 	bdev_io = bdev_channel_get_io(channel);
5859 	if (!bdev_io) {
5860 		return -ENOMEM;
5861 	}
5862 
5863 	bdev_io->internal.ch = channel;
5864 	bdev_io->internal.desc = desc;
5865 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
5866 	bdev_io->u.nvme_passthru.cmd = *cmd;
5867 	bdev_io->u.nvme_passthru.buf = buf;
5868 	bdev_io->u.nvme_passthru.nbytes = nbytes;
5869 	bdev_io->u.nvme_passthru.md_buf = NULL;
5870 	bdev_io->u.nvme_passthru.md_len = 0;
5871 
5872 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5873 
5874 	bdev_io_submit(bdev_io);
5875 	return 0;
5876 }
5877 
5878 int
5879 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5880 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
5881 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5882 {
5883 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5884 	struct spdk_bdev_io *bdev_io;
5885 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5886 
5887 	if (!desc->write) {
5888 		/*
5889 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
5890 		 *  to easily determine if the command is a read or write, but for now just
5891 		 *  do not allow io_passthru with a read-only descriptor.
5892 		 */
5893 		return -EBADF;
5894 	}
5895 
5896 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
5897 		return -ENOTSUP;
5898 	}
5899 
5900 	bdev_io = bdev_channel_get_io(channel);
5901 	if (!bdev_io) {
5902 		return -ENOMEM;
5903 	}
5904 
5905 	bdev_io->internal.ch = channel;
5906 	bdev_io->internal.desc = desc;
5907 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
5908 	bdev_io->u.nvme_passthru.cmd = *cmd;
5909 	bdev_io->u.nvme_passthru.buf = buf;
5910 	bdev_io->u.nvme_passthru.nbytes = nbytes;
5911 	bdev_io->u.nvme_passthru.md_buf = md_buf;
5912 	bdev_io->u.nvme_passthru.md_len = md_len;
5913 
5914 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5915 
5916 	bdev_io_submit(bdev_io);
5917 	return 0;
5918 }
5919 
5920 static void bdev_abort_retry(void *ctx);
5921 static void bdev_abort(struct spdk_bdev_io *parent_io);
5922 
5923 static void
5924 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5925 {
5926 	struct spdk_bdev_channel *channel = bdev_io->internal.ch;
5927 	struct spdk_bdev_io *parent_io = cb_arg;
5928 	struct spdk_bdev_io *bio_to_abort, *tmp_io;
5929 
5930 	bio_to_abort = bdev_io->u.abort.bio_to_abort;
5931 
5932 	spdk_bdev_free_io(bdev_io);
5933 
5934 	if (!success) {
5935 		/* Check if the target I/O completed in the meantime. */
5936 		TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
5937 			if (tmp_io == bio_to_abort) {
5938 				break;
5939 			}
5940 		}
5941 
5942 		/* If the target I/O still exists, set the parent to failed. */
5943 		if (tmp_io != NULL) {
5944 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5945 		}
5946 	}
5947 
5948 	parent_io->u.bdev.split_outstanding--;
5949 	if (parent_io->u.bdev.split_outstanding == 0) {
5950 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
5951 			bdev_abort_retry(parent_io);
5952 		} else {
5953 			bdev_io_complete(parent_io);
5954 		}
5955 	}
5956 }
5957 
5958 static int
5959 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
5960 	      struct spdk_bdev_io *bio_to_abort,
5961 	      spdk_bdev_io_completion_cb cb, void *cb_arg)
5962 {
5963 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5964 	struct spdk_bdev_io *bdev_io;
5965 
5966 	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
5967 	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
5968 		/* TODO: Abort reset or abort request. */
5969 		return -ENOTSUP;
5970 	}
5971 
5972 	bdev_io = bdev_channel_get_io(channel);
5973 	if (bdev_io == NULL) {
5974 		return -ENOMEM;
5975 	}
5976 
5977 	bdev_io->internal.ch = channel;
5978 	bdev_io->internal.desc = desc;
5979 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
5980 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5981 
5982 	if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) {
5983 		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
5984 
5985 		/* Parent abort request is not submitted directly, but to manage its
5986 		 * execution add it to the submitted list here.
5987 		 */
5988 		bdev_io->internal.submit_tsc = spdk_get_ticks();
5989 		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
5990 
5991 		bdev_abort(bdev_io);
5992 
5993 		return 0;
5994 	}
5995 
5996 	bdev_io->u.abort.bio_to_abort = bio_to_abort;
5997 
5998 	/* Submit the abort request to the underlying bdev module. */
5999 	bdev_io_submit(bdev_io);
6000 
6001 	return 0;
6002 }
6003 
6004 static uint32_t
6005 _bdev_abort(struct spdk_bdev_io *parent_io)
6006 {
6007 	struct spdk_bdev_desc *desc = parent_io->internal.desc;
6008 	struct spdk_bdev_channel *channel = parent_io->internal.ch;
6009 	void *bio_cb_arg;
6010 	struct spdk_bdev_io *bio_to_abort;
6011 	uint32_t matched_ios;
6012 	int rc;
6013 
6014 	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
6015 
6016 	/* matched_ios is returned and will be kept by the caller.
6017 	 *
6018 	 * This function will be used for two cases, 1) the same cb_arg is used for
6019 	 * multiple I/Os, 2) a single large I/O is split into smaller ones.
6020 	 * Incrementing split_outstanding directly here may confuse readers especially
6021 	 * for the 1st case.
6022 	 *
6023 	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
6024 	 * works as expected.
6025 	 */
6026 	matched_ios = 0;
6027 	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6028 
6029 	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
6030 		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
6031 			continue;
6032 		}
6033 
6034 		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
6035 			/* Any I/O which was submitted after this abort command should be excluded. */
6036 			continue;
6037 		}
6038 
6039 		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
6040 		if (rc != 0) {
6041 			if (rc == -ENOMEM) {
6042 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
6043 			} else {
6044 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6045 			}
6046 			break;
6047 		}
6048 		matched_ios++;
6049 	}
6050 
6051 	return matched_ios;
6052 }
6053 
6054 static void
6055 bdev_abort_retry(void *ctx)
6056 {
6057 	struct spdk_bdev_io *parent_io = ctx;
6058 	uint32_t matched_ios;
6059 
6060 	matched_ios = _bdev_abort(parent_io);
6061 
6062 	if (matched_ios == 0) {
6063 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6064 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6065 		} else {
6066 			/* For retry, the case that no target I/O was found is success
6067 			 * because it means target I/Os completed in the meantime.
6068 			 */
6069 			bdev_io_complete(parent_io);
6070 		}
6071 		return;
6072 	}
6073 
6074 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6075 	parent_io->u.bdev.split_outstanding = matched_ios;
6076 }
6077 
6078 static void
6079 bdev_abort(struct spdk_bdev_io *parent_io)
6080 {
6081 	uint32_t matched_ios;
6082 
6083 	matched_ios = _bdev_abort(parent_io);
6084 
6085 	if (matched_ios == 0) {
6086 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6087 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6088 		} else {
6089 			/* The case where no target I/O was found is a failure. */
6090 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6091 			bdev_io_complete(parent_io);
6092 		}
6093 		return;
6094 	}
6095 
6096 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6097 	parent_io->u.bdev.split_outstanding = matched_ios;
6098 }
6099 
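/* Aborts all I/O on this channel that was submitted with caller context bio_cb_arg
 * before this call.  The request completes successfully only if every matched I/O was
 * aborted or had already completed in the meantime; resets and other abort requests
 * cannot be targeted (bdev_abort_io() above rejects them).
 */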
6100 int
6101 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6102 		void *bio_cb_arg,
6103 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6104 {
6105 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6106 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6107 	struct spdk_bdev_io *bdev_io;
6108 
6109 	if (bio_cb_arg == NULL) {
6110 		return -EINVAL;
6111 	}
6112 
6113 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
6114 		return -ENOTSUP;
6115 	}
6116 
6117 	bdev_io = bdev_channel_get_io(channel);
6118 	if (bdev_io == NULL) {
6119 		return -ENOMEM;
6120 	}
6121 
6122 	bdev_io->internal.ch = channel;
6123 	bdev_io->internal.desc = desc;
6124 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6125 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6126 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6127 
6128 	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
6129 
6130 	/* Parent abort request is not submitted directly, but to manage its execution,
6131 	 * add it to the submitted list here.
6132 	 */
6133 	TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
6134 
6135 	bdev_abort(bdev_io);
6136 
6137 	return 0;
6138 }
6139 
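/* Typical use: when an I/O submission returns -ENOMEM because the spdk_bdev_io pool is
 * exhausted, the caller fills an spdk_bdev_io_wait_entry and queues it; cb_fn is
 * invoked on the same thread once an spdk_bdev_io becomes available so the submission
 * can be retried.  A minimal sketch - the ctx layout and retry_write() helper are
 * illustrative, not part of this file:
 *
 *	rc = spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks,
 *				    io_done, ctx);
 *	if (rc == -ENOMEM) {
 *		ctx->bdev_io_wait.bdev = bdev;
 *		ctx->bdev_io_wait.cb_fn = retry_write;
 *		ctx->bdev_io_wait.cb_arg = ctx;
 *		spdk_bdev_queue_io_wait(bdev, ch, &ctx->bdev_io_wait);
 *	}
 */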
6140 int
6141 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6142 			struct spdk_bdev_io_wait_entry *entry)
6143 {
6144 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6145 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
6146 
6147 	if (bdev != entry->bdev) {
6148 		SPDK_ERRLOG("bdevs do not match\n");
6149 		return -EINVAL;
6150 	}
6151 
6152 	if (mgmt_ch->per_thread_cache_count > 0) {
6153 		SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
6154 		return -EINVAL;
6155 	}
6156 
6157 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
6158 	return 0;
6159 }
6160 
6161 static inline void
6162 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
6163 {
6164 	struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
6165 	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
6166 	uint32_t blocklen = bdev_io->bdev->blocklen;
6167 
6168 	if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
6169 		switch (bdev_io->type) {
6170 		case SPDK_BDEV_IO_TYPE_READ:
6171 			io_stat->bytes_read += num_blocks * blocklen;
6172 			io_stat->num_read_ops++;
6173 			io_stat->read_latency_ticks += tsc_diff;
6174 			if (io_stat->max_read_latency_ticks < tsc_diff) {
6175 				io_stat->max_read_latency_ticks = tsc_diff;
6176 			}
6177 			if (io_stat->min_read_latency_ticks > tsc_diff) {
6178 				io_stat->min_read_latency_ticks = tsc_diff;
6179 			}
6180 			break;
6181 		case SPDK_BDEV_IO_TYPE_WRITE:
6182 			io_stat->bytes_written += num_blocks * blocklen;
6183 			io_stat->num_write_ops++;
6184 			io_stat->write_latency_ticks += tsc_diff;
6185 			if (io_stat->max_write_latency_ticks < tsc_diff) {
6186 				io_stat->max_write_latency_ticks = tsc_diff;
6187 			}
6188 			if (io_stat->min_write_latency_ticks > tsc_diff) {
6189 				io_stat->min_write_latency_ticks = tsc_diff;
6190 			}
6191 			break;
6192 		case SPDK_BDEV_IO_TYPE_UNMAP:
6193 			io_stat->bytes_unmapped += num_blocks * blocklen;
6194 			io_stat->num_unmap_ops++;
6195 			io_stat->unmap_latency_ticks += tsc_diff;
6196 			if (io_stat->max_unmap_latency_ticks < tsc_diff) {
6197 				io_stat->max_unmap_latency_ticks = tsc_diff;
6198 			}
6199 			if (io_stat->min_unmap_latency_ticks > tsc_diff) {
6200 				io_stat->min_unmap_latency_ticks = tsc_diff;
6201 			}
6202 			break;
6203 		case SPDK_BDEV_IO_TYPE_ZCOPY:
6204 			/* Track the data in the start phase only */
6205 			if (bdev_io->u.bdev.zcopy.start) {
6206 				if (bdev_io->u.bdev.zcopy.populate) {
6207 					io_stat->bytes_read += num_blocks * blocklen;
6208 					io_stat->num_read_ops++;
6209 					io_stat->read_latency_ticks += tsc_diff;
6210 					if (io_stat->max_read_latency_ticks < tsc_diff) {
6211 						io_stat->max_read_latency_ticks = tsc_diff;
6212 					}
6213 					if (io_stat->min_read_latency_ticks > tsc_diff) {
6214 						io_stat->min_read_latency_ticks = tsc_diff;
6215 					}
6216 				} else {
6217 					io_stat->bytes_written += num_blocks * blocklen;
6218 					io_stat->num_write_ops++;
6219 					io_stat->write_latency_ticks += tsc_diff;
6220 					if (io_stat->max_write_latency_ticks < tsc_diff) {
6221 						io_stat->max_write_latency_ticks = tsc_diff;
6222 					}
6223 					if (io_stat->min_write_latency_ticks > tsc_diff) {
6224 						io_stat->min_write_latency_ticks = tsc_diff;
6225 					}
6226 				}
6227 			}
6228 			break;
6229 		case SPDK_BDEV_IO_TYPE_COPY:
6230 			io_stat->bytes_copied += num_blocks * blocklen;
6231 			io_stat->num_copy_ops++;
6232 			io_stat->copy_latency_ticks += tsc_diff;
6233 			if (io_stat->max_copy_latency_ticks < tsc_diff) {
6234 				io_stat->max_copy_latency_ticks = tsc_diff;
6235 			}
6236 			if (io_stat->min_copy_latency_ticks > tsc_diff) {
6237 				io_stat->min_copy_latency_ticks = tsc_diff;
6238 			}
6239 			break;
6240 		default:
6241 			break;
6242 		}
6243 	}
6244 
6245 #ifdef SPDK_CONFIG_VTUNE
6246 	uint64_t now_tsc = spdk_get_ticks();
6247 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
6248 		uint64_t data[5];
6249 		struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
6250 
6251 		data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
6252 		data[1] = io_stat->bytes_read - prev_stat->bytes_read;
6253 		data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
6254 		data[3] = io_stat->bytes_written - prev_stat->bytes_written;
6255 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
6256 			  bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
6257 
6258 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
6259 				   __itt_metadata_u64, 5, data);
6260 
6261 		memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
6262 		bdev_io->internal.ch->start_tsc = now_tsc;
6263 	}
6264 #endif
6265 }
6266 
6267 static inline void
6268 bdev_io_complete(void *ctx)
6269 {
6270 	struct spdk_bdev_io *bdev_io = ctx;
6271 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
6272 	uint64_t tsc, tsc_diff;
6273 
6274 	if (spdk_unlikely(bdev_io->internal.in_submit_request)) {
6275 		/*
6276 		 * Defer completion to avoid potential infinite recursion if the
6277 		 * user's completion callback issues a new I/O.
6278 		 */
6279 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
6280 				     bdev_io_complete, bdev_io);
6281 		return;
6282 	}
6283 
6284 	tsc = spdk_get_ticks();
6285 	tsc_diff = tsc - bdev_io->internal.submit_tsc;
6286 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io,
6287 			      bdev_io->internal.caller_ctx);
6288 
6289 	TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link);
6290 
6291 	if (bdev_io->internal.ch->histogram) {
6292 		spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff);
6293 	}
6294 
6295 	bdev_io_update_io_stat(bdev_io, tsc_diff);
6296 
6297 	assert(bdev_io->internal.cb != NULL);
6298 	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
6299 
6300 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
6301 			     bdev_io->internal.caller_ctx);
6302 }
6303 
6304 static void bdev_destroy_cb(void *io_device);
6305 
6306 static void
6307 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
6308 {
6309 	struct spdk_bdev_io *bdev_io = _ctx;
6310 
6311 	if (bdev_io->u.reset.ch_ref != NULL) {
6312 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
6313 		bdev_io->u.reset.ch_ref = NULL;
6314 	}
6315 
6316 	bdev_io_complete(bdev_io);
6317 
6318 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
6319 	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
6320 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
6321 	}
6322 }
6323 
6324 static void
6325 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6326 		      struct spdk_io_channel *_ch, void *_ctx)
6327 {
6328 	struct spdk_bdev_io *bdev_io = _ctx;
6329 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
6330 	struct spdk_bdev_io *queued_reset;
6331 
6332 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
6333 	while (!TAILQ_EMPTY(&ch->queued_resets)) {
6334 		queued_reset = TAILQ_FIRST(&ch->queued_resets);
6335 		TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
6336 		spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
6337 	}
6338 
6339 	spdk_bdev_for_each_channel_continue(i, 0);
6340 }
6341 
6342 void
6343 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
6344 {
6345 	struct spdk_bdev *bdev = bdev_io->bdev;
6346 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
6347 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
6348 
6349 	bdev_io->internal.status = status;
6350 
6351 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
6352 		bool unlock_channels = false;
6353 
6354 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
6355 			SPDK_ERRLOG("NOMEM returned for reset\n");
6356 		}
6357 		spdk_spin_lock(&bdev->internal.spinlock);
6358 		if (bdev_io == bdev->internal.reset_in_progress) {
6359 			bdev->internal.reset_in_progress = NULL;
6360 			unlock_channels = true;
6361 		}
6362 		spdk_spin_unlock(&bdev->internal.spinlock);
6363 
6364 		if (unlock_channels) {
6365 			spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
6366 						   bdev_reset_complete);
6367 			return;
6368 		}
6369 	} else {
6370 		if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) {
6371 			_bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done);
6372 			/* bdev IO will be completed in the callback */
6373 			return;
6374 		}
6375 
6376 		_bdev_io_decrement_outstanding(bdev_ch, shared_resource);
6377 		if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) {
6378 			return;
6379 		}
6380 	}
6381 
6382 	bdev_io_complete(bdev_io);
6383 }
6384 
6385 void
6386 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
6387 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
6388 {
6389 	if (sc == SPDK_SCSI_STATUS_GOOD) {
6390 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6391 	} else {
6392 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
6393 		bdev_io->internal.error.scsi.sc = sc;
6394 		bdev_io->internal.error.scsi.sk = sk;
6395 		bdev_io->internal.error.scsi.asc = asc;
6396 		bdev_io->internal.error.scsi.ascq = ascq;
6397 	}
6398 
6399 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
6400 }
6401 
6402 void
6403 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
6404 			     int *sc, int *sk, int *asc, int *ascq)
6405 {
6406 	assert(sc != NULL);
6407 	assert(sk != NULL);
6408 	assert(asc != NULL);
6409 	assert(ascq != NULL);
6410 
6411 	switch (bdev_io->internal.status) {
6412 	case SPDK_BDEV_IO_STATUS_SUCCESS:
6413 		*sc = SPDK_SCSI_STATUS_GOOD;
6414 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
6415 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
6416 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
6417 		break;
6418 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
6419 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
6420 		break;
6421 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
6422 		*sc = bdev_io->internal.error.scsi.sc;
6423 		*sk = bdev_io->internal.error.scsi.sk;
6424 		*asc = bdev_io->internal.error.scsi.asc;
6425 		*ascq = bdev_io->internal.error.scsi.ascq;
6426 		break;
6427 	default:
6428 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
6429 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
6430 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
6431 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
6432 		break;
6433 	}
6434 }
6435 
6436 void
6437 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
6438 {
6439 	if (aio_result == 0) {
6440 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6441 	} else {
6442 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
6443 	}
6444 
6445 	bdev_io->internal.error.aio_result = aio_result;
6446 
6447 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
6448 }
6449 
6450 void
6451 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
6452 {
6453 	assert(aio_result != NULL);
6454 
6455 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
6456 		*aio_result = bdev_io->internal.error.aio_result;
6457 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6458 		*aio_result = 0;
6459 	} else {
6460 		*aio_result = -EIO;
6461 	}
6462 }
6463 
6464 void
6465 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
6466 {
6467 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
6468 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6469 	} else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
6470 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
6471 	} else {
6472 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
6473 	}
6474 
6475 	bdev_io->internal.error.nvme.cdw0 = cdw0;
6476 	bdev_io->internal.error.nvme.sct = sct;
6477 	bdev_io->internal.error.nvme.sc = sc;
6478 
6479 	spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
6480 }
6481 
6482 void
6483 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
6484 {
6485 	assert(sct != NULL);
6486 	assert(sc != NULL);
6487 	assert(cdw0 != NULL);
6488 
6489 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
6490 		*sct = SPDK_NVME_SCT_GENERIC;
6491 		*sc = SPDK_NVME_SC_SUCCESS;
6492 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6493 			*cdw0 = 0;
6494 		} else {
6495 			*cdw0 = 1U;
6496 		}
6497 		return;
6498 	}
6499 
6500 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
6501 		*sct = bdev_io->internal.error.nvme.sct;
6502 		*sc = bdev_io->internal.error.nvme.sc;
6503 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6504 		*sct = SPDK_NVME_SCT_GENERIC;
6505 		*sc = SPDK_NVME_SC_SUCCESS;
6506 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
6507 		*sct = SPDK_NVME_SCT_GENERIC;
6508 		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
6509 	} else {
6510 		*sct = SPDK_NVME_SCT_GENERIC;
6511 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6512 	}
6513 
6514 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
6515 }
6516 
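/* Maps the bdev-level completion status of a fused COMPARE_AND_WRITE request onto the
 * two NVMe statuses expected by the transport: a miscompare or a failed first command
 * fails the compare half and aborts the write half with ABORTED_FAILED_FUSED, while
 * any other NVMe error is reported on the write half.
 */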
6517 void
6518 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
6519 				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
6520 {
6521 	assert(first_sct != NULL);
6522 	assert(first_sc != NULL);
6523 	assert(second_sct != NULL);
6524 	assert(second_sc != NULL);
6525 	assert(cdw0 != NULL);
6526 
6527 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
6528 		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
6529 		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
6530 			*first_sct = bdev_io->internal.error.nvme.sct;
6531 			*first_sc = bdev_io->internal.error.nvme.sc;
6532 			*second_sct = SPDK_NVME_SCT_GENERIC;
6533 			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
6534 		} else {
6535 			*first_sct = SPDK_NVME_SCT_GENERIC;
6536 			*first_sc = SPDK_NVME_SC_SUCCESS;
6537 			*second_sct = bdev_io->internal.error.nvme.sct;
6538 			*second_sc = bdev_io->internal.error.nvme.sc;
6539 		}
6540 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
6541 		*first_sct = SPDK_NVME_SCT_GENERIC;
6542 		*first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
6543 		*second_sct = SPDK_NVME_SCT_GENERIC;
6544 		*second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
6545 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
6546 		*first_sct = SPDK_NVME_SCT_GENERIC;
6547 		*first_sc = SPDK_NVME_SC_SUCCESS;
6548 		*second_sct = SPDK_NVME_SCT_GENERIC;
6549 		*second_sc = SPDK_NVME_SC_SUCCESS;
6550 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
6551 		*first_sct = SPDK_NVME_SCT_GENERIC;
6552 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6553 		*second_sct = SPDK_NVME_SCT_GENERIC;
6554 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
6555 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
6556 		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
6557 		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
6558 		*second_sct = SPDK_NVME_SCT_GENERIC;
6559 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
6560 	} else {
6561 		*first_sct = SPDK_NVME_SCT_GENERIC;
6562 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6563 		*second_sct = SPDK_NVME_SCT_GENERIC;
6564 		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
6565 	}
6566 
6567 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
6568 }
6569 
6570 struct spdk_thread *
6571 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
6572 {
6573 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
6574 }
6575 
6576 struct spdk_io_channel *
6577 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
6578 {
6579 	return bdev_io->internal.ch->channel;
6580 }
6581 
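/* Registration validates the name, allocates the device-level statistics, registers
 * the "bdev_<name>" I/O device and inserts the bdev into the global list.  A UUID
 * alias is added when the module supplied a non-zero UUID, and defaults are filled in
 * for write_unit_size, acwu, phys_blocklen and the optimal I/O boundary used for
 * splitting when the bdev requires buffer alignment.
 */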
6582 static int
6583 bdev_register(struct spdk_bdev *bdev)
6584 {
6585 	char *bdev_name;
6586 	char uuid[SPDK_UUID_STRING_LEN];
6587 	int ret;
6588 
6589 	assert(bdev->module != NULL);
6590 
6591 	if (!bdev->name) {
6592 		SPDK_ERRLOG("Bdev name is NULL\n");
6593 		return -EINVAL;
6594 	}
6595 
6596 	if (!strlen(bdev->name)) {
6597 		SPDK_ERRLOG("Bdev name must not be an empty string\n");
6598 		return -EINVAL;
6599 	}
6600 
6601 	/* Users often register their own I/O devices using the bdev name. In
6602 	 * order to avoid conflicts, prepend bdev_. */
6603 	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
6604 	if (!bdev_name) {
6605 		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
6606 		return -ENOMEM;
6607 	}
6608 
6609 	bdev->internal.stat = bdev_io_stat_alloc();
6610 	if (!bdev->internal.stat) {
6611 		SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
6612 		free(bdev_name);
6613 		return -ENOMEM;
6614 	}
6615 
6616 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
6617 	bdev->internal.measured_queue_depth = UINT64_MAX;
6618 	bdev->internal.claim_module = NULL;
6619 	bdev->internal.qd_poller = NULL;
6620 	bdev->internal.qos = NULL;
6621 
6622 	TAILQ_INIT(&bdev->internal.open_descs);
6623 	TAILQ_INIT(&bdev->internal.locked_ranges);
6624 	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
6625 	TAILQ_INIT(&bdev->aliases);
6626 
6627 	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
6628 	if (ret != 0) {
6629 		bdev_io_stat_free(bdev->internal.stat);
6630 		free(bdev_name);
6631 		return ret;
6632 	}
6633 
6634 	/* The UUID must either be specified by the user or defined by the bdev module
6635 	 * itself.  Otherwise this field must remain all zeros, to indicate that the
6636 	 * value cannot be relied upon. */
6637 	if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) {
6638 		/* Add the UUID alias only if it's different than the name */
6639 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
6640 		if (strcmp(bdev->name, uuid) != 0) {
6641 			ret = spdk_bdev_alias_add(bdev, uuid);
6642 			if (ret != 0) {
6643 				SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
6644 				bdev_name_del(&bdev->internal.bdev_name);
6645 				bdev_io_stat_free(bdev->internal.stat);
6646 				free(bdev_name);
6647 				return ret;
6648 			}
6649 		}
6650 	}
6651 
6652 	if (spdk_bdev_get_buf_align(bdev) > 1) {
6653 		if (bdev->split_on_optimal_io_boundary) {
6654 			bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary,
6655 							     SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen);
6656 		} else {
6657 			bdev->split_on_optimal_io_boundary = true;
6658 			bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen;
6659 		}
6660 	}
6661 
6662 	/* If the user didn't specify a write unit size, set it to one. */
6663 	if (bdev->write_unit_size == 0) {
6664 		bdev->write_unit_size = 1;
6665 	}
6666 
6667 	/* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */
6668 	if (bdev->acwu == 0) {
6669 		bdev->acwu = bdev->write_unit_size;
6670 	}
6671 
6672 	if (bdev->phys_blocklen == 0) {
6673 		bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
6674 	}
6675 
6676 	bdev->internal.reset_in_progress = NULL;
6677 	bdev->internal.qd_poll_in_progress = false;
6678 	bdev->internal.period = 0;
6679 	bdev->internal.new_period = 0;
6680 
6681 	spdk_io_device_register(__bdev_to_io_dev(bdev),
6682 				bdev_channel_create, bdev_channel_destroy,
6683 				sizeof(struct spdk_bdev_channel),
6684 				bdev_name);
6685 
6686 	free(bdev_name);
6687 
6688 	spdk_spin_init(&bdev->internal.spinlock);
6689 
6690 	SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
6691 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
6692 
6693 	return 0;
6694 }
6695 
6696 static void
6697 bdev_destroy_cb(void *io_device)
6698 {
6699 	int			rc;
6700 	struct spdk_bdev	*bdev;
6701 	spdk_bdev_unregister_cb	cb_fn;
6702 	void			*cb_arg;
6703 
6704 	bdev = __bdev_from_io_dev(io_device);
6705 	cb_fn = bdev->internal.unregister_cb;
6706 	cb_arg = bdev->internal.unregister_ctx;
6707 
6708 	spdk_spin_destroy(&bdev->internal.spinlock);
6709 	free(bdev->internal.qos);
6710 	bdev_io_stat_free(bdev->internal.stat);
6711 
6712 	rc = bdev->fn_table->destruct(bdev->ctxt);
6713 	if (rc < 0) {
6714 		SPDK_ERRLOG("destruct failed\n");
6715 	}
6716 	if (rc <= 0 && cb_fn != NULL) {
6717 		cb_fn(cb_arg, rc);
6718 	}
6719 }
6720 
6721 void
6722 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
6723 {
6724 	if (bdev->internal.unregister_cb != NULL) {
6725 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
6726 	}
6727 }
6728 
6729 static void
6730 _remove_notify(void *arg)
6731 {
6732 	struct spdk_bdev_desc *desc = arg;
6733 
6734 	spdk_spin_lock(&desc->spinlock);
6735 	desc->refs--;
6736 
6737 	if (!desc->closed) {
6738 		spdk_spin_unlock(&desc->spinlock);
6739 		desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx);
6740 		return;
6741 	} else if (0 == desc->refs) {
6742 		/* This descriptor was closed after this remove_notify message was sent.
6743 		 * spdk_bdev_close() could not free the descriptor since this message was
6744 		 * in flight, so we free it now using bdev_desc_free().
6745 		 */
6746 		spdk_spin_unlock(&desc->spinlock);
6747 		bdev_desc_free(desc);
6748 		return;
6749 	}
6750 	spdk_spin_unlock(&desc->spinlock);
6751 }
6752 
6753 /* returns: 0 - bdev removed and ready to be destructed.
6754  *          -EBUSY - bdev can't be destructed yet.  */
6755 static int
6756 bdev_unregister_unsafe(struct spdk_bdev *bdev)
6757 {
6758 	struct spdk_bdev_desc	*desc, *tmp;
6759 	int			rc = 0;
6760 	char			uuid[SPDK_UUID_STRING_LEN];
6761 
6762 	assert(spdk_spin_held(&g_bdev_mgr.spinlock));
6763 	assert(spdk_spin_held(&bdev->internal.spinlock));
6764 
6765 	/* Notify each descriptor about hotremoval */
6766 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
6767 		rc = -EBUSY;
6768 		spdk_spin_lock(&desc->spinlock);
6769 		/*
6770 		 * Defer invocation of the event_cb to a separate message that will
6771 		 *  run later on its thread.  This ensures this context unwinds and
6772 		 *  we don't recursively unregister this bdev again if the event_cb
6773 		 *  immediately closes its descriptor.
6774 		 */
6775 		desc->refs++;
6776 		spdk_thread_send_msg(desc->thread, _remove_notify, desc);
6777 		spdk_spin_unlock(&desc->spinlock);
6778 	}
6779 
6780 	/* If there are no descriptors, proceed removing the bdev */
6781 	if (rc == 0) {
6782 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
6783 		SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
6784 
6785 		/* Delete the name and the UUID alias */
6786 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
6787 		bdev_name_del_unsafe(&bdev->internal.bdev_name);
6788 		bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
6789 
6790 		spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
6791 
6792 		if (bdev->internal.reset_in_progress != NULL) {
6793 			/* If reset is in progress, let the completion callback for reset
6794 			 * unregister the bdev.
6795 			 */
6796 			rc = -EBUSY;
6797 		}
6798 	}
6799 
6800 	return rc;
6801 }
6802 
6803 static void
6804 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6805 			      struct spdk_io_channel *io_ch, void *_ctx)
6806 {
6807 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
6808 
6809 	bdev_channel_abort_queued_ios(bdev_ch);
6810 	spdk_bdev_for_each_channel_continue(i, 0);
6811 }
6812 
6813 static void
6814 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
6815 {
6816 	int rc;
6817 
6818 	spdk_spin_lock(&g_bdev_mgr.spinlock);
6819 	spdk_spin_lock(&bdev->internal.spinlock);
6820 	/*
6821 	 * Set the status to REMOVING only after aborting the channels has completed.
6822 	 * Otherwise, the last spdk_bdev_close() may call spdk_io_device_unregister()
6823 	 * while spdk_bdev_for_each_channel() is still executing, and
6824 	 * spdk_io_device_unregister() may fail.
6825 	 */
6826 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
6827 	rc = bdev_unregister_unsafe(bdev);
6828 	spdk_spin_unlock(&bdev->internal.spinlock);
6829 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
6830 
6831 	if (rc == 0) {
6832 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
6833 	}
6834 }
6835 
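/* Unregistration must be called from an SPDK thread.  Queued I/O on every channel is
 * aborted first; each open descriptor is then notified with SPDK_BDEV_EVENT_REMOVE and
 * the underlying io_device is destroyed only after the last descriptor closes (or, if
 * a reset is in flight, after that reset completes).
 */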
6836 void
6837 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
6838 {
6839 	struct spdk_thread	*thread;
6840 
6841 	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
6842 
6843 	thread = spdk_get_thread();
6844 	if (!thread) {
6845 		/* The user called this from a non-SPDK thread. */
6846 		if (cb_fn != NULL) {
6847 			cb_fn(cb_arg, -ENOTSUP);
6848 		}
6849 		return;
6850 	}
6851 
6852 	spdk_spin_lock(&g_bdev_mgr.spinlock);
6853 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
6854 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
6855 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
6856 		if (cb_fn) {
6857 			cb_fn(cb_arg, -EBUSY);
6858 		}
6859 		return;
6860 	}
6861 
6862 	spdk_spin_lock(&bdev->internal.spinlock);
6863 	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
6864 	bdev->internal.unregister_cb = cb_fn;
6865 	bdev->internal.unregister_ctx = cb_arg;
6866 	spdk_spin_unlock(&bdev->internal.spinlock);
6867 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
6868 
6869 	spdk_bdev_set_qd_sampling_period(bdev, 0);
6870 
6871 	spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
6872 				   bdev_unregister);
6873 }
6874 
6875 int
6876 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
6877 			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
6878 {
6879 	struct spdk_bdev_desc *desc;
6880 	struct spdk_bdev *bdev;
6881 	int rc;
6882 
6883 	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
6884 	if (rc != 0) {
6885 		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
6886 		return rc;
6887 	}
6888 
6889 	bdev = spdk_bdev_desc_get_bdev(desc);
6890 
6891 	if (bdev->module != module) {
6892 		spdk_bdev_close(desc);
6893 		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
6894 			    bdev_name);
6895 		return -ENODEV;
6896 	}
6897 
6898 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
6899 
6900 	spdk_bdev_close(desc);
6901 
6902 	return 0;
6903 }
6904 
6905 static int
6906 bdev_start_qos(struct spdk_bdev *bdev)
6907 {
6908 	struct set_qos_limit_ctx *ctx;
6909 
6910 	/* Enable QoS */
6911 	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
6912 		ctx = calloc(1, sizeof(*ctx));
6913 		if (ctx == NULL) {
6914 			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
6915 			return -ENOMEM;
6916 		}
6917 		ctx->bdev = bdev;
6918 		spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
6919 	}
6920 
6921 	return 0;
6922 }
6923 
6924 static int
6925 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
6926 {
6927 	struct spdk_thread *thread;
6928 	int rc = 0;
6929 
6930 	thread = spdk_get_thread();
6931 	if (!thread) {
6932 		SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
6933 		return -ENOTSUP;
6934 	}
6935 
6936 	SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
6937 		      spdk_get_thread());
6938 
6939 	desc->bdev = bdev;
6940 	desc->thread = thread;
6941 	desc->write = write;
6942 
6943 	spdk_spin_lock(&bdev->internal.spinlock);
6944 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
6945 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
6946 		spdk_spin_unlock(&bdev->internal.spinlock);
6947 		return -ENODEV;
6948 	}
6949 
6950 	if (write && bdev->internal.claim_module) {
6951 		SPDK_ERRLOG("Could not open %s - %s module already claimed it\n",
6952 			    bdev->name, bdev->internal.claim_module->name);
6953 		spdk_spin_unlock(&bdev->internal.spinlock);
6954 		return -EPERM;
6955 	}
6956 
6957 	rc = bdev_start_qos(bdev);
6958 	if (rc != 0) {
6959 		SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
6960 		spdk_spin_unlock(&bdev->internal.spinlock);
6961 		return rc;
6962 	}
6963 
6964 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
6965 
6966 	spdk_spin_unlock(&bdev->internal.spinlock);
6967 
6968 	return 0;
6969 }
6970 
6971 static int
6972 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
6973 		struct spdk_bdev_desc **_desc)
6974 {
6975 	struct spdk_bdev_desc *desc;
6976 	unsigned int event_id;
6977 
6978 	desc = calloc(1, sizeof(*desc));
6979 	if (desc == NULL) {
6980 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
6981 		return -ENOMEM;
6982 	}
6983 
6984 	TAILQ_INIT(&desc->pending_media_events);
6985 	TAILQ_INIT(&desc->free_media_events);
6986 
6987 	desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
6988 	desc->callback.event_fn = event_cb;
6989 	desc->callback.ctx = event_ctx;
6990 	spdk_spin_init(&desc->spinlock);
6991 
6992 	if (bdev->media_events) {
6993 		desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
6994 						   sizeof(*desc->media_events_buffer));
6995 		if (desc->media_events_buffer == NULL) {
6996 			SPDK_ERRLOG("Failed to initialize media event pool\n");
6997 			bdev_desc_free(desc);
6998 			return -ENOMEM;
6999 		}
7000 
7001 		for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) {
7002 			TAILQ_INSERT_TAIL(&desc->free_media_events,
7003 					  &desc->media_events_buffer[event_id], tailq);
7004 		}
7005 	}
7006 
7007 	*_desc = desc;
7008 
7009 	return 0;
7010 }
7011 
7012 int
7013 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
7014 		   void *event_ctx, struct spdk_bdev_desc **_desc)
7015 {
7016 	struct spdk_bdev_desc *desc;
7017 	struct spdk_bdev *bdev;
7018 	int rc;
7019 
7020 	if (event_cb == NULL) {
7021 		SPDK_ERRLOG("Missing event callback function\n");
7022 		return -EINVAL;
7023 	}
7024 
7025 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7026 
7027 	bdev = bdev_get_by_name(bdev_name);
7028 
7029 	if (bdev == NULL) {
7030 		SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
7031 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7032 		return -ENODEV;
7033 	}
7034 
7035 	rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc);
7036 	if (rc != 0) {
7037 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7038 		return rc;
7039 	}
7040 
7041 	rc = bdev_open(bdev, write, desc);
7042 	if (rc != 0) {
7043 		bdev_desc_free(desc);
7044 		desc = NULL;
7045 	}
7046 
7047 	*_desc = desc;
7048 
7049 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7050 
7051 	return rc;
7052 }
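
/*
 * Editor's sketch (not part of the original source): typical open/close pattern for
 * spdk_bdev_open_ext().  SPDK_BDEV_EVENT_REMOVE is assumed to be the hot-remove event
 * type from spdk/bdev.h; example_* names are hypothetical.  Kept under #if 0.
 */
#if 0
static void
example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	struct spdk_bdev_desc **desc = event_ctx;

	if (type == SPDK_BDEV_EVENT_REMOVE) {
		/* The bdev is being unregistered; close our descriptor so the
		 * unregistration can finish. */
		spdk_bdev_close(*desc);
		*desc = NULL;
	}
}

static int
example_open(struct spdk_bdev_desc **desc)
{
	/* Read/write open of the bdev named "Malloc0"; *desc is valid when rc == 0. */
	return spdk_bdev_open_ext("Malloc0", true, example_event_cb, desc, desc);
}
#endif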
7053 
7054 static void
7055 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
7056 {
7057 	int rc;
7058 
7059 	spdk_spin_lock(&bdev->internal.spinlock);
7060 	spdk_spin_lock(&desc->spinlock);
7061 
7062 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
7063 
7064 	desc->closed = true;
7065 
7066 	if (0 == desc->refs) {
7067 		spdk_spin_unlock(&desc->spinlock);
7068 		bdev_desc_free(desc);
7069 	} else {
7070 		spdk_spin_unlock(&desc->spinlock);
7071 	}
7072 
7073 	/* If no more descriptors, kill QoS channel */
7074 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
7075 		SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
7076 			      bdev->name, spdk_get_thread());
7077 
7078 		if (bdev_qos_destroy(bdev)) {
7079 			/* There isn't anything we can do to recover here. Just let the
7080 			 * old QoS poller keep running. The QoS handling won't change
7081 			 * cores when the user allocates a new channel, but it won't break. */
7082 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
7083 		}
7084 	}
7085 
7086 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
7087 		rc = bdev_unregister_unsafe(bdev);
7088 		spdk_spin_unlock(&bdev->internal.spinlock);
7089 
7090 		if (rc == 0) {
7091 			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7092 		}
7093 	} else {
7094 		spdk_spin_unlock(&bdev->internal.spinlock);
7095 	}
7096 }
7097 
7098 void
7099 spdk_bdev_close(struct spdk_bdev_desc *desc)
7100 {
7101 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7102 
7103 	SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
7104 		      spdk_get_thread());
7105 
7106 	assert(desc->thread == spdk_get_thread());
7107 
7108 	spdk_poller_unregister(&desc->io_timeout_poller);
7109 
7110 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7111 
7112 	bdev_close(bdev, desc);
7113 
7114 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7115 }
7116 
7117 static void
7118 bdev_register_finished(void *arg)
7119 {
7120 	struct spdk_bdev_desc *desc = arg;
7121 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7122 
7123 	spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
7124 
7125 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7126 
7127 	bdev_close(bdev, desc);
7128 
7129 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7130 }
7131 
7132 int
7133 spdk_bdev_register(struct spdk_bdev *bdev)
7134 {
7135 	struct spdk_bdev_desc *desc;
7136 	int rc;
7137 
7138 	rc = bdev_register(bdev);
7139 	if (rc != 0) {
7140 		return rc;
7141 	}
7142 
7143 	/* A descriptor is opened to prevent bdev deletion during examination */
7144 	rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
7145 	if (rc != 0) {
7146 		spdk_bdev_unregister(bdev, NULL, NULL);
7147 		return rc;
7148 	}
7149 
7150 	rc = bdev_open(bdev, false, desc);
7151 	if (rc != 0) {
7152 		bdev_desc_free(desc);
7153 		spdk_bdev_unregister(bdev, NULL, NULL);
7154 		return rc;
7155 	}
7156 
7157 	/* Examine configuration before initializing I/O */
7158 	bdev_examine(bdev);
7159 
7160 	rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
7161 	if (rc != 0) {
7162 		bdev_close(bdev, desc);
7163 		spdk_bdev_unregister(bdev, NULL, NULL);
7164 	}
7165 
7166 	return rc;
7167 }
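
/*
 * Editor's sketch (not part of the original source): minimal shape of a backend module
 * registering a bdev.  struct example_disk and example_fn_table are hypothetical, and
 * the spdk_bdev fields shown are assumed from spdk/bdev_module.h; a real module must
 * also supply a fully populated fn_table.  Kept under #if 0.
 */
#if 0
struct example_disk {
	struct spdk_bdev	bdev;
	uint64_t		num_blocks;
	char			*name;
};

static int
example_register_disk(struct example_disk *disk, struct spdk_bdev_module *my_module,
		      const struct spdk_bdev_fn_table *example_fn_table)
{
	struct spdk_bdev *bdev = &disk->bdev;

	bdev->name = disk->name;		/* must be unique among registered bdevs */
	bdev->blocklen = 512;
	bdev->blockcnt = disk->num_blocks;
	bdev->ctxt = disk;
	bdev->module = my_module;
	bdev->fn_table = example_fn_table;

	/* 0 on success; other modules' examine callbacks run before the temporary
	 * descriptor opened in spdk_bdev_register() above is closed again. */
	return spdk_bdev_register(bdev);
}
#endif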
7168 
7169 int
7170 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
7171 			    struct spdk_bdev_module *module)
7172 {
7173 	if (bdev->internal.claim_module != NULL) {
7174 		SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
7175 			    bdev->internal.claim_module->name);
7176 		return -EPERM;
7177 	}
7178 
7179 	if (desc && !desc->write) {
7180 		desc->write = true;
7181 	}
7182 
7183 	bdev->internal.claim_module = module;
7184 	return 0;
7185 }
7186 
7187 void
7188 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
7189 {
7190 	assert(bdev->internal.claim_module != NULL);
7191 	bdev->internal.claim_module = NULL;
7192 }
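
/*
 * Editor's sketch (not part of the original source): a virtual bdev module claiming the
 * base bdev it has opened so no other module can claim it or open it for writes, then
 * releasing the claim when it tears down.  example_* names are hypothetical.
 */
#if 0
static int
example_claim_base(struct spdk_bdev_desc *base_desc, struct spdk_bdev_module *my_module)
{
	struct spdk_bdev *base = spdk_bdev_desc_get_bdev(base_desc);
	int rc;

	rc = spdk_bdev_module_claim_bdev(base, base_desc, my_module);
	if (rc != 0) {
		return rc;	/* -EPERM: another module already holds the claim */
	}

	/* ... build the virtual bdev on top of base; on teardown: ... */

	spdk_bdev_module_release_bdev(base);
	return 0;
}
#endif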
7193 
7194 struct spdk_bdev *
7195 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
7196 {
7197 	assert(desc != NULL);
7198 	return desc->bdev;
7199 }
7200 
7201 int
7202 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
7203 {
7204 	struct spdk_bdev *bdev, *tmp;
7205 	struct spdk_bdev_desc *desc;
7206 	int rc = 0;
7207 
7208 	assert(fn != NULL);
7209 
7210 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7211 	bdev = spdk_bdev_first();
7212 	while (bdev != NULL) {
7213 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
7214 		if (rc != 0) {
7215 			break;
7216 		}
7217 		rc = bdev_open(bdev, false, desc);
7218 		if (rc != 0) {
7219 			bdev_desc_free(desc);
7220 			if (rc == -ENODEV) {
7221 				/* Ignore the error and move to the next bdev. */
7222 				rc = 0;
7223 				bdev = spdk_bdev_next(bdev);
7224 				continue;
7225 			}
7226 			break;
7227 		}
7228 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7229 
7230 		rc = fn(ctx, bdev);
7231 
7232 		spdk_spin_lock(&g_bdev_mgr.spinlock);
7233 		tmp = spdk_bdev_next(bdev);
7234 		bdev_close(bdev, desc);
7235 		if (rc != 0) {
7236 			break;
7237 		}
7238 		bdev = tmp;
7239 	}
7240 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7241 
7242 	return rc;
7243 }
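
/*
 * Editor's sketch (not part of the original source): enumerating registered bdevs with
 * spdk_for_each_bdev().  Each bdev is opened read-only around the callback (see above),
 * so it cannot disappear while the callback runs.  example_* names are hypothetical.
 */
#if 0
static int
example_print_bdev(void *ctx, struct spdk_bdev *bdev)
{
	int *count = ctx;

	(*count)++;
	SPDK_NOTICELOG("found bdev %s\n", spdk_bdev_get_name(bdev));

	return 0;	/* returning non-zero stops the iteration early */
}

static void
example_list_bdevs(void)
{
	int count = 0;

	spdk_for_each_bdev(&count, example_print_bdev);
	SPDK_NOTICELOG("%d bdevs registered\n", count);
}
#endif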
7244 
7245 int
7246 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
7247 {
7248 	struct spdk_bdev *bdev, *tmp;
7249 	struct spdk_bdev_desc *desc;
7250 	int rc = 0;
7251 
7252 	assert(fn != NULL);
7253 
7254 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7255 	bdev = spdk_bdev_first_leaf();
7256 	while (bdev != NULL) {
7257 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
7258 		if (rc != 0) {
7259 			break;
7260 		}
7261 		rc = bdev_open(bdev, false, desc);
7262 		if (rc != 0) {
7263 			bdev_desc_free(desc);
7264 			if (rc == -ENODEV) {
7265 				/* Ignore the error and move to the next bdev. */
7266 				rc = 0;
7267 				bdev = spdk_bdev_next_leaf(bdev);
7268 				continue;
7269 			}
7270 			break;
7271 		}
7272 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7273 
7274 		rc = fn(ctx, bdev);
7275 
7276 		spdk_spin_lock(&g_bdev_mgr.spinlock);
7277 		tmp = spdk_bdev_next_leaf(bdev);
7278 		bdev_close(bdev, desc);
7279 		if (rc != 0) {
7280 			break;
7281 		}
7282 		bdev = tmp;
7283 	}
7284 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7285 
7286 	return rc;
7287 }
7288 
7289 void
7290 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
7291 {
7292 	struct iovec *iovs;
7293 	int iovcnt;
7294 
7295 	if (bdev_io == NULL) {
7296 		return;
7297 	}
7298 
7299 	switch (bdev_io->type) {
7300 	case SPDK_BDEV_IO_TYPE_READ:
7301 	case SPDK_BDEV_IO_TYPE_WRITE:
7302 	case SPDK_BDEV_IO_TYPE_ZCOPY:
7303 		iovs = bdev_io->u.bdev.iovs;
7304 		iovcnt = bdev_io->u.bdev.iovcnt;
7305 		break;
7306 	default:
7307 		iovs = NULL;
7308 		iovcnt = 0;
7309 		break;
7310 	}
7311 
7312 	if (iovp) {
7313 		*iovp = iovs;
7314 	}
7315 	if (iovcntp) {
7316 		*iovcntp = iovcnt;
7317 	}
7318 }
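
/*
 * Editor's sketch (not part of the original source): reading back the data buffers of a
 * completed I/O from its completion callback.  example_read_done is hypothetical; the
 * callback signature matches spdk_bdev_io_completion_cb as used elsewhere in this file.
 */
#if 0
static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct iovec *iovs;
	size_t total = 0;
	int iovcnt, i;

	if (success) {
		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
		for (i = 0; i < iovcnt; i++) {
			total += iovs[i].iov_len;
		}
		SPDK_NOTICELOG("read returned %zu bytes in %d iovecs\n", total, iovcnt);
	}

	spdk_bdev_free_io(bdev_io);
}
#endif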
7319 
7320 void *
7321 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
7322 {
7323 	if (bdev_io == NULL) {
7324 		return NULL;
7325 	}
7326 
7327 	if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
7328 		return NULL;
7329 	}
7330 
7331 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
7332 	    bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
7333 		return bdev_io->u.bdev.md_buf;
7334 	}
7335 
7336 	return NULL;
7337 }
7338 
7339 void *
7340 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
7341 {
7342 	if (bdev_io == NULL) {
7343 		assert(false);
7344 		return NULL;
7345 	}
7346 
7347 	return bdev_io->internal.caller_ctx;
7348 }
7349 
7350 void
7351 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
7352 {
7353 
7354 	if (spdk_bdev_module_list_find(bdev_module->name)) {
7355 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
7356 		assert(false);
7357 	}
7358 
7359 	/*
7360 	 * Modules with examine callbacks must be initialized first, so they are
7361 	 *  ready to handle examine callbacks from later modules that will
7362 	 *  register physical bdevs.
7363 	 */
7364 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
7365 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
7366 	} else {
7367 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
7368 	}
7369 }
7370 
7371 struct spdk_bdev_module *
7372 spdk_bdev_module_list_find(const char *name)
7373 {
7374 	struct spdk_bdev_module *bdev_module;
7375 
7376 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
7377 		if (strcmp(name, bdev_module->name) == 0) {
7378 			break;
7379 		}
7380 	}
7381 
7382 	return bdev_module;
7383 }
7384 
7385 static void
7386 bdev_write_zero_buffer_next(void *_bdev_io)
7387 {
7388 	struct spdk_bdev_io *bdev_io = _bdev_io;
7389 	uint64_t num_bytes, num_blocks;
7390 	void *md_buf = NULL;
7391 	int rc;
7392 
7393 	num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) *
7394 			     bdev_io->u.bdev.split_remaining_num_blocks,
7395 			     ZERO_BUFFER_SIZE);
7396 	num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev);
7397 	num_blocks -= num_blocks % bdev_io->bdev->write_unit_size;
7398 
7399 	if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
7400 		md_buf = (char *)g_bdev_mgr.zero_buffer +
7401 			 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
7402 	}
7403 
7404 	rc = bdev_write_blocks_with_md(bdev_io->internal.desc,
7405 				       spdk_io_channel_from_ctx(bdev_io->internal.ch),
7406 				       g_bdev_mgr.zero_buffer, md_buf,
7407 				       bdev_io->u.bdev.split_current_offset_blocks, num_blocks,
7408 				       bdev_write_zero_buffer_done, bdev_io);
7409 	if (rc == 0) {
7410 		bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks;
7411 		bdev_io->u.bdev.split_current_offset_blocks += num_blocks;
7412 	} else if (rc == -ENOMEM) {
7413 		bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next);
7414 	} else {
7415 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7416 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
7417 	}
7418 }
7419 
7420 static void
7421 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
7422 {
7423 	struct spdk_bdev_io *parent_io = cb_arg;
7424 
7425 	spdk_bdev_free_io(bdev_io);
7426 
7427 	if (!success) {
7428 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7429 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
7430 		return;
7431 	}
7432 
7433 	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
7434 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7435 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
7436 		return;
7437 	}
7438 
7439 	bdev_write_zero_buffer_next(parent_io);
7440 }
7441 
7442 static void
7443 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
7444 {
7445 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
7446 	ctx->bdev->internal.qos_mod_in_progress = false;
7447 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
7448 
7449 	if (ctx->cb_fn) {
7450 		ctx->cb_fn(ctx->cb_arg, status);
7451 	}
7452 	free(ctx);
7453 }
7454 
7455 static void
7456 bdev_disable_qos_done(void *cb_arg)
7457 {
7458 	struct set_qos_limit_ctx *ctx = cb_arg;
7459 	struct spdk_bdev *bdev = ctx->bdev;
7460 	struct spdk_bdev_io *bdev_io;
7461 	struct spdk_bdev_qos *qos;
7462 
7463 	spdk_spin_lock(&bdev->internal.spinlock);
7464 	qos = bdev->internal.qos;
7465 	bdev->internal.qos = NULL;
7466 	spdk_spin_unlock(&bdev->internal.spinlock);
7467 
7468 	while (!TAILQ_EMPTY(&qos->queued)) {
7469 		/* Send queued I/O back to their original thread for resubmission. */
7470 		bdev_io = TAILQ_FIRST(&qos->queued);
7471 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
7472 
7473 		if (bdev_io->internal.io_submit_ch) {
7474 			/*
7475 			 * Channel was changed when sending it to the QoS thread - change it back
7476 			 *  before sending it back to the original thread.
7477 			 */
7478 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
7479 			bdev_io->internal.io_submit_ch = NULL;
7480 		}
7481 
7482 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7483 				     _bdev_io_submit, bdev_io);
7484 	}
7485 
7486 	if (qos->thread != NULL) {
7487 		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
7488 		spdk_poller_unregister(&qos->poller);
7489 	}
7490 
7491 	free(qos);
7492 
7493 	bdev_set_qos_limit_done(ctx, 0);
7494 }
7495 
7496 static void
7497 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
7498 {
7499 	struct set_qos_limit_ctx *ctx = _ctx;
7500 	struct spdk_thread *thread;
7501 
7502 	spdk_spin_lock(&bdev->internal.spinlock);
7503 	thread = bdev->internal.qos->thread;
7504 	spdk_spin_unlock(&bdev->internal.spinlock);
7505 
7506 	if (thread != NULL) {
7507 		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
7508 	} else {
7509 		bdev_disable_qos_done(ctx);
7510 	}
7511 }
7512 
7513 static void
7514 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7515 		     struct spdk_io_channel *ch, void *_ctx)
7516 {
7517 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
7518 
7519 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
7520 
7521 	spdk_bdev_for_each_channel_continue(i, 0);
7522 }
7523 
7524 static void
7525 bdev_update_qos_rate_limit_msg(void *cb_arg)
7526 {
7527 	struct set_qos_limit_ctx *ctx = cb_arg;
7528 	struct spdk_bdev *bdev = ctx->bdev;
7529 
7530 	spdk_spin_lock(&bdev->internal.spinlock);
7531 	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
7532 	spdk_spin_unlock(&bdev->internal.spinlock);
7533 
7534 	bdev_set_qos_limit_done(ctx, 0);
7535 }
7536 
7537 static void
7538 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7539 		    struct spdk_io_channel *ch, void *_ctx)
7540 {
7541 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
7542 
7543 	spdk_spin_lock(&bdev->internal.spinlock);
7544 	bdev_enable_qos(bdev, bdev_ch);
7545 	spdk_spin_unlock(&bdev->internal.spinlock);
7546 	spdk_bdev_for_each_channel_continue(i, 0);
7547 }
7548 
7549 static void
7550 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
7551 {
7552 	struct set_qos_limit_ctx *ctx = _ctx;
7553 
7554 	bdev_set_qos_limit_done(ctx, status);
7555 }
7556 
7557 static void
7558 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
7559 {
7560 	int i;
7561 
7562 	assert(bdev->internal.qos != NULL);
7563 
7564 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
7565 		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
7566 			bdev->internal.qos->rate_limits[i].limit = limits[i];
7567 
7568 			if (limits[i] == 0) {
7569 				bdev->internal.qos->rate_limits[i].limit =
7570 					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
7571 			}
7572 		}
7573 	}
7574 }
7575 
7576 void
7577 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
7578 			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
7579 {
7580 	struct set_qos_limit_ctx	*ctx;
7581 	uint32_t			limit_set_complement;
7582 	uint64_t			min_limit_per_sec;
7583 	int				i;
7584 	bool				disable_rate_limit = true;
7585 
7586 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
7587 		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
7588 			continue;
7589 		}
7590 
7591 		if (limits[i] > 0) {
7592 			disable_rate_limit = false;
7593 		}
7594 
7595 		if (bdev_qos_is_iops_rate_limit(i) == true) {
7596 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
7597 		} else {
7598 			/* Change from megabyte to byte rate limit */
7599 			limits[i] = limits[i] * 1024 * 1024;
7600 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
7601 		}
7602 
7603 		limit_set_complement = limits[i] % min_limit_per_sec;
7604 		if (limit_set_complement) {
7605 			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
7606 				    limits[i], min_limit_per_sec);
7607 			limits[i] += min_limit_per_sec - limit_set_complement;
7608 			SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
7609 		}
7610 	}
7611 
7612 	ctx = calloc(1, sizeof(*ctx));
7613 	if (ctx == NULL) {
7614 		cb_fn(cb_arg, -ENOMEM);
7615 		return;
7616 	}
7617 
7618 	ctx->cb_fn = cb_fn;
7619 	ctx->cb_arg = cb_arg;
7620 	ctx->bdev = bdev;
7621 
7622 	spdk_spin_lock(&bdev->internal.spinlock);
7623 	if (bdev->internal.qos_mod_in_progress) {
7624 		spdk_spin_unlock(&bdev->internal.spinlock);
7625 		free(ctx);
7626 		cb_fn(cb_arg, -EAGAIN);
7627 		return;
7628 	}
7629 	bdev->internal.qos_mod_in_progress = true;
7630 
7631 	if (disable_rate_limit == true && bdev->internal.qos) {
7632 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
7633 			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
7634 			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
7635 			     bdev->internal.qos->rate_limits[i].limit !=
7636 			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
7637 				disable_rate_limit = false;
7638 				break;
7639 			}
7640 		}
7641 	}
7642 
7643 	if (disable_rate_limit == false) {
7644 		if (bdev->internal.qos == NULL) {
7645 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
7646 			if (!bdev->internal.qos) {
7647 				spdk_spin_unlock(&bdev->internal.spinlock);
7648 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
7649 				bdev_set_qos_limit_done(ctx, -ENOMEM);
7650 				return;
7651 			}
7652 		}
7653 
7654 		if (bdev->internal.qos->thread == NULL) {
7655 			/* Enabling */
7656 			bdev_set_qos_rate_limits(bdev, limits);
7657 
7658 			spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
7659 						   bdev_enable_qos_done);
7660 		} else {
7661 			/* Updating */
7662 			bdev_set_qos_rate_limits(bdev, limits);
7663 
7664 			spdk_thread_send_msg(bdev->internal.qos->thread,
7665 					     bdev_update_qos_rate_limit_msg, ctx);
7666 		}
7667 	} else {
7668 		if (bdev->internal.qos != NULL) {
7669 			bdev_set_qos_rate_limits(bdev, limits);
7670 
7671 			/* Disabling */
7672 			spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
7673 						   bdev_disable_qos_msg_done);
7674 		} else {
7675 			spdk_spin_unlock(&bdev->internal.spinlock);
7676 			bdev_set_qos_limit_done(ctx, 0);
7677 			return;
7678 		}
7679 	}
7680 
7681 	spdk_spin_unlock(&bdev->internal.spinlock);
7682 }
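
/*
 * Editor's sketch (not part of the original source): capping a bdev to 10k read/write
 * IOPS while leaving the byte-rate limits untouched.  The enumerator name
 * SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT is assumed from spdk/bdev.h; example_* names are
 * hypothetical.  Note that byte limits are passed in MB/s (converted above), while IOPS
 * limits are passed as a raw count.
 */
#if 0
static void
example_qos_done(void *cb_arg, int status)
{
	SPDK_NOTICELOG("QoS update finished: %d\n", status);
}

static void
example_limit_iops(struct spdk_bdev *bdev)
{
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
	int i;

	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		limits[i] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;	/* leave unspecified limits alone */
	}
	limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;

	spdk_bdev_set_qos_rate_limits(bdev, limits, example_qos_done, NULL);
}
#endif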
7683 
7684 struct spdk_bdev_histogram_ctx {
7685 	spdk_bdev_histogram_status_cb cb_fn;
7686 	void *cb_arg;
7687 	struct spdk_bdev *bdev;
7688 	int status;
7689 };
7690 
7691 static void
7692 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
7693 {
7694 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
7695 
7696 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
7697 	ctx->bdev->internal.histogram_in_progress = false;
7698 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
7699 	ctx->cb_fn(ctx->cb_arg, ctx->status);
7700 	free(ctx);
7701 }
7702 
7703 static void
7704 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7705 			       struct spdk_io_channel *_ch, void *_ctx)
7706 {
7707 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7708 
7709 	if (ch->histogram != NULL) {
7710 		spdk_histogram_data_free(ch->histogram);
7711 		ch->histogram = NULL;
7712 	}
7713 	spdk_bdev_for_each_channel_continue(i, 0);
7714 }
7715 
7716 static void
7717 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
7718 {
7719 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
7720 
7721 	if (status != 0) {
7722 		ctx->status = status;
7723 		ctx->bdev->internal.histogram_enabled = false;
7724 		spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
7725 					   bdev_histogram_disable_channel_cb);
7726 	} else {
7727 		spdk_spin_lock(&ctx->bdev->internal.spinlock);
7728 		ctx->bdev->internal.histogram_in_progress = false;
7729 		spdk_spin_unlock(&ctx->bdev->internal.spinlock);
7730 		ctx->cb_fn(ctx->cb_arg, ctx->status);
7731 		free(ctx);
7732 	}
7733 }
7734 
7735 static void
7736 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7737 			      struct spdk_io_channel *_ch, void *_ctx)
7738 {
7739 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7740 	int status = 0;
7741 
7742 	if (ch->histogram == NULL) {
7743 		ch->histogram = spdk_histogram_data_alloc();
7744 		if (ch->histogram == NULL) {
7745 			status = -ENOMEM;
7746 		}
7747 	}
7748 
7749 	spdk_bdev_for_each_channel_continue(i, status);
7750 }
7751 
7752 void
7753 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
7754 			   void *cb_arg, bool enable)
7755 {
7756 	struct spdk_bdev_histogram_ctx *ctx;
7757 
7758 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
7759 	if (ctx == NULL) {
7760 		cb_fn(cb_arg, -ENOMEM);
7761 		return;
7762 	}
7763 
7764 	ctx->bdev = bdev;
7765 	ctx->status = 0;
7766 	ctx->cb_fn = cb_fn;
7767 	ctx->cb_arg = cb_arg;
7768 
7769 	spdk_spin_lock(&bdev->internal.spinlock);
7770 	if (bdev->internal.histogram_in_progress) {
7771 		spdk_spin_unlock(&bdev->internal.spinlock);
7772 		free(ctx);
7773 		cb_fn(cb_arg, -EAGAIN);
7774 		return;
7775 	}
7776 
7777 	bdev->internal.histogram_in_progress = true;
7778 	spdk_spin_unlock(&bdev->internal.spinlock);
7779 
7780 	bdev->internal.histogram_enabled = enable;
7781 
7782 	if (enable) {
7783 		/* Allocate histogram for each channel */
7784 		spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
7785 					   bdev_histogram_enable_channel_cb);
7786 	} else {
7787 		spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
7788 					   bdev_histogram_disable_channel_cb);
7789 	}
7790 }
7791 
7792 struct spdk_bdev_histogram_data_ctx {
7793 	spdk_bdev_histogram_data_cb cb_fn;
7794 	void *cb_arg;
7795 	struct spdk_bdev *bdev;
7796 	/** merged histogram data from all channels */
7797 	struct spdk_histogram_data	*histogram;
7798 };
7799 
7800 static void
7801 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
7802 {
7803 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
7804 
7805 	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
7806 	free(ctx);
7807 }
7808 
7809 static void
7810 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7811 			   struct spdk_io_channel *_ch, void *_ctx)
7812 {
7813 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7814 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
7815 	int status = 0;
7816 
7817 	if (ch->histogram == NULL) {
7818 		status = -EFAULT;
7819 	} else {
7820 		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
7821 	}
7822 
7823 	spdk_bdev_for_each_channel_continue(i, status);
7824 }
7825 
7826 void
7827 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
7828 			spdk_bdev_histogram_data_cb cb_fn,
7829 			void *cb_arg)
7830 {
7831 	struct spdk_bdev_histogram_data_ctx *ctx;
7832 
7833 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
7834 	if (ctx == NULL) {
7835 		cb_fn(cb_arg, -ENOMEM, NULL);
7836 		return;
7837 	}
7838 
7839 	ctx->bdev = bdev;
7840 	ctx->cb_fn = cb_fn;
7841 	ctx->cb_arg = cb_arg;
7842 
7843 	ctx->histogram = histogram;
7844 
7845 	spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
7846 				   bdev_histogram_get_channel_cb);
7847 }
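
/*
 * Editor's sketch (not part of the original source): collecting the merged latency
 * histogram of a bdev.  It assumes spdk_bdev_histogram_enable(bdev, ..., true) has
 * already completed; example_* names are hypothetical.
 */
#if 0
static void
example_histogram_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
{
	if (status == 0) {
		/* histogram now holds the data merged from every channel and can be
		 * walked with the spdk_histogram_data API. */
	}
	spdk_histogram_data_free(histogram);
}

static void
example_collect_histogram(struct spdk_bdev *bdev)
{
	struct spdk_histogram_data *h;

	h = spdk_histogram_data_alloc();
	if (h == NULL) {
		return;
	}

	spdk_bdev_histogram_get(bdev, h, example_histogram_cb, NULL);
}
#endif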
7848 
7849 void
7850 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
7851 				void *cb_arg)
7852 {
7853 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
7854 	int status = 0;
7855 
7856 	assert(cb_fn != NULL);
7857 
7858 	if (bdev_ch->histogram == NULL) {
7859 		status = -EFAULT;
7860 	}
7861 	cb_fn(cb_arg, status, bdev_ch->histogram);
7862 }
7863 
7864 size_t
7865 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
7866 			   size_t max_events)
7867 {
7868 	struct media_event_entry *entry;
7869 	size_t num_events = 0;
7870 
7871 	for (; num_events < max_events; ++num_events) {
7872 		entry = TAILQ_FIRST(&desc->pending_media_events);
7873 		if (entry == NULL) {
7874 			break;
7875 		}
7876 
7877 		events[num_events] = entry->event;
7878 		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
7879 		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
7880 	}
7881 
7882 	return num_events;
7883 }
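
/*
 * Editor's sketch (not part of the original source): draining pending media management
 * events.  This would typically run from the descriptor's event callback, which
 * spdk_bdev_notify_media_management() below invokes; example_* names are hypothetical.
 */
#if 0
static void
example_handle_media_events(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev_media_event events[16];
	size_t i, num;

	do {
		num = spdk_bdev_get_media_events(desc, events, SPDK_COUNTOF(events));
		for (i = 0; i < num; i++) {
			/* events[i] describes an LBA range that needs media management. */
		}
	} while (num == SPDK_COUNTOF(events));
}
#endif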
7884 
7885 int
7886 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
7887 			    size_t num_events)
7888 {
7889 	struct spdk_bdev_desc *desc;
7890 	struct media_event_entry *entry;
7891 	size_t event_id;
7892 	int rc = 0;
7893 
7894 	assert(bdev->media_events);
7895 
7896 	spdk_spin_lock(&bdev->internal.spinlock);
7897 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
7898 		if (desc->write) {
7899 			break;
7900 		}
7901 	}
7902 
7903 	if (desc == NULL || desc->media_events_buffer == NULL) {
7904 		rc = -ENODEV;
7905 		goto out;
7906 	}
7907 
7908 	for (event_id = 0; event_id < num_events; ++event_id) {
7909 		entry = TAILQ_FIRST(&desc->free_media_events);
7910 		if (entry == NULL) {
7911 			break;
7912 		}
7913 
7914 		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
7915 		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
7916 		entry->event = events[event_id];
7917 	}
7918 
7919 	rc = event_id;
7920 out:
7921 	spdk_spin_unlock(&bdev->internal.spinlock);
7922 	return rc;
7923 }
7924 
7925 void
7926 spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
7927 {
7928 	struct spdk_bdev_desc *desc;
7929 
7930 	spdk_spin_lock(&bdev->internal.spinlock);
7931 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
7932 		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
7933 			desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev,
7934 						desc->callback.ctx);
7935 		}
7936 	}
7937 	spdk_spin_unlock(&bdev->internal.spinlock);
7938 }
7939 
7940 struct locked_lba_range_ctx {
7941 	struct lba_range		range;
7942 	struct spdk_bdev		*bdev;
7943 	struct lba_range		*current_range;
7944 	struct lba_range		*owner_range;
7945 	struct spdk_poller		*poller;
7946 	lock_range_cb			cb_fn;
7947 	void				*cb_arg;
7948 };
7949 
7950 static void
7951 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
7952 {
7953 	struct locked_lba_range_ctx *ctx = _ctx;
7954 
7955 	ctx->cb_fn(ctx->cb_arg, -ENOMEM);
7956 	free(ctx);
7957 }
7958 
7959 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
7960 		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
7961 
7962 static void
7963 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
7964 {
7965 	struct locked_lba_range_ctx *ctx = _ctx;
7966 
7967 	if (status == -ENOMEM) {
7968 		/* One of the channels could not allocate a range object.
7969 		 * So we have to go back and clean up any ranges that were
7970 		 * allocated successfully before we return error status to
7971 		 * the caller.  We can reuse the unlock function to do that
7972 		 * clean up.
7973 		 */
7974 		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
7975 					   bdev_lock_error_cleanup_cb);
7976 		return;
7977 	}
7978 
7979 	/* All channels have locked this range and no I/O overlapping the range
7980 	 * is outstanding!  Set the owner_ch for the range object for the
7981 	 * locking channel, so that this channel will know that it is allowed
7982 	 * to write to this range.
7983 	 */
7984 	ctx->owner_range->owner_ch = ctx->range.owner_ch;
7985 	ctx->cb_fn(ctx->cb_arg, status);
7986 
7987 	/* Don't free the ctx here.  Its range is in the bdev's global list of
7988 	 * locked ranges still, and will be removed and freed when this range
7989 	 * is later unlocked.
7990 	 */
7991 }
7992 
7993 static int
7994 bdev_lock_lba_range_check_io(void *_i)
7995 {
7996 	struct spdk_bdev_channel_iter *i = _i;
7997 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
7998 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7999 	struct locked_lba_range_ctx *ctx = i->ctx;
8000 	struct lba_range *range = ctx->current_range;
8001 	struct spdk_bdev_io *bdev_io;
8002 
8003 	spdk_poller_unregister(&ctx->poller);
8004 
8005 	/* The range is now in the locked_ranges, so no new IO can be submitted to this
8006 	 * range.  But we need to wait until any outstanding I/O overlapping with this range
8007 	 * has completed.
8008 	 */
8009 	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
8010 		if (bdev_io_range_is_locked(bdev_io, range)) {
8011 			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
8012 			return SPDK_POLLER_BUSY;
8013 		}
8014 	}
8015 
8016 	spdk_bdev_for_each_channel_continue(i, 0);
8017 	return SPDK_POLLER_BUSY;
8018 }
8019 
8020 static void
8021 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8022 				struct spdk_io_channel *_ch, void *_ctx)
8023 {
8024 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
8025 	struct locked_lba_range_ctx *ctx = _ctx;
8026 	struct lba_range *range;
8027 
8028 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
8029 		if (range->length == ctx->range.length &&
8030 		    range->offset == ctx->range.offset &&
8031 		    range->locked_ctx == ctx->range.locked_ctx) {
8032 			/* This range already exists on this channel, so don't add
8033 			 * it again.  This can happen when a new channel is created
8034 			 * while the for_each_channel operation is in progress.
8035 			 * Do not check for outstanding I/O in that case, since the
8036 			 * range was locked before any I/O could be submitted to the
8037 			 * new channel.
8038 			 */
8039 			spdk_bdev_for_each_channel_continue(i, 0);
8040 			return;
8041 		}
8042 	}
8043 
8044 	range = calloc(1, sizeof(*range));
8045 	if (range == NULL) {
8046 		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
8047 		return;
8048 	}
8049 
8050 	range->length = ctx->range.length;
8051 	range->offset = ctx->range.offset;
8052 	range->locked_ctx = ctx->range.locked_ctx;
8053 	ctx->current_range = range;
8054 	if (ctx->range.owner_ch == ch) {
8055 		/* This is the range object for the channel that will hold
8056 		 * the lock.  Store it in the ctx object so that we can easily
8057 		 * set its owner_ch after the lock is finally acquired.
8058 		 */
8059 		ctx->owner_range = range;
8060 	}
8061 	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
8062 	bdev_lock_lba_range_check_io(i);
8063 }
8064 
8065 static void
8066 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
8067 {
8068 	assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel));
8069 
8070 	/* We will add a copy of this range to each channel now. */
8071 	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
8072 				   bdev_lock_lba_range_cb);
8073 }
8074 
8075 static bool
8076 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
8077 {
8078 	struct lba_range *r;
8079 
8080 	TAILQ_FOREACH(r, tailq, tailq) {
8081 		if (bdev_lba_range_overlapped(range, r)) {
8082 			return true;
8083 		}
8084 	}
8085 	return false;
8086 }
8087 
8088 static int
8089 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
8090 		    uint64_t offset, uint64_t length,
8091 		    lock_range_cb cb_fn, void *cb_arg)
8092 {
8093 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8094 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
8095 	struct locked_lba_range_ctx *ctx;
8096 
8097 	if (cb_arg == NULL) {
8098 		SPDK_ERRLOG("cb_arg must not be NULL\n");
8099 		return -EINVAL;
8100 	}
8101 
8102 	ctx = calloc(1, sizeof(*ctx));
8103 	if (ctx == NULL) {
8104 		return -ENOMEM;
8105 	}
8106 
8107 	ctx->range.offset = offset;
8108 	ctx->range.length = length;
8109 	ctx->range.owner_ch = ch;
8110 	ctx->range.locked_ctx = cb_arg;
8111 	ctx->bdev = bdev;
8112 	ctx->cb_fn = cb_fn;
8113 	ctx->cb_arg = cb_arg;
8114 
8115 	spdk_spin_lock(&bdev->internal.spinlock);
8116 	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
8117 		/* There is an active lock overlapping with this range.
8118 		 * Put it on the pending list until this range no
8119 		 * longer overlaps with another.
8120 		 */
8121 		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
8122 	} else {
8123 		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
8124 		bdev_lock_lba_range_ctx(bdev, ctx);
8125 	}
8126 	spdk_spin_unlock(&bdev->internal.spinlock);
8127 	return 0;
8128 }
8129 
8130 static void
8131 bdev_lock_lba_range_ctx_msg(void *_ctx)
8132 {
8133 	struct locked_lba_range_ctx *ctx = _ctx;
8134 
8135 	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
8136 }
8137 
8138 static void
8139 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
8140 {
8141 	struct locked_lba_range_ctx *ctx = _ctx;
8142 	struct locked_lba_range_ctx *pending_ctx;
8143 	struct lba_range *range, *tmp;
8144 
8145 	spdk_spin_lock(&bdev->internal.spinlock);
8146 	/* Check if there are any pending locked ranges that overlap with this range
8147 	 * that was just unlocked.  If there are, check that each such pending range does
8148 	 * not overlap with any other locked range before calling bdev_lock_lba_range_ctx,
8149 	 * which will start the lock process.
8150 	 */
8151 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
8152 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
8153 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
8154 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
8155 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
8156 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
8157 			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
8158 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
8159 		}
8160 	}
8161 	spdk_spin_unlock(&bdev->internal.spinlock);
8162 
8163 	ctx->cb_fn(ctx->cb_arg, status);
8164 	free(ctx);
8165 }
8166 
8167 static void
8168 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8169 				  struct spdk_io_channel *_ch, void *_ctx)
8170 {
8171 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
8172 	struct locked_lba_range_ctx *ctx = _ctx;
8173 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
8174 	struct spdk_bdev_io *bdev_io;
8175 	struct lba_range *range;
8176 
8177 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
8178 		if (ctx->range.offset == range->offset &&
8179 		    ctx->range.length == range->length &&
8180 		    ctx->range.locked_ctx == range->locked_ctx) {
8181 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
8182 			free(range);
8183 			break;
8184 		}
8185 	}
8186 
8187 	/* Note: we should almost always be able to assert that the range specified
8188 	 * was found.  But there are some very rare corner cases where a new channel
8189 	 * gets created simultaneously with a range unlock, where this function
8190 	 * would execute on that new channel and wouldn't have the range.
8191 	 * We also use this to clean up range allocations when a later allocation
8192 	 * fails in the locking path.
8193 	 * So we can't actually assert() here.
8194 	 */
8195 
8196 	/* Swap the locked IO into a temporary list, and then try to submit them again.
8197 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
8198 	 * with the range that was just unlocked, but this isn't a performance path so
8199 	 * we go for simplicity here.
8200 	 */
8201 	TAILQ_INIT(&io_locked);
8202 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
8203 	while (!TAILQ_EMPTY(&io_locked)) {
8204 		bdev_io = TAILQ_FIRST(&io_locked);
8205 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
8206 		bdev_io_submit(bdev_io);
8207 	}
8208 
8209 	spdk_bdev_for_each_channel_continue(i, 0);
8210 }
8211 
8212 static int
8213 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
8214 		      uint64_t offset, uint64_t length,
8215 		      lock_range_cb cb_fn, void *cb_arg)
8216 {
8217 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8218 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
8219 	struct locked_lba_range_ctx *ctx;
8220 	struct lba_range *range;
8221 	bool range_found = false;
8222 
8223 	/* Let's make sure the specified channel actually has a lock on
8224 	 * the specified range.  Note that the range must match exactly.
8225 	 */
8226 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
8227 		if (range->offset == offset && range->length == length &&
8228 		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
8229 			range_found = true;
8230 			break;
8231 		}
8232 	}
8233 
8234 	if (!range_found) {
8235 		return -EINVAL;
8236 	}
8237 
8238 	spdk_spin_lock(&bdev->internal.spinlock);
8239 	/* We confirmed that this channel has locked the specified range.  To
8240 	 * start the unlock process, we find the range in the bdev's locked_ranges
8241 	 * and remove it.  This ensures new channels don't inherit the locked range.
8242 	 * Then we will send a message to each channel (including the one specified
8243 	 * here) to remove the range from its per-channel list.
8244 	 */
8245 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
8246 		if (range->offset == offset && range->length == length &&
8247 		    range->locked_ctx == cb_arg) {
8248 			break;
8249 		}
8250 	}
8251 	if (range == NULL) {
8252 		assert(false);
8253 		spdk_spin_unlock(&bdev->internal.spinlock);
8254 		return -EINVAL;
8255 	}
8256 	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
8257 	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
8258 	spdk_spin_unlock(&bdev->internal.spinlock);
8259 
8260 	ctx->cb_fn = cb_fn;
8261 	ctx->cb_arg = cb_arg;
8262 
8263 	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
8264 				   bdev_unlock_lba_range_cb);
8265 	return 0;
8266 }
8267 
8268 int
8269 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
8270 			     int array_size)
8271 {
8272 	if (!bdev) {
8273 		return -EINVAL;
8274 	}
8275 
8276 	if (bdev->fn_table->get_memory_domains) {
8277 		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
8278 	}
8279 
8280 	return 0;
8281 }
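
/*
 * Editor's sketch (not part of the original source): the query-then-fill pattern for
 * spdk_bdev_get_memory_domains(), inferred from the NULL/0 probe used in
 * bdev_desc_alloc() above; example_query_domains is hypothetical.
 */
#if 0
static int
example_query_domains(struct spdk_bdev *bdev)
{
	struct spdk_memory_domain **domains;
	int num, rc;

	num = spdk_bdev_get_memory_domains(bdev, NULL, 0);
	if (num <= 0) {
		return num;	/* 0 means only plain host memory is accepted */
	}

	domains = calloc(num, sizeof(*domains));
	if (domains == NULL) {
		return -ENOMEM;
	}

	rc = spdk_bdev_get_memory_domains(bdev, domains, num);
	/* ... inspect domains[0 .. rc - 1] ... */
	free(domains);
	return rc;
}
#endif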
8282 
8283 struct spdk_bdev_for_each_io_ctx {
8284 	void *ctx;
8285 	spdk_bdev_io_fn fn;
8286 	spdk_bdev_for_each_io_cb cb;
8287 };
8288 
8289 static void
8290 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8291 			 struct spdk_io_channel *io_ch, void *_ctx)
8292 {
8293 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
8294 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
8295 	struct spdk_bdev_io *bdev_io;
8296 	int rc = 0;
8297 
8298 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
8299 		rc = ctx->fn(ctx->ctx, bdev_io);
8300 		if (rc != 0) {
8301 			break;
8302 		}
8303 	}
8304 
8305 	spdk_bdev_for_each_channel_continue(i, rc);
8306 }
8307 
8308 static void
8309 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
8310 {
8311 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
8312 
8313 	ctx->cb(ctx->ctx, status);
8314 
8315 	free(ctx);
8316 }
8317 
8318 void
8319 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
8320 			   spdk_bdev_for_each_io_cb cb)
8321 {
8322 	struct spdk_bdev_for_each_io_ctx *ctx;
8323 
8324 	assert(fn != NULL && cb != NULL);
8325 
8326 	ctx = calloc(1, sizeof(*ctx));
8327 	if (ctx == NULL) {
8328 		SPDK_ERRLOG("Failed to allocate context.\n");
8329 		cb(_ctx, -ENOMEM);
8330 		return;
8331 	}
8332 
8333 	ctx->ctx = _ctx;
8334 	ctx->fn = fn;
8335 	ctx->cb = cb;
8336 
8337 	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
8338 				   bdev_for_each_io_done);
8339 }
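
/*
 * Editor's sketch (not part of the original source): counting the I/O currently
 * submitted to a bdev across all of its channels.  example_* names are hypothetical;
 * the fn/cb signatures match spdk_bdev_io_fn and spdk_bdev_for_each_io_cb as used above.
 */
#if 0
static int
example_count_io(void *ctx, struct spdk_bdev_io *bdev_io)
{
	uint32_t *outstanding = ctx;

	(*outstanding)++;
	return 0;	/* a non-zero return stops the walk on that channel */
}

static void
example_count_done(void *ctx, int status)
{
	uint32_t *outstanding = ctx;

	SPDK_NOTICELOG("%u I/Os in flight (status %d)\n", *outstanding, status);
	free(outstanding);
}

static void
example_count_outstanding(struct spdk_bdev *bdev)
{
	uint32_t *outstanding = calloc(1, sizeof(*outstanding));

	if (outstanding != NULL) {
		spdk_bdev_for_each_bdev_io(bdev, outstanding, example_count_io,
					   example_count_done);
	}
}
#endif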
8340 
8341 void
8342 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
8343 {
8344 	spdk_for_each_channel_continue(iter->i, status);
8345 }
8346 
8347 static struct spdk_bdev *
8348 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
8349 {
8350 	void *io_device = spdk_io_channel_iter_get_io_device(i);
8351 
8352 	return __bdev_from_io_dev(io_device);
8353 }
8354 
8355 static void
8356 bdev_each_channel_msg(struct spdk_io_channel_iter *i)
8357 {
8358 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
8359 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
8360 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
8361 
8362 	iter->i = i;
8363 	iter->fn(iter, bdev, ch, iter->ctx);
8364 }
8365 
8366 static void
8367 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
8368 {
8369 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
8370 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
8371 
8372 	iter->i = i;
8373 	iter->cpl(bdev, iter->ctx, status);
8374 
8375 	free(iter);
8376 }
8377 
8378 void
8379 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
8380 			   void *ctx, spdk_bdev_for_each_channel_done cpl)
8381 {
8382 	struct spdk_bdev_channel_iter *iter;
8383 
8384 	assert(bdev != NULL && fn != NULL && ctx != NULL);
8385 
8386 	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
8387 	if (iter == NULL) {
8388 		SPDK_ERRLOG("Unable to allocate iterator\n");
8389 		assert(false);
8390 		return;
8391 	}
8392 
8393 	iter->fn = fn;
8394 	iter->cpl = cpl;
8395 	iter->ctx = ctx;
8396 
8397 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
8398 			      iter, bdev_each_channel_cpl);
8399 }
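
/*
 * Editor's sketch (not part of the original source): skeleton of a per-channel operation
 * built on spdk_bdev_for_each_channel().  The message callback runs once on each
 * channel's thread and must call spdk_bdev_for_each_channel_continue(); the completion
 * callback runs after the last channel.  example_* names are hypothetical.
 */
#if 0
static void
example_visit_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
		      struct spdk_io_channel *ch, void *ctx)
{
	/* ... inspect or update this channel's per-thread state ... */

	spdk_bdev_for_each_channel_continue(i, 0);
}

static void
example_visit_done(struct spdk_bdev *bdev, void *ctx, int status)
{
	SPDK_NOTICELOG("per-channel walk of %s finished: %d\n",
		       spdk_bdev_get_name(bdev), status);
}

static void
example_walk_channels(struct spdk_bdev *bdev)
{
	/* ctx must be non-NULL (see the assert above); the bdev itself is used here. */
	spdk_bdev_for_each_channel(bdev, example_visit_channel, bdev, example_visit_done);
}
#endif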
8400 
8401 int
8402 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
8403 		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
8404 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
8405 {
8406 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8407 	struct spdk_bdev_io *bdev_io;
8408 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
8409 
8410 	if (!desc->write) {
8411 		return -EBADF;
8412 	}
8413 
8414 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) {
8415 		SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n");
8416 		return -ENOTSUP;
8417 	}
8418 
8419 	if (num_blocks == 0) {
8420 		SPDK_ERRLOG("Can't copy 0 blocks\n");
8421 		return -EINVAL;
8422 	}
8423 
8424 	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
8425 	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
8426 		SPDK_DEBUGLOG(bdev,
8427 			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
8428 			      dst_offset_blocks, src_offset_blocks, num_blocks);
8429 		return -EINVAL;
8430 	}
8431 
8432 	bdev_io = bdev_channel_get_io(channel);
8433 	if (!bdev_io) {
8434 		return -ENOMEM;
8435 	}
8436 
8437 	bdev_io->internal.ch = channel;
8438 	bdev_io->internal.desc = desc;
8439 	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
8440 
8441 	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
8442 	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
8443 	bdev_io->u.bdev.num_blocks = num_blocks;
8444 	bdev_io->u.bdev.ext_opts = NULL;
8445 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
8446 
8447 	bdev_io_submit(bdev_io);
8448 	return 0;
8449 }
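
/*
 * Editor's sketch (not part of the original source): submitting a copy between two LBA
 * ranges of the same bdev.  The offsets and block count are arbitrary illustrations
 * (2048 blocks of 512 bytes = 1 MiB); example_* names are hypothetical and the
 * descriptor must have been opened for write, with a channel from this thread.
 */
#if 0
static void
example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
	spdk_bdev_free_io(bdev_io);
}

static int
example_copy(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	return spdk_bdev_copy_blocks(desc, ch, 4096 /* dst */, 0 /* src */,
				     2048 /* num_blocks */, example_copy_done, NULL);
}
#endif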
8450 
8451 SPDK_LOG_REGISTER_COMPONENT(bdev)
8452 
8453 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
8454 {
8455 	struct spdk_trace_tpoint_opts opts[] = {
8456 		{
8457 			"BDEV_IO_START", TRACE_BDEV_IO_START,
8458 			OWNER_BDEV, OBJECT_BDEV_IO, 1,
8459 			{
8460 				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
8461 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
8462 				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
8463 				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
8464 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
8465 			}
8466 		},
8467 		{
8468 			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
8469 			OWNER_BDEV, OBJECT_BDEV_IO, 0,
8470 			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
8471 		},
8472 		{
8473 			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
8474 			OWNER_BDEV, OBJECT_NONE, 1,
8475 			{
8476 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
8477 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
8478 			}
8479 		},
8480 		{
8481 			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
8482 			OWNER_BDEV, OBJECT_NONE, 0,
8483 			{
8484 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
8485 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
8486 			}
8487 		},
8488 	};
8489 
8490 
8491 	spdk_trace_register_owner(OWNER_BDEV, 'b');
8492 	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
8493 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
8494 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
8495 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
8496 }
8497