xref: /spdk/lib/bdev/bdev.c (revision b0556d4a090e8eba55bd798efdcfc944bf6c45be)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 
11 #include "spdk/accel.h"
12 #include "spdk/config.h"
13 #include "spdk/env.h"
14 #include "spdk/thread.h"
15 #include "spdk/likely.h"
16 #include "spdk/queue.h"
17 #include "spdk/nvme_spec.h"
18 #include "spdk/scsi_spec.h"
19 #include "spdk/notify.h"
20 #include "spdk/util.h"
21 #include "spdk/trace.h"
22 #include "spdk/dma.h"
23 
24 #include "spdk/bdev_module.h"
25 #include "spdk/log.h"
26 #include "spdk/string.h"
27 
28 #include "bdev_internal.h"
29 #include "spdk_internal/trace_defs.h"
30 #include "spdk_internal/assert.h"
31 
32 #ifdef SPDK_CONFIG_VTUNE
33 #include "ittnotify.h"
34 #include "ittnotify_types.h"
35 int __itt_init_ittlib(const char *, __itt_group_id);
36 #endif
37 
38 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
39 #define SPDK_BDEV_IO_CACHE_SIZE			256
40 #define SPDK_BDEV_AUTO_EXAMINE			true
41 #define BUF_SMALL_POOL_SIZE			8191
42 #define BUF_LARGE_POOL_SIZE			1023
43 #define BUF_SMALL_CACHE_SIZE			128
44 #define BUF_LARGE_CACHE_SIZE			16
45 #define NOMEM_THRESHOLD_COUNT			8
46 
47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
54 
55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command
56  * when splitting into children requests at a time.
57  */
58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
60 
61 /* The maximum number of children requests for a COPY command
62  * when splitting into children requests at a time.
63  */
64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
65 
66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
67 	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
68 #ifdef DEBUG
69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
70 	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
71 #else
72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
73 #endif
74 
75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
76 				const char *detail, struct spdk_bdev *bdev);
77 
78 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
79 				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
80 				    };
81 
82 TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
83 
84 RB_HEAD(bdev_name_tree, spdk_bdev_name);
85 
86 static int
87 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
88 {
89 	return strcmp(name1->name, name2->name);
90 }
91 
92 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
93 
94 struct spdk_bdev_mgr {
95 	struct spdk_mempool *bdev_io_pool;
96 
97 	void *zero_buffer;
98 
99 	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
100 
101 	struct spdk_bdev_list bdevs;
102 	struct bdev_name_tree bdev_names;
103 
104 	bool init_complete;
105 	bool module_init_complete;
106 
107 	struct spdk_spinlock spinlock;
108 
109 	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;
110 
111 #ifdef SPDK_CONFIG_VTUNE
112 	__itt_domain	*domain;
113 #endif
114 };
115 
116 static struct spdk_bdev_mgr g_bdev_mgr = {
117 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
118 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
119 	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
120 	.init_complete = false,
121 	.module_init_complete = false,
122 	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
123 };
124 
125 static void
126 __attribute__((constructor))
127 _bdev_init(void)
128 {
129 	spdk_spin_init(&g_bdev_mgr.spinlock);
130 }
131 
132 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);
133 
134 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
135 
136 struct lba_range {
137 	struct spdk_bdev		*bdev;
138 	uint64_t			offset;
139 	uint64_t			length;
140 	bool				quiesce;
141 	void				*locked_ctx;
142 	struct spdk_thread		*owner_thread;
143 	struct spdk_bdev_channel	*owner_ch;
144 	TAILQ_ENTRY(lba_range)		tailq;
145 	TAILQ_ENTRY(lba_range)		tailq_module;
146 };
147 
148 static struct spdk_bdev_opts	g_bdev_opts = {
149 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
150 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
151 	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
152 	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
153 	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
154 };
155 
156 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
157 static void			*g_init_cb_arg = NULL;
158 
159 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
160 static void			*g_fini_cb_arg = NULL;
161 static struct spdk_thread	*g_fini_thread = NULL;
162 
163 struct spdk_bdev_qos_limit {
164 	/** IOs or bytes allowed per second (i.e., 1s). */
165 	uint64_t limit;
166 
167 	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
168 	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
169 	 *  some bytes are remaining, but the I/O is bigger than that amount. The
170 	 *  excess will be deducted from the next timeslice.
171 	 */
172 	int64_t remaining_this_timeslice;
173 
174 	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
175 	uint32_t min_per_timeslice;
176 
177 	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
178 	uint32_t max_per_timeslice;
179 
180 	/** Function to check whether to queue the IO.
181 	 * If The IO is allowed to pass, the quota will be reduced correspondingly.
182 	 */
183 	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
184 
185 	/** Function to rewind the quota once the IO was allowed to be sent by this
186 	 * limit but queued due to one of the further limits.
187 	 */
188 	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
189 };
190 
191 struct spdk_bdev_qos {
192 	/** Types of structure of rate limits. */
193 	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
194 
195 	/** The channel that all I/O are funneled through. */
196 	struct spdk_bdev_channel *ch;
197 
198 	/** The thread on which the poller is running. */
199 	struct spdk_thread *thread;
200 
201 	/** Size of a timeslice in tsc ticks. */
202 	uint64_t timeslice_size;
203 
204 	/** Timestamp of start of last timeslice. */
205 	uint64_t last_timeslice;
206 
207 	/** Poller that processes queued I/O commands each time slice. */
208 	struct spdk_poller *poller;
209 };
210 
211 struct spdk_bdev_mgmt_channel {
212 	/*
213 	 * Each thread keeps a cache of bdev_io - this allows
214 	 *  bdev threads which are *not* DPDK threads to still
215 	 *  benefit from a per-thread bdev_io cache.  Without
216 	 *  this, non-DPDK threads fetching from the mempool
217 	 *  incur a cmpxchg on get and put.
218 	 */
219 	bdev_io_stailq_t per_thread_cache;
220 	uint32_t	per_thread_cache_count;
221 	uint32_t	bdev_io_cache_size;
222 
223 	struct spdk_iobuf_channel iobuf;
224 
225 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
226 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
227 };
228 
229 /*
230  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
231  * will queue here their IO that awaits retry. It makes it possible to retry sending
232  * IO to one bdev after IO from other bdev completes.
233  */
234 struct spdk_bdev_shared_resource {
235 	/* The bdev management channel */
236 	struct spdk_bdev_mgmt_channel *mgmt_ch;
237 
238 	/*
239 	 * Count of I/O submitted to bdev module and waiting for completion.
240 	 * Incremented before submit_request() is called on an spdk_bdev_io.
241 	 */
242 	uint64_t		io_outstanding;
243 
244 	/*
245 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
246 	 *  on this channel.
247 	 */
248 	bdev_io_tailq_t		nomem_io;
249 
250 	/*
251 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
252 	 */
253 	uint64_t		nomem_threshold;
254 
255 	/* I/O channel allocated by a bdev module */
256 	struct spdk_io_channel	*shared_ch;
257 
258 	struct spdk_poller	*nomem_poller;
259 
260 	/* Refcount of bdev channels using this resource */
261 	uint32_t		ref;
262 
263 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
264 };
265 
266 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
267 #define BDEV_CH_QOS_ENABLED		(1 << 1)
268 
269 struct spdk_bdev_channel {
270 	struct spdk_bdev	*bdev;
271 
272 	/* The channel for the underlying device */
273 	struct spdk_io_channel	*channel;
274 
275 	/* Accel channel */
276 	struct spdk_io_channel	*accel_channel;
277 
278 	/* Per io_device per thread data */
279 	struct spdk_bdev_shared_resource *shared_resource;
280 
281 	struct spdk_bdev_io_stat *stat;
282 
283 	/*
284 	 * Count of I/O submitted to the underlying dev module through this channel
285 	 * and waiting for completion.
286 	 */
287 	uint64_t		io_outstanding;
288 
289 	/*
290 	 * List of all submitted I/Os including I/O that are generated via splitting.
291 	 */
292 	bdev_io_tailq_t		io_submitted;
293 
294 	/*
295 	 * List of spdk_bdev_io that are currently queued because they write to a locked
296 	 * LBA range.
297 	 */
298 	bdev_io_tailq_t		io_locked;
299 
300 	/* List of I/Os with accel sequence being currently executed */
301 	bdev_io_tailq_t		io_accel_exec;
302 
303 	/* List of I/Os doing memory domain pull/push */
304 	bdev_io_tailq_t		io_memory_domain;
305 
306 	uint32_t		flags;
307 
308 	struct spdk_histogram_data *histogram;
309 
310 #ifdef SPDK_CONFIG_VTUNE
311 	uint64_t		start_tsc;
312 	uint64_t		interval_tsc;
313 	__itt_string_handle	*handle;
314 	struct spdk_bdev_io_stat *prev_stat;
315 #endif
316 
317 	bdev_io_tailq_t		queued_resets;
318 
319 	lba_range_tailq_t	locked_ranges;
320 
321 	/** List of I/Os queued by QoS. */
322 	bdev_io_tailq_t		qos_queued_io;
323 };
324 
325 struct media_event_entry {
326 	struct spdk_bdev_media_event	event;
327 	TAILQ_ENTRY(media_event_entry)	tailq;
328 };
329 
330 #define MEDIA_EVENT_POOL_SIZE 64
331 
332 struct spdk_bdev_desc {
333 	struct spdk_bdev		*bdev;
334 	struct spdk_thread		*thread;
335 	struct {
336 		spdk_bdev_event_cb_t event_fn;
337 		void *ctx;
338 	}				callback;
339 	bool				closed;
340 	bool				write;
341 	bool				memory_domains_supported;
342 	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
343 	struct spdk_spinlock		spinlock;
344 	uint32_t			refs;
345 	TAILQ_HEAD(, media_event_entry)	pending_media_events;
346 	TAILQ_HEAD(, media_event_entry)	free_media_events;
347 	struct media_event_entry	*media_events_buffer;
348 	TAILQ_ENTRY(spdk_bdev_desc)	link;
349 
350 	uint64_t		timeout_in_sec;
351 	spdk_bdev_io_timeout_cb	cb_fn;
352 	void			*cb_arg;
353 	struct spdk_poller	*io_timeout_poller;
354 	struct spdk_bdev_module_claim	*claim;
355 };
356 
357 struct spdk_bdev_iostat_ctx {
358 	struct spdk_bdev_io_stat *stat;
359 	spdk_bdev_get_device_stat_cb cb;
360 	void *cb_arg;
361 };
362 
363 struct set_qos_limit_ctx {
364 	void (*cb_fn)(void *cb_arg, int status);
365 	void *cb_arg;
366 	struct spdk_bdev *bdev;
367 };
368 
369 struct spdk_bdev_channel_iter {
370 	spdk_bdev_for_each_channel_msg fn;
371 	spdk_bdev_for_each_channel_done cpl;
372 	struct spdk_io_channel_iter *i;
373 	void *ctx;
374 };
375 
376 struct spdk_bdev_io_error_stat {
377 	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
378 };
379 
380 enum bdev_io_retry_state {
381 	BDEV_IO_RETRY_STATE_INVALID,
382 	BDEV_IO_RETRY_STATE_PULL,
383 	BDEV_IO_RETRY_STATE_PULL_MD,
384 	BDEV_IO_RETRY_STATE_SUBMIT,
385 	BDEV_IO_RETRY_STATE_PUSH,
386 	BDEV_IO_RETRY_STATE_PUSH_MD,
387 };
388 
389 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
390 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
391 #define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
392 #define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
393 
394 static inline void bdev_io_complete(void *ctx);
395 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
396 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
397 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
398 
399 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
400 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
401 
402 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
403 				struct spdk_io_channel *ch, void *_ctx);
404 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
405 
406 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
407 				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
408 				     uint64_t num_blocks,
409 				     struct spdk_memory_domain *domain, void *domain_ctx,
410 				     struct spdk_accel_sequence *seq,
411 				     spdk_bdev_io_completion_cb cb, void *cb_arg);
412 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
413 				      struct iovec *iov, int iovcnt, void *md_buf,
414 				      uint64_t offset_blocks, uint64_t num_blocks,
415 				      struct spdk_memory_domain *domain, void *domain_ctx,
416 				      struct spdk_accel_sequence *seq,
417 				      spdk_bdev_io_completion_cb cb, void *cb_arg);
418 
419 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
420 			       uint64_t offset, uint64_t length,
421 			       lock_range_cb cb_fn, void *cb_arg);
422 
423 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
424 				 uint64_t offset, uint64_t length,
425 				 lock_range_cb cb_fn, void *cb_arg);
426 
427 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
428 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
429 
430 static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
431 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
432 static void claim_reset(struct spdk_bdev *bdev);
433 
434 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);
435 
436 #define bdev_get_ext_io_opt(opts, field, defval) \
437 	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
438 	 sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))
439 
440 void
441 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
442 {
443 	if (!opts) {
444 		SPDK_ERRLOG("opts should not be NULL\n");
445 		return;
446 	}
447 
448 	if (!opts_size) {
449 		SPDK_ERRLOG("opts_size should not be zero value\n");
450 		return;
451 	}
452 
453 	opts->opts_size = opts_size;
454 
455 #define SET_FIELD(field) \
456 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
457 		opts->field = g_bdev_opts.field; \
458 	} \
459 
460 	SET_FIELD(bdev_io_pool_size);
461 	SET_FIELD(bdev_io_cache_size);
462 	SET_FIELD(bdev_auto_examine);
463 	SET_FIELD(iobuf_small_cache_size);
464 	SET_FIELD(iobuf_large_cache_size);
465 
466 	/* Do not remove this statement, you should always update this statement when you adding a new field,
467 	 * and do not forget to add the SET_FIELD statement for your added field. */
468 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
469 
470 #undef SET_FIELD
471 }
472 
473 int
474 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
475 {
476 	uint32_t min_pool_size;
477 
478 	if (!opts) {
479 		SPDK_ERRLOG("opts cannot be NULL\n");
480 		return -1;
481 	}
482 
483 	if (!opts->opts_size) {
484 		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
485 		return -1;
486 	}
487 
488 	/*
489 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
490 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
491 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
492 	 */
493 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
494 	if (opts->bdev_io_pool_size < min_pool_size) {
495 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
496 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
497 			    spdk_thread_get_count());
498 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
499 		return -1;
500 	}
501 
502 #define SET_FIELD(field) \
503         if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
504                 g_bdev_opts.field = opts->field; \
505         } \
506 
507 	SET_FIELD(bdev_io_pool_size);
508 	SET_FIELD(bdev_io_cache_size);
509 	SET_FIELD(bdev_auto_examine);
510 	SET_FIELD(iobuf_small_cache_size);
511 	SET_FIELD(iobuf_large_cache_size);
512 
513 	g_bdev_opts.opts_size = opts->opts_size;
514 
515 #undef SET_FIELD
516 
517 	return 0;
518 }
519 
520 static struct spdk_bdev *
521 bdev_get_by_name(const char *bdev_name)
522 {
523 	struct spdk_bdev_name find;
524 	struct spdk_bdev_name *res;
525 
526 	find.name = (char *)bdev_name;
527 	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
528 	if (res != NULL) {
529 		return res->bdev;
530 	}
531 
532 	return NULL;
533 }
534 
535 struct spdk_bdev *
536 spdk_bdev_get_by_name(const char *bdev_name)
537 {
538 	struct spdk_bdev *bdev;
539 
540 	spdk_spin_lock(&g_bdev_mgr.spinlock);
541 	bdev = bdev_get_by_name(bdev_name);
542 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
543 
544 	return bdev;
545 }
546 
547 struct bdev_io_status_string {
548 	enum spdk_bdev_io_status status;
549 	const char *str;
550 };
551 
552 static const struct bdev_io_status_string bdev_io_status_strings[] = {
553 	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
554 	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
555 	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
556 	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
557 	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
558 	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
559 	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
560 	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
561 	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
562 	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
563 };
564 
565 static const char *
566 bdev_io_status_get_string(enum spdk_bdev_io_status status)
567 {
568 	uint32_t i;
569 
570 	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
571 		if (bdev_io_status_strings[i].status == status) {
572 			return bdev_io_status_strings[i].str;
573 		}
574 	}
575 
576 	return "reserved";
577 }
578 
579 struct spdk_bdev_wait_for_examine_ctx {
580 	struct spdk_poller              *poller;
581 	spdk_bdev_wait_for_examine_cb	cb_fn;
582 	void				*cb_arg;
583 };
584 
585 static bool bdev_module_all_actions_completed(void);
586 
587 static int
588 bdev_wait_for_examine_cb(void *arg)
589 {
590 	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
591 
592 	if (!bdev_module_all_actions_completed()) {
593 		return SPDK_POLLER_IDLE;
594 	}
595 
596 	spdk_poller_unregister(&ctx->poller);
597 	ctx->cb_fn(ctx->cb_arg);
598 	free(ctx);
599 
600 	return SPDK_POLLER_BUSY;
601 }
602 
603 int
604 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
605 {
606 	struct spdk_bdev_wait_for_examine_ctx *ctx;
607 
608 	ctx = calloc(1, sizeof(*ctx));
609 	if (ctx == NULL) {
610 		return -ENOMEM;
611 	}
612 	ctx->cb_fn = cb_fn;
613 	ctx->cb_arg = cb_arg;
614 	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
615 
616 	return 0;
617 }
618 
619 struct spdk_bdev_examine_item {
620 	char *name;
621 	TAILQ_ENTRY(spdk_bdev_examine_item) link;
622 };
623 
624 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
625 
626 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
627 			g_bdev_examine_allowlist);
628 
629 static inline bool
630 bdev_examine_allowlist_check(const char *name)
631 {
632 	struct spdk_bdev_examine_item *item;
633 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
634 		if (strcmp(name, item->name) == 0) {
635 			return true;
636 		}
637 	}
638 	return false;
639 }
640 
641 static inline void
642 bdev_examine_allowlist_free(void)
643 {
644 	struct spdk_bdev_examine_item *item;
645 	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
646 		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
647 		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
648 		free(item->name);
649 		free(item);
650 	}
651 }
652 
653 static inline bool
654 bdev_in_examine_allowlist(struct spdk_bdev *bdev)
655 {
656 	struct spdk_bdev_alias *tmp;
657 	if (bdev_examine_allowlist_check(bdev->name)) {
658 		return true;
659 	}
660 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
661 		if (bdev_examine_allowlist_check(tmp->alias.name)) {
662 			return true;
663 		}
664 	}
665 	return false;
666 }
667 
668 static inline bool
669 bdev_ok_to_examine(struct spdk_bdev *bdev)
670 {
671 	if (g_bdev_opts.bdev_auto_examine) {
672 		return true;
673 	} else {
674 		return bdev_in_examine_allowlist(bdev);
675 	}
676 }
677 
678 static void
679 bdev_examine(struct spdk_bdev *bdev)
680 {
681 	struct spdk_bdev_module *module;
682 	struct spdk_bdev_module_claim *claim, *tmpclaim;
683 	uint32_t action;
684 
685 	if (!bdev_ok_to_examine(bdev)) {
686 		return;
687 	}
688 
689 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
690 		if (module->examine_config) {
691 			spdk_spin_lock(&module->internal.spinlock);
692 			action = module->internal.action_in_progress;
693 			module->internal.action_in_progress++;
694 			spdk_spin_unlock(&module->internal.spinlock);
695 			module->examine_config(bdev);
696 			if (action != module->internal.action_in_progress) {
697 				SPDK_ERRLOG("examine_config for module %s did not call "
698 					    "spdk_bdev_module_examine_done()\n", module->name);
699 			}
700 		}
701 	}
702 
703 	spdk_spin_lock(&bdev->internal.spinlock);
704 
705 	switch (bdev->internal.claim_type) {
706 	case SPDK_BDEV_CLAIM_NONE:
707 		/* Examine by all bdev modules */
708 		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
709 			if (module->examine_disk) {
710 				spdk_spin_lock(&module->internal.spinlock);
711 				module->internal.action_in_progress++;
712 				spdk_spin_unlock(&module->internal.spinlock);
713 				spdk_spin_unlock(&bdev->internal.spinlock);
714 				module->examine_disk(bdev);
715 				spdk_spin_lock(&bdev->internal.spinlock);
716 			}
717 		}
718 		break;
719 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
720 		/* Examine by the one bdev module with a v1 claim */
721 		module = bdev->internal.claim.v1.module;
722 		if (module->examine_disk) {
723 			spdk_spin_lock(&module->internal.spinlock);
724 			module->internal.action_in_progress++;
725 			spdk_spin_unlock(&module->internal.spinlock);
726 			spdk_spin_unlock(&bdev->internal.spinlock);
727 			module->examine_disk(bdev);
728 			return;
729 		}
730 		break;
731 	default:
732 		/* Examine by all bdev modules with a v2 claim */
733 		assert(claim_type_is_v2(bdev->internal.claim_type));
734 		/*
735 		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
736 		 * list, perhaps accessing freed memory. Without protection, this could happen
737 		 * while the lock is dropped during the examine callback.
738 		 */
739 		bdev->internal.examine_in_progress++;
740 
741 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
742 			module = claim->module;
743 
744 			if (module == NULL) {
745 				/* This is a vestigial claim, held by examine_count */
746 				continue;
747 			}
748 
749 			if (module->examine_disk == NULL) {
750 				continue;
751 			}
752 
753 			spdk_spin_lock(&module->internal.spinlock);
754 			module->internal.action_in_progress++;
755 			spdk_spin_unlock(&module->internal.spinlock);
756 
757 			/* Call examine_disk without holding internal.spinlock. */
758 			spdk_spin_unlock(&bdev->internal.spinlock);
759 			module->examine_disk(bdev);
760 			spdk_spin_lock(&bdev->internal.spinlock);
761 		}
762 
763 		assert(bdev->internal.examine_in_progress > 0);
764 		bdev->internal.examine_in_progress--;
765 		if (bdev->internal.examine_in_progress == 0) {
766 			/* Remove any claims that were released during examine_disk */
767 			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
768 				if (claim->desc != NULL) {
769 					continue;
770 				}
771 
772 				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
773 				free(claim);
774 			}
775 			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
776 				claim_reset(bdev);
777 			}
778 		}
779 	}
780 
781 	spdk_spin_unlock(&bdev->internal.spinlock);
782 }
783 
784 int
785 spdk_bdev_examine(const char *name)
786 {
787 	struct spdk_bdev *bdev;
788 	struct spdk_bdev_examine_item *item;
789 	struct spdk_thread *thread = spdk_get_thread();
790 
791 	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
792 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
793 			    thread ? spdk_thread_get_name(thread) : "null");
794 		return -EINVAL;
795 	}
796 
797 	if (g_bdev_opts.bdev_auto_examine) {
798 		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled");
799 		return -EINVAL;
800 	}
801 
802 	if (bdev_examine_allowlist_check(name)) {
803 		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
804 		return -EEXIST;
805 	}
806 
807 	item = calloc(1, sizeof(*item));
808 	if (!item) {
809 		return -ENOMEM;
810 	}
811 	item->name = strdup(name);
812 	if (!item->name) {
813 		free(item);
814 		return -ENOMEM;
815 	}
816 	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
817 
818 	bdev = spdk_bdev_get_by_name(name);
819 	if (bdev) {
820 		bdev_examine(bdev);
821 	}
822 	return 0;
823 }
824 
825 static inline void
826 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
827 {
828 	struct spdk_bdev_examine_item *item;
829 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
830 		spdk_json_write_object_begin(w);
831 		spdk_json_write_named_string(w, "method", "bdev_examine");
832 		spdk_json_write_named_object_begin(w, "params");
833 		spdk_json_write_named_string(w, "name", item->name);
834 		spdk_json_write_object_end(w);
835 		spdk_json_write_object_end(w);
836 	}
837 }
838 
839 struct spdk_bdev *
840 spdk_bdev_first(void)
841 {
842 	struct spdk_bdev *bdev;
843 
844 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
845 	if (bdev) {
846 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
847 	}
848 
849 	return bdev;
850 }
851 
852 struct spdk_bdev *
853 spdk_bdev_next(struct spdk_bdev *prev)
854 {
855 	struct spdk_bdev *bdev;
856 
857 	bdev = TAILQ_NEXT(prev, internal.link);
858 	if (bdev) {
859 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
860 	}
861 
862 	return bdev;
863 }
864 
865 static struct spdk_bdev *
866 _bdev_next_leaf(struct spdk_bdev *bdev)
867 {
868 	while (bdev != NULL) {
869 		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
870 			return bdev;
871 		} else {
872 			bdev = TAILQ_NEXT(bdev, internal.link);
873 		}
874 	}
875 
876 	return bdev;
877 }
878 
879 struct spdk_bdev *
880 spdk_bdev_first_leaf(void)
881 {
882 	struct spdk_bdev *bdev;
883 
884 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
885 
886 	if (bdev) {
887 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
888 	}
889 
890 	return bdev;
891 }
892 
893 struct spdk_bdev *
894 spdk_bdev_next_leaf(struct spdk_bdev *prev)
895 {
896 	struct spdk_bdev *bdev;
897 
898 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
899 
900 	if (bdev) {
901 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
902 	}
903 
904 	return bdev;
905 }
906 
907 static inline bool
908 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
909 {
910 	return bdev_io->internal.memory_domain;
911 }
912 
913 static inline bool
914 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
915 {
916 	return bdev_io->internal.has_accel_sequence;
917 }
918 
919 static inline void
920 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
921 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
922 {
923 	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
924 	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
925 	 * channels we will instead wait for half to complete.
926 	 */
927 	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
928 					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
929 
930 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
931 	bdev_io->internal.retry_state = state;
932 	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
933 }
934 
935 static inline void
936 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
937 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
938 {
939 	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
940 	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
941 	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));
942 
943 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
944 	bdev_io->internal.retry_state = state;
945 	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
946 }
947 
948 void
949 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
950 {
951 	struct iovec *iovs;
952 
953 	if (bdev_io->u.bdev.iovs == NULL) {
954 		bdev_io->u.bdev.iovs = &bdev_io->iov;
955 		bdev_io->u.bdev.iovcnt = 1;
956 	}
957 
958 	iovs = bdev_io->u.bdev.iovs;
959 
960 	assert(iovs != NULL);
961 	assert(bdev_io->u.bdev.iovcnt >= 1);
962 
963 	iovs[0].iov_base = buf;
964 	iovs[0].iov_len = len;
965 }
966 
967 void
968 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
969 {
970 	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
971 	bdev_io->u.bdev.md_buf = md_buf;
972 }
973 
974 static bool
975 _is_buf_allocated(const struct iovec *iovs)
976 {
977 	if (iovs == NULL) {
978 		return false;
979 	}
980 
981 	return iovs[0].iov_base != NULL;
982 }
983 
984 static bool
985 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
986 {
987 	int i;
988 	uintptr_t iov_base;
989 
990 	if (spdk_likely(alignment == 1)) {
991 		return true;
992 	}
993 
994 	for (i = 0; i < iovcnt; i++) {
995 		iov_base = (uintptr_t)iovs[i].iov_base;
996 		if ((iov_base & (alignment - 1)) != 0) {
997 			return false;
998 		}
999 	}
1000 
1001 	return true;
1002 }
1003 
1004 static inline bool
1005 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1006 {
1007 	if (!bdev_io->internal.accel_sequence) {
1008 		return false;
1009 	}
1010 
1011 	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
1012 	 * bdev module didn't support accel sequences */
1013 	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
1014 }
1015 
1016 static inline void
1017 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
1018 			      struct spdk_bdev_shared_resource *shared_resource)
1019 {
1020 	bdev_ch->io_outstanding++;
1021 	shared_resource->io_outstanding++;
1022 }
1023 
1024 static inline void
1025 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1026 			      struct spdk_bdev_shared_resource *shared_resource)
1027 {
1028 	assert(bdev_ch->io_outstanding > 0);
1029 	assert(shared_resource->io_outstanding > 0);
1030 	bdev_ch->io_outstanding--;
1031 	shared_resource->io_outstanding--;
1032 }
1033 
1034 static void
1035 bdev_io_submit_sequence_cb(void *ctx, int status)
1036 {
1037 	struct spdk_bdev_io *bdev_io = ctx;
1038 
1039 	bdev_io->u.bdev.accel_sequence = NULL;
1040 	bdev_io->internal.accel_sequence = NULL;
1041 
1042 	if (spdk_unlikely(status != 0)) {
1043 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
1044 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1045 		bdev_io_complete_unsubmitted(bdev_io);
1046 		return;
1047 	}
1048 
1049 	bdev_io_submit(bdev_io);
1050 }
1051 
1052 static void
1053 bdev_io_exec_sequence_cb(void *ctx, int status)
1054 {
1055 	struct spdk_bdev_io *bdev_io = ctx;
1056 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1057 
1058 	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1059 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1060 
1061 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1062 		bdev_ch_retry_io(ch);
1063 	}
1064 
1065 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1066 }
1067 
1068 static void
1069 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
1070 {
1071 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1072 
1073 	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1074 	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1075 
1076 	/* Since the operations are appended during submission, they're in the opposite order than
1077 	 * how we want to execute them for reads (i.e. we need to execute the most recently added
1078 	 * operation first), so reverse the sequence before executing it.
1079 	 */
1080 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1081 		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
1082 	}
1083 
1084 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1085 	bdev_io_increment_outstanding(ch, ch->shared_resource);
1086 	bdev_io->internal.data_transfer_cpl = cb_fn;
1087 
1088 	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
1089 				   bdev_io_exec_sequence_cb, bdev_io);
1090 }
1091 
1092 static void
1093 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
1094 {
1095 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
1096 	void *buf;
1097 
1098 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1099 		buf = bdev_io->internal.buf;
1100 		bdev_io->internal.buf = NULL;
1101 		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
1102 		bdev_io->internal.get_aux_buf_cb = NULL;
1103 	} else {
1104 		assert(bdev_io->internal.get_buf_cb != NULL);
1105 		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
1106 		bdev_io->internal.get_buf_cb = NULL;
1107 	}
1108 }
1109 
1110 static void
1111 _bdev_io_pull_buffer_cpl(void *ctx, int rc)
1112 {
1113 	struct spdk_bdev_io *bdev_io = ctx;
1114 
1115 	if (rc) {
1116 		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
1117 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1118 	}
1119 	bdev_io_get_buf_complete(bdev_io, !rc);
1120 }
1121 
1122 static void
1123 bdev_io_pull_md_buf_done(void *ctx, int status)
1124 {
1125 	struct spdk_bdev_io *bdev_io = ctx;
1126 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1127 
1128 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1129 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1130 
1131 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1132 		bdev_ch_retry_io(ch);
1133 	}
1134 
1135 	assert(bdev_io->internal.data_transfer_cpl);
1136 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1137 }
1138 
1139 static void
1140 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
1141 {
1142 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1143 	int rc = 0;
1144 
1145 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1146 		if (bdev_io_use_memory_domain(bdev_io)) {
1147 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1148 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1149 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1150 							  bdev_io->internal.memory_domain_ctx,
1151 							  &bdev_io->internal.orig_md_iov, 1,
1152 							  &bdev_io->internal.bounce_md_iov, 1,
1153 							  bdev_io_pull_md_buf_done, bdev_io);
1154 			if (rc == 0) {
1155 				/* Continue to submit IO in completion callback */
1156 				return;
1157 			}
1158 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1159 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1160 			if (rc != -ENOMEM) {
1161 				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
1162 					    spdk_memory_domain_get_dma_device_id(
1163 						    bdev_io->internal.memory_domain), rc);
1164 			}
1165 		} else {
1166 			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
1167 			       bdev_io->internal.orig_md_iov.iov_base,
1168 			       bdev_io->internal.orig_md_iov.iov_len);
1169 		}
1170 	}
1171 
1172 	if (spdk_unlikely(rc == -ENOMEM)) {
1173 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
1174 	} else {
1175 		assert(bdev_io->internal.data_transfer_cpl);
1176 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1177 	}
1178 }
1179 
1180 static void
1181 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1182 {
1183 	/* save original md_buf */
1184 	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
1185 	bdev_io->internal.orig_md_iov.iov_len = len;
1186 	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
1187 	bdev_io->internal.bounce_md_iov.iov_len = len;
1188 	/* set bounce md_buf */
1189 	bdev_io->u.bdev.md_buf = md_buf;
1190 
1191 	bdev_io_pull_md_buf(bdev_io);
1192 }
1193 
1194 static void
1195 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
1196 {
1197 	struct spdk_bdev *bdev = bdev_io->bdev;
1198 	uint64_t md_len;
1199 	void *buf;
1200 
1201 	if (spdk_bdev_is_md_separate(bdev)) {
1202 		assert(!bdev_io_use_accel_sequence(bdev_io));
1203 
1204 		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
1205 		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
1206 
1207 		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
1208 
1209 		if (bdev_io->u.bdev.md_buf != NULL) {
1210 			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
1211 			return;
1212 		} else {
1213 			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
1214 		}
1215 	}
1216 
1217 	bdev_io_get_buf_complete(bdev_io, true);
1218 }
1219 
1220 static inline void
1221 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
1222 {
1223 	if (rc) {
1224 		SPDK_ERRLOG("Failed to get data buffer\n");
1225 		assert(bdev_io->internal.data_transfer_cpl);
1226 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1227 		return;
1228 	}
1229 
1230 	_bdev_io_set_md_buf(bdev_io);
1231 }
1232 
1233 static void
1234 bdev_io_pull_data_done_and_track(void *ctx, int status)
1235 {
1236 	struct spdk_bdev_io *bdev_io = ctx;
1237 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1238 
1239 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1240 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1241 
1242 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1243 		bdev_ch_retry_io(ch);
1244 	}
1245 
1246 	bdev_io_pull_data_done(bdev_io, status);
1247 }
1248 
1249 static void
1250 bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
1251 {
1252 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1253 	int rc = 0;
1254 
1255 	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
1256 	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
1257 	 * operation */
1258 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
1259 	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
1260 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1261 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1262 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1263 						    NULL, NULL,
1264 						    bdev_io->internal.orig_iovs,
1265 						    bdev_io->internal.orig_iovcnt,
1266 						    bdev_io->internal.memory_domain,
1267 						    bdev_io->internal.memory_domain_ctx,
1268 						    0, NULL, NULL);
1269 		} else {
1270 			/* We need to reverse the src/dst for reads */
1271 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1272 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1273 						    bdev_io->internal.orig_iovs,
1274 						    bdev_io->internal.orig_iovcnt,
1275 						    bdev_io->internal.memory_domain,
1276 						    bdev_io->internal.memory_domain_ctx,
1277 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1278 						    NULL, NULL, 0, NULL, NULL);
1279 		}
1280 
1281 		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
1282 			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
1283 				    bdev_io->internal.accel_sequence);
1284 		}
1285 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1286 		/* if this is write path, copy data from original buffer to bounce buffer */
1287 		if (bdev_io_use_memory_domain(bdev_io)) {
1288 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1289 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1290 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1291 							  bdev_io->internal.memory_domain_ctx,
1292 							  bdev_io->internal.orig_iovs,
1293 							  (uint32_t) bdev_io->internal.orig_iovcnt,
1294 							  bdev_io->u.bdev.iovs, 1,
1295 							  bdev_io_pull_data_done_and_track,
1296 							  bdev_io);
1297 			if (rc == 0) {
1298 				/* Continue to submit IO in completion callback */
1299 				return;
1300 			}
1301 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1302 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1303 			if (rc != -ENOMEM) {
1304 				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
1305 					    spdk_memory_domain_get_dma_device_id(
1306 						    bdev_io->internal.memory_domain));
1307 			}
1308 		} else {
1309 			assert(bdev_io->u.bdev.iovcnt == 1);
1310 			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
1311 					      bdev_io->u.bdev.iovs[0].iov_len,
1312 					      bdev_io->internal.orig_iovs,
1313 					      bdev_io->internal.orig_iovcnt);
1314 		}
1315 	}
1316 
1317 	if (spdk_unlikely(rc == -ENOMEM)) {
1318 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1319 	} else {
1320 		bdev_io_pull_data_done(bdev_io, rc);
1321 	}
1322 }
1323 
1324 static void
1325 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
1326 			      bdev_copy_bounce_buffer_cpl cpl_cb)
1327 {
1328 	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;
1329 
1330 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1331 	/* save original iovec */
1332 	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
1333 	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
1334 	/* set bounce iov */
1335 	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
1336 	bdev_io->u.bdev.iovcnt = 1;
1337 	/* set bounce buffer for this operation */
1338 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1339 	bdev_io->u.bdev.iovs[0].iov_len = len;
1340 
1341 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1342 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1343 	} else {
1344 		bdev_io_pull_data(bdev_io);
1345 	}
1346 }
1347 
1348 static void
1349 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
1350 {
1351 	struct spdk_bdev *bdev = bdev_io->bdev;
1352 	bool buf_allocated;
1353 	uint64_t alignment;
1354 	void *aligned_buf;
1355 
1356 	bdev_io->internal.buf = buf;
1357 
1358 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1359 		bdev_io_get_buf_complete(bdev_io, true);
1360 		return;
1361 	}
1362 
1363 	alignment = spdk_bdev_get_buf_align(bdev);
1364 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
1365 	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
1366 
1367 	if (buf_allocated) {
1368 		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
1369 		/* Continue in completion callback */
1370 		return;
1371 	} else {
1372 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
1373 	}
1374 
1375 	_bdev_io_set_md_buf(bdev_io);
1376 }
1377 
1378 static inline uint64_t
1379 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
1380 {
1381 	struct spdk_bdev *bdev = bdev_io->bdev;
1382 	uint64_t md_len, alignment;
1383 
1384 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1385 
1386 	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
1387 	alignment = spdk_bdev_get_buf_align(bdev) - 1;
1388 
1389 	return len + alignment + md_len;
1390 }
1391 
1392 static void
1393 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1394 {
1395 	struct spdk_bdev_mgmt_channel *ch;
1396 
1397 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1398 	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1399 }
1400 
1401 static void
1402 bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1403 {
1404 	assert(bdev_io->internal.buf != NULL);
1405 	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
1406 	bdev_io->internal.buf = NULL;
1407 }
1408 
1409 void
1410 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1411 {
1412 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1413 
1414 	assert(buf != NULL);
1415 	_bdev_io_put_buf(bdev_io, buf, len);
1416 }
1417 
1418 static inline void
1419 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
1420 		    struct spdk_bdev_io *bdev_io)
1421 {
1422 	/* After a request is submitted to a bdev module, the ownership of an accel sequence
1423 	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
1424 	 * sequence pointer to make sure we won't touch it anymore. */
1425 	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
1426 	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
1427 		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1428 		bdev_io->internal.accel_sequence = NULL;
1429 	}
1430 
1431 	bdev->fn_table->submit_request(ioch, bdev_io);
1432 }
1433 
1434 static inline void
1435 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
1436 {
1437 	struct spdk_bdev *bdev = bdev_io->bdev;
1438 
1439 	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
1440 	bdev_io->internal.error.nvme.cdw0 = 0;
1441 	bdev_io->num_retries++;
1442 	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1443 }
1444 
1445 static void
1446 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
1447 {
1448 	struct spdk_bdev_io *bdev_io;
1449 
1450 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1451 		/*
1452 		 * Allow some more I/O to complete before retrying the nomem_io queue.
1453 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
1454 		 *  the context of a completion, because the resources for the I/O are
1455 		 *  not released until control returns to the bdev poller.  Also, we
1456 		 *  may require several small I/O to complete before a larger I/O
1457 		 *  (that requires splitting) can be submitted.
1458 		 */
1459 		return;
1460 	}
1461 
1462 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1463 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1464 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1465 
1466 		switch (bdev_io->internal.retry_state) {
1467 		case BDEV_IO_RETRY_STATE_SUBMIT:
1468 			bdev_ch_resubmit_io(shared_resource, bdev_io);
1469 			break;
1470 		case BDEV_IO_RETRY_STATE_PULL:
1471 			bdev_io_pull_data(bdev_io);
1472 			break;
1473 		case BDEV_IO_RETRY_STATE_PULL_MD:
1474 			bdev_io_pull_md_buf(bdev_io);
1475 			break;
1476 		case BDEV_IO_RETRY_STATE_PUSH:
1477 			bdev_io_push_bounce_data(bdev_io);
1478 			break;
1479 		case BDEV_IO_RETRY_STATE_PUSH_MD:
1480 			bdev_io_push_bounce_md_buf(bdev_io);
1481 			break;
1482 		default:
1483 			assert(0 && "invalid retry state");
1484 			break;
1485 		}
1486 
1487 		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
1488 			/* This IO completed again with NOMEM status, so break the loop and
1489 			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
1490 			 * always gets requeued at the front of the list, to maintain
1491 			 * ordering.
1492 			 */
1493 			break;
1494 		}
1495 	}
1496 }
1497 
1498 static void
1499 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1500 {
1501 	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
1502 }
1503 
1504 static int
1505 bdev_no_mem_poller(void *ctx)
1506 {
1507 	struct spdk_bdev_shared_resource *shared_resource = ctx;
1508 
1509 	spdk_poller_unregister(&shared_resource->nomem_poller);
1510 
1511 	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1512 		bdev_shared_ch_retry_io(shared_resource);
1513 	}
1514 	if (!TAILQ_EMPTY(&shared_resource->nomem_io) && shared_resource->io_outstanding == 0) {
1515 		/* No IOs were submitted, try again */
1516 		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1517 						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1518 	}
1519 
1520 	return SPDK_POLLER_BUSY;
1521 }
1522 
1523 static inline bool
1524 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1525 {
1526 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1527 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1528 
1529 	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1530 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1531 		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);
1532 
1533 		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
1534 			/* Special case when we have nomem IOs and no outstanding IOs which completions
1535 			 * could trigger retry of queued IOs
1536 			 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no
1537 			 * new IOs submitted, e.g. qd==1 */
1538 			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1539 							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1540 		}
1541 		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
1542 		 * ownership of that sequence is transferred back to the bdev layer, so we need to
1543 		 * restore internal.accel_sequence to make sure that the sequence is handled
1544 		 * correctly in case the I/O is later aborted. */
1545 		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
1546 		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
1547 			assert(bdev_io->internal.accel_sequence == NULL);
1548 			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
1549 		}
1550 
1551 		return true;
1552 	}
1553 
1554 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1555 		bdev_ch_retry_io(bdev_ch);
1556 	}
1557 
1558 	return false;
1559 }
1560 
1561 static void
1562 _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1563 {
1564 	struct spdk_bdev_io *bdev_io = ctx;
1565 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1566 
1567 	if (rc) {
1568 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1569 	}
1570 	/* We want to free the bounce buffer here since we know we're done with it (as opposed
1571 	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
1572 	 */
1573 	bdev_io_put_buf(bdev_io);
1574 
1575 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1576 		bdev_ch_retry_io(ch);
1577 	}
1578 
1579 	/* Continue with IO completion flow */
1580 	bdev_io_complete(bdev_io);
1581 }
1582 
1583 static void
1584 bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
1585 {
1586 	struct spdk_bdev_io *bdev_io = ctx;
1587 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1588 
1589 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1590 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1591 
1592 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1593 		bdev_ch_retry_io(ch);
1594 	}
1595 
1596 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1597 }
1598 
1599 static inline void
1600 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
1601 {
1602 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1603 	int rc = 0;
1604 
1605 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1606 	/* do the same for metadata buffer */
1607 	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
1608 		assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1609 
1610 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1611 			if (bdev_io_use_memory_domain(bdev_io)) {
1612 				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1613 				bdev_io_increment_outstanding(ch, ch->shared_resource);
1614 				/* If memory domain is used then we need to call async push function */
1615 				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1616 								  bdev_io->internal.memory_domain_ctx,
1617 								  &bdev_io->internal.orig_md_iov,
1618 								  (uint32_t)bdev_io->internal.orig_iovcnt,
1619 								  &bdev_io->internal.bounce_md_iov, 1,
1620 								  bdev_io_push_bounce_md_buf_done,
1621 								  bdev_io);
1622 				if (rc == 0) {
1623 					/* Continue IO completion in async callback */
1624 					return;
1625 				}
1626 				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1627 				bdev_io_decrement_outstanding(ch, ch->shared_resource);
1628 				if (rc != -ENOMEM) {
1629 					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1630 						    spdk_memory_domain_get_dma_device_id(
1631 							    bdev_io->internal.memory_domain));
1632 				}
1633 			} else {
1634 				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1635 				       bdev_io->internal.orig_md_iov.iov_len);
1636 			}
1637 		}
1638 	}
1639 
1640 	if (spdk_unlikely(rc == -ENOMEM)) {
1641 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
1642 	} else {
1643 		assert(bdev_io->internal.data_transfer_cpl);
1644 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1645 	}
1646 }
1647 
1648 static inline void
1649 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
1650 {
1651 	assert(bdev_io->internal.data_transfer_cpl);
1652 	if (rc) {
1653 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1654 		return;
1655 	}
1656 
1657 	/* set original buffer for this io */
1658 	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
1659 	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
1660 	/* disable bouncing buffer for this io */
1661 	bdev_io->internal.orig_iovcnt = 0;
1662 	bdev_io->internal.orig_iovs = NULL;
1663 
1664 	bdev_io_push_bounce_md_buf(bdev_io);
1665 }
1666 
1667 static void
1668 bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
1669 {
1670 	struct spdk_bdev_io *bdev_io = ctx;
1671 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1672 
1673 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1674 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1675 
1676 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1677 		bdev_ch_retry_io(ch);
1678 	}
1679 
1680 	bdev_io_push_bounce_data_done(bdev_io, status);
1681 }
1682 
1683 static inline void
1684 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
1685 {
1686 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1687 	int rc = 0;
1688 
1689 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1690 	assert(!bdev_io_use_accel_sequence(bdev_io));
1691 
1692 	/* if this is read path, copy data from bounce buffer to original buffer */
1693 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1694 		if (bdev_io_use_memory_domain(bdev_io)) {
1695 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1696 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1697 			/* If memory domain is used then we need to call async push function */
1698 			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1699 							  bdev_io->internal.memory_domain_ctx,
1700 							  bdev_io->internal.orig_iovs,
1701 							  (uint32_t)bdev_io->internal.orig_iovcnt,
1702 							  &bdev_io->internal.bounce_iov, 1,
1703 							  bdev_io_push_bounce_data_done_and_track,
1704 							  bdev_io);
1705 			if (rc == 0) {
1706 				/* Continue IO completion in async callback */
1707 				return;
1708 			}
1709 
1710 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1711 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1712 			if (rc != -ENOMEM) {
1713 				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1714 					    spdk_memory_domain_get_dma_device_id(
1715 						    bdev_io->internal.memory_domain));
1716 			}
1717 		} else {
1718 			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
1719 					      bdev_io->internal.orig_iovcnt,
1720 					      bdev_io->internal.bounce_iov.iov_base,
1721 					      bdev_io->internal.bounce_iov.iov_len);
1722 		}
1723 	}
1724 
1725 	if (spdk_unlikely(rc == -ENOMEM)) {
1726 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
1727 	} else {
1728 		bdev_io_push_bounce_data_done(bdev_io, rc);
1729 	}
1730 }
1731 
1732 static inline void
1733 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1734 {
1735 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1736 	bdev_io_push_bounce_data(bdev_io);
1737 }
1738 
1739 static void
1740 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1741 {
1742 	struct spdk_bdev_io *bdev_io;
1743 
1744 	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1745 	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
1746 }
1747 
1748 static void
1749 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1750 {
1751 	struct spdk_bdev_mgmt_channel *mgmt_ch;
1752 	uint64_t max_len;
1753 	void *buf;
1754 
1755 	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1756 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1757 	max_len = bdev_io_get_max_buf_len(bdev_io, len);
1758 
1759 	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
1760 		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1761 		bdev_io_get_buf_complete(bdev_io, false);
1762 		return;
1763 	}
1764 
1765 	bdev_io->internal.buf_len = len;
1766 	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1767 			     bdev_io_get_iobuf_cb);
1768 	if (buf != NULL) {
1769 		_bdev_io_set_buf(bdev_io, buf, len);
1770 	}
1771 }
1772 
1773 void
1774 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1775 {
1776 	struct spdk_bdev *bdev = bdev_io->bdev;
1777 	uint64_t alignment;
1778 
1779 	assert(cb != NULL);
1780 	bdev_io->internal.get_buf_cb = cb;
1781 
1782 	alignment = spdk_bdev_get_buf_align(bdev);
1783 
1784 	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1785 	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1786 		/* Buffer already present and aligned */
1787 		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1788 		return;
1789 	}
1790 
1791 	bdev_io_get_buf(bdev_io, len);
1792 }
1793 
1794 static void
1795 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1796 			      bool success)
1797 {
1798 	if (!success) {
1799 		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
1800 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1801 		bdev_io_complete_unsubmitted(bdev_io);
1802 		return;
1803 	}
1804 
1805 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
1806 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1807 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
1808 			return;
1809 		}
1810 		/* For reads we'll execute the sequence after the data is read, so, for now, only
1811 		 * clear out accel_sequence pointer and submit the IO */
1812 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1813 		bdev_io->u.bdev.accel_sequence = NULL;
1814 	}
1815 
1816 	bdev_io_submit(bdev_io);
1817 }
1818 
1819 static void
1820 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1821 			       uint64_t len)
1822 {
1823 	assert(cb != NULL);
1824 	bdev_io->internal.get_buf_cb = cb;
1825 
1826 	bdev_io_get_buf(bdev_io, len);
1827 }
1828 
1829 void
1830 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
1831 {
1832 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1833 
1834 	assert(cb != NULL);
1835 	assert(bdev_io->internal.get_aux_buf_cb == NULL);
1836 	bdev_io->internal.get_aux_buf_cb = cb;
1837 	bdev_io_get_buf(bdev_io, len);
1838 }
1839 
1840 static int
1841 bdev_module_get_max_ctx_size(void)
1842 {
1843 	struct spdk_bdev_module *bdev_module;
1844 	int max_bdev_module_size = 0;
1845 
1846 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1847 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
1848 			max_bdev_module_size = bdev_module->get_ctx_size();
1849 		}
1850 	}
1851 
1852 	return max_bdev_module_size;
1853 }
1854 
1855 static void
1856 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1857 {
1858 	if (!bdev->internal.histogram_enabled) {
1859 		return;
1860 	}
1861 
1862 	spdk_json_write_object_begin(w);
1863 	spdk_json_write_named_string(w, "method", "bdev_enable_histogram");
1864 
1865 	spdk_json_write_named_object_begin(w, "params");
1866 	spdk_json_write_named_string(w, "name", bdev->name);
1867 
1868 	spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled);
1869 	spdk_json_write_object_end(w);
1870 
1871 	spdk_json_write_object_end(w);
1872 }
1873 
1874 static void
1875 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1876 {
1877 	int i;
1878 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1879 	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
1880 
1881 	if (!qos) {
1882 		return;
1883 	}
1884 
1885 	spdk_bdev_get_qos_rate_limits(bdev, limits);
1886 
1887 	spdk_json_write_object_begin(w);
1888 	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
1889 
1890 	spdk_json_write_named_object_begin(w, "params");
1891 	spdk_json_write_named_string(w, "name", bdev->name);
1892 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
1893 		if (limits[i] > 0) {
1894 			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
1895 		}
1896 	}
1897 	spdk_json_write_object_end(w);
1898 
1899 	spdk_json_write_object_end(w);
1900 }
1901 
1902 void
1903 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
1904 {
1905 	struct spdk_bdev_module *bdev_module;
1906 	struct spdk_bdev *bdev;
1907 
1908 	assert(w != NULL);
1909 
1910 	spdk_json_write_array_begin(w);
1911 
1912 	spdk_json_write_object_begin(w);
1913 	spdk_json_write_named_string(w, "method", "bdev_set_options");
1914 	spdk_json_write_named_object_begin(w, "params");
1915 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
1916 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
1917 	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
1918 	spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size);
1919 	spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size);
1920 	spdk_json_write_object_end(w);
1921 	spdk_json_write_object_end(w);
1922 
1923 	bdev_examine_allowlist_config_json(w);
1924 
1925 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1926 		if (bdev_module->config_json) {
1927 			bdev_module->config_json(w);
1928 		}
1929 	}
1930 
1931 	spdk_spin_lock(&g_bdev_mgr.spinlock);
1932 
1933 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
1934 		if (bdev->fn_table->write_config_json) {
1935 			bdev->fn_table->write_config_json(bdev, w);
1936 		}
1937 
1938 		bdev_qos_config_json(bdev, w);
1939 		bdev_enable_histogram_config_json(bdev, w);
1940 	}
1941 
1942 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
1943 
1944 	/* This has to be last RPC in array to make sure all bdevs finished examine */
1945 	spdk_json_write_object_begin(w);
1946 	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
1947 	spdk_json_write_object_end(w);
1948 
1949 	spdk_json_write_array_end(w);
1950 }
1951 
1952 static void
1953 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
1954 {
1955 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1956 	struct spdk_bdev_io *bdev_io;
1957 
1958 	spdk_iobuf_channel_fini(&ch->iobuf);
1959 
1960 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
1961 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
1962 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
1963 		ch->per_thread_cache_count--;
1964 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1965 	}
1966 
1967 	assert(ch->per_thread_cache_count == 0);
1968 }
1969 
1970 static int
1971 bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
1972 {
1973 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1974 	struct spdk_bdev_io *bdev_io;
1975 	uint32_t i;
1976 	int rc;
1977 
1978 	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev",
1979 				     g_bdev_opts.iobuf_small_cache_size,
1980 				     g_bdev_opts.iobuf_large_cache_size);
1981 	if (rc != 0) {
1982 		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
1983 		return -1;
1984 	}
1985 
1986 	STAILQ_INIT(&ch->per_thread_cache);
1987 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
1988 
1989 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
1990 	ch->per_thread_cache_count = 0;
1991 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
1992 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
1993 		if (bdev_io == NULL) {
1994 			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
1995 			assert(false);
1996 			bdev_mgmt_channel_destroy(io_device, ctx_buf);
1997 			return -1;
1998 		}
1999 		ch->per_thread_cache_count++;
2000 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2001 	}
2002 
2003 	TAILQ_INIT(&ch->shared_resources);
2004 	TAILQ_INIT(&ch->io_wait_queue);
2005 
2006 	return 0;
2007 }
2008 
2009 static void
2010 bdev_init_complete(int rc)
2011 {
2012 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
2013 	void *cb_arg = g_init_cb_arg;
2014 	struct spdk_bdev_module *m;
2015 
2016 	g_bdev_mgr.init_complete = true;
2017 	g_init_cb_fn = NULL;
2018 	g_init_cb_arg = NULL;
2019 
2020 	/*
2021 	 * For modules that need to know when subsystem init is complete,
2022 	 * inform them now.
2023 	 */
2024 	if (rc == 0) {
2025 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2026 			if (m->init_complete) {
2027 				m->init_complete();
2028 			}
2029 		}
2030 	}
2031 
2032 	cb_fn(cb_arg, rc);
2033 }
2034 
2035 static bool
2036 bdev_module_all_actions_completed(void)
2037 {
2038 	struct spdk_bdev_module *m;
2039 
2040 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2041 		if (m->internal.action_in_progress > 0) {
2042 			return false;
2043 		}
2044 	}
2045 	return true;
2046 }
2047 
2048 static void
2049 bdev_module_action_complete(void)
2050 {
2051 	/*
2052 	 * Don't finish bdev subsystem initialization if
2053 	 * module pre-initialization is still in progress, or
2054 	 * the subsystem been already initialized.
2055 	 */
2056 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
2057 		return;
2058 	}
2059 
2060 	/*
2061 	 * Check all bdev modules for inits/examinations in progress. If any
2062 	 * exist, return immediately since we cannot finish bdev subsystem
2063 	 * initialization until all are completed.
2064 	 */
2065 	if (!bdev_module_all_actions_completed()) {
2066 		return;
2067 	}
2068 
2069 	/*
2070 	 * Modules already finished initialization - now that all
2071 	 * the bdev modules have finished their asynchronous I/O
2072 	 * processing, the entire bdev layer can be marked as complete.
2073 	 */
2074 	bdev_init_complete(0);
2075 }
2076 
2077 static void
2078 bdev_module_action_done(struct spdk_bdev_module *module)
2079 {
2080 	spdk_spin_lock(&module->internal.spinlock);
2081 	assert(module->internal.action_in_progress > 0);
2082 	module->internal.action_in_progress--;
2083 	spdk_spin_unlock(&module->internal.spinlock);
2084 	bdev_module_action_complete();
2085 }
2086 
2087 void
2088 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
2089 {
2090 	assert(module->async_init);
2091 	bdev_module_action_done(module);
2092 }
2093 
2094 void
2095 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
2096 {
2097 	bdev_module_action_done(module);
2098 }
2099 
2100 /** The last initialized bdev module */
2101 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
2102 
2103 static void
2104 bdev_init_failed(void *cb_arg)
2105 {
2106 	struct spdk_bdev_module *module = cb_arg;
2107 
2108 	spdk_spin_lock(&module->internal.spinlock);
2109 	assert(module->internal.action_in_progress > 0);
2110 	module->internal.action_in_progress--;
2111 	spdk_spin_unlock(&module->internal.spinlock);
2112 	bdev_init_complete(-1);
2113 }
2114 
2115 static int
2116 bdev_modules_init(void)
2117 {
2118 	struct spdk_bdev_module *module;
2119 	int rc = 0;
2120 
2121 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2122 		g_resume_bdev_module = module;
2123 		if (module->async_init) {
2124 			spdk_spin_lock(&module->internal.spinlock);
2125 			module->internal.action_in_progress = 1;
2126 			spdk_spin_unlock(&module->internal.spinlock);
2127 		}
2128 		rc = module->module_init();
2129 		if (rc != 0) {
2130 			/* Bump action_in_progress to prevent other modules from completion of modules_init
2131 			 * Send message to defer application shutdown until resources are cleaned up */
2132 			spdk_spin_lock(&module->internal.spinlock);
2133 			module->internal.action_in_progress = 1;
2134 			spdk_spin_unlock(&module->internal.spinlock);
2135 			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
2136 			return rc;
2137 		}
2138 	}
2139 
2140 	g_resume_bdev_module = NULL;
2141 	return 0;
2142 }
2143 
2144 void
2145 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
2146 {
2147 	int rc = 0;
2148 	char mempool_name[32];
2149 
2150 	assert(cb_fn != NULL);
2151 
2152 	g_init_cb_fn = cb_fn;
2153 	g_init_cb_arg = cb_arg;
2154 
2155 	spdk_notify_type_register("bdev_register");
2156 	spdk_notify_type_register("bdev_unregister");
2157 
2158 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
2159 
2160 	rc = spdk_iobuf_register_module("bdev");
2161 	if (rc != 0) {
2162 		SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
2163 		bdev_init_complete(-1);
2164 		return;
2165 	}
2166 
2167 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
2168 				  g_bdev_opts.bdev_io_pool_size,
2169 				  sizeof(struct spdk_bdev_io) +
2170 				  bdev_module_get_max_ctx_size(),
2171 				  0,
2172 				  SPDK_ENV_SOCKET_ID_ANY);
2173 
2174 	if (g_bdev_mgr.bdev_io_pool == NULL) {
2175 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
2176 		bdev_init_complete(-1);
2177 		return;
2178 	}
2179 
2180 	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
2181 					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
2182 	if (!g_bdev_mgr.zero_buffer) {
2183 		SPDK_ERRLOG("create bdev zero buffer failed\n");
2184 		bdev_init_complete(-1);
2185 		return;
2186 	}
2187 
2188 #ifdef SPDK_CONFIG_VTUNE
2189 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
2190 #endif
2191 
2192 	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
2193 				bdev_mgmt_channel_destroy,
2194 				sizeof(struct spdk_bdev_mgmt_channel),
2195 				"bdev_mgr");
2196 
2197 	rc = bdev_modules_init();
2198 	g_bdev_mgr.module_init_complete = true;
2199 	if (rc != 0) {
2200 		SPDK_ERRLOG("bdev modules init failed\n");
2201 		return;
2202 	}
2203 
2204 	bdev_module_action_complete();
2205 }
2206 
2207 static void
2208 bdev_mgr_unregister_cb(void *io_device)
2209 {
2210 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
2211 
2212 	if (g_bdev_mgr.bdev_io_pool) {
2213 		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
2214 			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
2215 				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
2216 				    g_bdev_opts.bdev_io_pool_size);
2217 		}
2218 
2219 		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
2220 	}
2221 
2222 	spdk_free(g_bdev_mgr.zero_buffer);
2223 
2224 	bdev_examine_allowlist_free();
2225 
2226 	cb_fn(g_fini_cb_arg);
2227 	g_fini_cb_fn = NULL;
2228 	g_fini_cb_arg = NULL;
2229 	g_bdev_mgr.init_complete = false;
2230 	g_bdev_mgr.module_init_complete = false;
2231 }
2232 
2233 static void
2234 bdev_module_fini_iter(void *arg)
2235 {
2236 	struct spdk_bdev_module *bdev_module;
2237 
2238 	/* FIXME: Handling initialization failures is broken now,
2239 	 * so we won't even try cleaning up after successfully
2240 	 * initialized modules. if module_init_complete is false,
2241 	 * just call spdk_bdev_mgr_unregister_cb
2242 	 */
2243 	if (!g_bdev_mgr.module_init_complete) {
2244 		bdev_mgr_unregister_cb(NULL);
2245 		return;
2246 	}
2247 
2248 	/* Start iterating from the last touched module */
2249 	if (!g_resume_bdev_module) {
2250 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2251 	} else {
2252 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
2253 					 internal.tailq);
2254 	}
2255 
2256 	while (bdev_module) {
2257 		if (bdev_module->async_fini) {
2258 			/* Save our place so we can resume later. We must
2259 			 * save the variable here, before calling module_fini()
2260 			 * below, because in some cases the module may immediately
2261 			 * call spdk_bdev_module_fini_done() and re-enter
2262 			 * this function to continue iterating. */
2263 			g_resume_bdev_module = bdev_module;
2264 		}
2265 
2266 		if (bdev_module->module_fini) {
2267 			bdev_module->module_fini();
2268 		}
2269 
2270 		if (bdev_module->async_fini) {
2271 			return;
2272 		}
2273 
2274 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
2275 					 internal.tailq);
2276 	}
2277 
2278 	g_resume_bdev_module = NULL;
2279 	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
2280 }
2281 
2282 void
2283 spdk_bdev_module_fini_done(void)
2284 {
2285 	if (spdk_get_thread() != g_fini_thread) {
2286 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
2287 	} else {
2288 		bdev_module_fini_iter(NULL);
2289 	}
2290 }
2291 
2292 static void
2293 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
2294 {
2295 	struct spdk_bdev *bdev = cb_arg;
2296 
2297 	if (bdeverrno && bdev) {
2298 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
2299 			     bdev->name);
2300 
2301 		/*
2302 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
2303 		 *  bdev; try to continue by manually removing this bdev from the list and continue
2304 		 *  with the next bdev in the list.
2305 		 */
2306 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2307 	}
2308 
2309 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2310 		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2311 		/*
2312 		 * Bdev module finish need to be deferred as we might be in the middle of some context
2313 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2314 		 * after returning.
2315 		 */
2316 		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2317 		return;
2318 	}
2319 
2320 	/*
2321 	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
2322 	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2323 	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
2324 	 * base bdevs.
2325 	 *
2326 	 * Also, walk the list in the reverse order.
2327 	 */
2328 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2329 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2330 		spdk_spin_lock(&bdev->internal.spinlock);
2331 		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2332 			LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2333 			spdk_spin_unlock(&bdev->internal.spinlock);
2334 			continue;
2335 		}
2336 		spdk_spin_unlock(&bdev->internal.spinlock);
2337 
2338 		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2339 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2340 		return;
2341 	}
2342 
2343 	/*
2344 	 * If any bdev fails to unclaim underlying bdev properly, we may face the
2345 	 * case of bdev list consisting of claimed bdevs only (if claims are managed
2346 	 * correctly, this would mean there's a loop in the claims graph which is
2347 	 * clearly impossible). Warn and unregister last bdev on the list then.
2348 	 */
2349 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2350 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2351 		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2352 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2353 		return;
2354 	}
2355 }
2356 
2357 static void
2358 bdev_module_fini_start_iter(void *arg)
2359 {
2360 	struct spdk_bdev_module *bdev_module;
2361 
2362 	if (!g_resume_bdev_module) {
2363 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2364 	} else {
2365 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2366 	}
2367 
2368 	while (bdev_module) {
2369 		if (bdev_module->async_fini_start) {
2370 			/* Save our place so we can resume later. We must
2371 			 * save the variable here, before calling fini_start()
2372 			 * below, because in some cases the module may immediately
2373 			 * call spdk_bdev_module_fini_start_done() and re-enter
2374 			 * this function to continue iterating. */
2375 			g_resume_bdev_module = bdev_module;
2376 		}
2377 
2378 		if (bdev_module->fini_start) {
2379 			bdev_module->fini_start();
2380 		}
2381 
2382 		if (bdev_module->async_fini_start) {
2383 			return;
2384 		}
2385 
2386 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
2387 	}
2388 
2389 	g_resume_bdev_module = NULL;
2390 
2391 	bdev_finish_unregister_bdevs_iter(NULL, 0);
2392 }
2393 
2394 void
2395 spdk_bdev_module_fini_start_done(void)
2396 {
2397 	if (spdk_get_thread() != g_fini_thread) {
2398 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
2399 	} else {
2400 		bdev_module_fini_start_iter(NULL);
2401 	}
2402 }
2403 
2404 static void
2405 bdev_finish_wait_for_examine_done(void *cb_arg)
2406 {
2407 	bdev_module_fini_start_iter(NULL);
2408 }
2409 
2410 static void bdev_open_async_fini(void);
2411 
2412 void
2413 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
2414 {
2415 	int rc;
2416 
2417 	assert(cb_fn != NULL);
2418 
2419 	g_fini_thread = spdk_get_thread();
2420 
2421 	g_fini_cb_fn = cb_fn;
2422 	g_fini_cb_arg = cb_arg;
2423 
2424 	bdev_open_async_fini();
2425 
2426 	rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
2427 	if (rc != 0) {
2428 		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
2429 		bdev_finish_wait_for_examine_done(NULL);
2430 	}
2431 }
2432 
2433 struct spdk_bdev_io *
2434 bdev_channel_get_io(struct spdk_bdev_channel *channel)
2435 {
2436 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
2437 	struct spdk_bdev_io *bdev_io;
2438 
2439 	if (ch->per_thread_cache_count > 0) {
2440 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2441 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2442 		ch->per_thread_cache_count--;
2443 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
2444 		/*
2445 		 * Don't try to look for bdev_ios in the global pool if there are
2446 		 * waiters on bdev_ios - we don't want this caller to jump the line.
2447 		 */
2448 		bdev_io = NULL;
2449 	} else {
2450 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2451 	}
2452 
2453 	return bdev_io;
2454 }
2455 
2456 void
2457 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2458 {
2459 	struct spdk_bdev_mgmt_channel *ch;
2460 
2461 	assert(bdev_io != NULL);
2462 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
2463 
2464 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
2465 
2466 	if (bdev_io->internal.buf != NULL) {
2467 		bdev_io_put_buf(bdev_io);
2468 	}
2469 
2470 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
2471 		ch->per_thread_cache_count++;
2472 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2473 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
2474 			struct spdk_bdev_io_wait_entry *entry;
2475 
2476 			entry = TAILQ_FIRST(&ch->io_wait_queue);
2477 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
2478 			entry->cb_fn(entry->cb_arg);
2479 		}
2480 	} else {
2481 		/* We should never have a full cache with entries on the io wait queue. */
2482 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
2483 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2484 	}
2485 }
2486 
2487 static bool
2488 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2489 {
2490 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2491 
2492 	switch (limit) {
2493 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2494 		return true;
2495 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2496 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2497 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2498 		return false;
2499 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2500 	default:
2501 		return false;
2502 	}
2503 }
2504 
2505 static bool
2506 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2507 {
2508 	switch (bdev_io->type) {
2509 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2510 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2511 	case SPDK_BDEV_IO_TYPE_READ:
2512 	case SPDK_BDEV_IO_TYPE_WRITE:
2513 		return true;
2514 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2515 		if (bdev_io->u.bdev.zcopy.start) {
2516 			return true;
2517 		} else {
2518 			return false;
2519 		}
2520 	default:
2521 		return false;
2522 	}
2523 }
2524 
2525 static bool
2526 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2527 {
2528 	switch (bdev_io->type) {
2529 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2530 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2531 		/* Bit 1 (0x2) set for read operation */
2532 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2533 			return true;
2534 		} else {
2535 			return false;
2536 		}
2537 	case SPDK_BDEV_IO_TYPE_READ:
2538 		return true;
2539 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2540 		/* Populate to read from disk */
2541 		if (bdev_io->u.bdev.zcopy.populate) {
2542 			return true;
2543 		} else {
2544 			return false;
2545 		}
2546 	default:
2547 		return false;
2548 	}
2549 }
2550 
2551 static uint64_t
2552 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2553 {
2554 	struct spdk_bdev	*bdev = bdev_io->bdev;
2555 
2556 	switch (bdev_io->type) {
2557 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2558 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2559 		return bdev_io->u.nvme_passthru.nbytes;
2560 	case SPDK_BDEV_IO_TYPE_READ:
2561 	case SPDK_BDEV_IO_TYPE_WRITE:
2562 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2563 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2564 		/* Track the data in the start phase only */
2565 		if (bdev_io->u.bdev.zcopy.start) {
2566 			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2567 		} else {
2568 			return 0;
2569 		}
2570 	default:
2571 		return 0;
2572 	}
2573 }
2574 
2575 static inline bool
2576 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2577 {
2578 	int64_t remaining_this_timeslice;
2579 
2580 	if (!limit->max_per_timeslice) {
2581 		/* The QoS is disabled */
2582 		return false;
2583 	}
2584 
2585 	remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2586 				   __ATOMIC_RELAXED);
2587 	if (remaining_this_timeslice + (int64_t)delta > 0) {
2588 		/* There was still a quota for this delta -> the IO shouldn't be queued
2589 		 *
2590 		 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
2591 		 * quota can be allowed once a while. Such overrun then taken into account in
2592 		 * the QoS poller, where the next timeslice quota is calculated.
2593 		 */
2594 		return false;
2595 	}
2596 
2597 	/* There was no quota for this delta -> the IO should be queued
2598 	 * The remaining_this_timeslice must be rewinded so it reflects the real
2599 	 * amount of IOs or bytes allowed.
2600 	 */
2601 	__atomic_add_fetch(
2602 		&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2603 	return true;
2604 }
2605 
2606 static inline void
2607 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2608 {
2609 	__atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2610 }
2611 
2612 static bool
2613 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2614 {
2615 	return bdev_qos_rw_queue_io(limit, io, 1);
2616 }
2617 
2618 static void
2619 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2620 {
2621 	bdev_qos_rw_rewind_io(limit, io, 1);
2622 }
2623 
2624 static bool
2625 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2626 {
2627 	return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io));
2628 }
2629 
2630 static void
2631 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2632 {
2633 	bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2634 }
2635 
2636 static bool
2637 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2638 {
2639 	if (bdev_is_read_io(io) == false) {
2640 		return false;
2641 	}
2642 
2643 	return bdev_qos_rw_bps_queue(limit, io);
2644 }
2645 
2646 static void
2647 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2648 {
2649 	if (bdev_is_read_io(io) != false) {
2650 		bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2651 	}
2652 }
2653 
2654 static bool
2655 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2656 {
2657 	if (bdev_is_read_io(io) == true) {
2658 		return false;
2659 	}
2660 
2661 	return bdev_qos_rw_bps_queue(limit, io);
2662 }
2663 
2664 static void
2665 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2666 {
2667 	if (bdev_is_read_io(io) != true) {
2668 		bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2669 	}
2670 }
2671 
2672 static void
2673 bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2674 {
2675 	int i;
2676 
2677 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2678 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2679 			qos->rate_limits[i].queue_io = NULL;
2680 			continue;
2681 		}
2682 
2683 		switch (i) {
2684 		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2685 			qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue;
2686 			qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota;
2687 			break;
2688 		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2689 			qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue;
2690 			qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota;
2691 			break;
2692 		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2693 			qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue;
2694 			qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota;
2695 			break;
2696 		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2697 			qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue;
2698 			qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota;
2699 			break;
2700 		default:
2701 			break;
2702 		}
2703 	}
2704 }
2705 
2706 static void
2707 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2708 			    struct spdk_bdev_io *bdev_io,
2709 			    enum spdk_bdev_io_status status)
2710 {
2711 	bdev_io->internal.in_submit_request = true;
2712 	bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource);
2713 	spdk_bdev_io_complete(bdev_io, status);
2714 	bdev_io->internal.in_submit_request = false;
2715 }
2716 
2717 static inline void
2718 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2719 {
2720 	struct spdk_bdev *bdev = bdev_io->bdev;
2721 	struct spdk_io_channel *ch = bdev_ch->channel;
2722 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2723 
2724 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2725 		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2726 		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2727 
2728 		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2729 		    bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2730 			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
2731 						    SPDK_BDEV_IO_STATUS_SUCCESS);
2732 			return;
2733 		}
2734 	}
2735 
2736 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2737 			  bdev_io->bdev->split_on_write_unit &&
2738 			  bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2739 		SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n",
2740 			    bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2741 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2742 		return;
2743 	}
2744 
2745 	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2746 		bdev_io_increment_outstanding(bdev_ch, shared_resource);
2747 		bdev_io->internal.in_submit_request = true;
2748 		bdev_submit_request(bdev, ch, bdev_io);
2749 		bdev_io->internal.in_submit_request = false;
2750 	} else {
2751 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
2752 		if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) {
2753 			/* Special case when we have nomem IOs and no outstanding IOs which completions
2754 			 * could trigger retry of queued IOs */
2755 			bdev_shared_ch_retry_io(shared_resource);
2756 		}
2757 	}
2758 }
2759 
2760 static bool
2761 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2762 {
2763 	int i;
2764 
2765 	if (bdev_qos_io_to_limit(bdev_io) == true) {
2766 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2767 			if (!qos->rate_limits[i].queue_io) {
2768 				continue;
2769 			}
2770 
2771 			if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2772 							 bdev_io) == true) {
2773 				for (i -= 1; i >= 0 ; i--) {
2774 					if (!qos->rate_limits[i].queue_io) {
2775 						continue;
2776 					}
2777 
2778 					qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io);
2779 				}
2780 				return true;
2781 			}
2782 		}
2783 	}
2784 
2785 	return false;
2786 }
2787 
2788 static int
2789 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
2790 {
2791 	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
2792 	int				submitted_ios = 0;
2793 
2794 	TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) {
2795 		if (!bdev_qos_queue_io(qos, bdev_io)) {
2796 			TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link);
2797 			bdev_io_do_submit(ch, bdev_io);
2798 
2799 			submitted_ios++;
2800 		}
2801 	}
2802 
2803 	return submitted_ios;
2804 }
2805 
2806 static void
2807 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
2808 {
2809 	int rc;
2810 
2811 	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
2812 	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
2813 	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
2814 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
2815 				     &bdev_io->internal.waitq_entry);
2816 	if (rc != 0) {
2817 		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
2818 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2819 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2820 	}
2821 }
2822 
2823 static bool
2824 bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
2825 {
2826 	uint32_t io_boundary;
2827 	struct spdk_bdev *bdev = bdev_io->bdev;
2828 	uint32_t max_segment_size = bdev->max_segment_size;
2829 	uint32_t max_size = bdev->max_rw_size;
2830 	int max_segs = bdev->max_num_segments;
2831 
2832 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
2833 		io_boundary = bdev->write_unit_size;
2834 	} else if (bdev->split_on_optimal_io_boundary) {
2835 		io_boundary = bdev->optimal_io_boundary;
2836 	} else {
2837 		io_boundary = 0;
2838 	}
2839 
2840 	if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) {
2841 		return false;
2842 	}
2843 
2844 	if (io_boundary) {
2845 		uint64_t start_stripe, end_stripe;
2846 
2847 		start_stripe = bdev_io->u.bdev.offset_blocks;
2848 		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
2849 		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
2850 		if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
2851 			start_stripe >>= spdk_u32log2(io_boundary);
2852 			end_stripe >>= spdk_u32log2(io_boundary);
2853 		} else {
2854 			start_stripe /= io_boundary;
2855 			end_stripe /= io_boundary;
2856 		}
2857 
2858 		if (start_stripe != end_stripe) {
2859 			return true;
2860 		}
2861 	}
2862 
2863 	if (max_segs) {
2864 		if (bdev_io->u.bdev.iovcnt > max_segs) {
2865 			return true;
2866 		}
2867 	}
2868 
2869 	if (max_segment_size) {
2870 		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
2871 			if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) {
2872 				return true;
2873 			}
2874 		}
2875 	}
2876 
2877 	if (max_size) {
2878 		if (bdev_io->u.bdev.num_blocks > max_size) {
2879 			return true;
2880 		}
2881 	}
2882 
2883 	return false;
2884 }
2885 
2886 static bool
2887 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
2888 {
2889 	uint32_t num_unmap_segments;
2890 
2891 	if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
2892 		return false;
2893 	}
2894 	num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
2895 	if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
2896 		return true;
2897 	}
2898 
2899 	return false;
2900 }
2901 
2902 static bool
2903 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
2904 {
2905 	if (!bdev_io->bdev->max_write_zeroes) {
2906 		return false;
2907 	}
2908 
2909 	if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
2910 		return true;
2911 	}
2912 
2913 	return false;
2914 }
2915 
2916 static bool
2917 bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
2918 {
2919 	if (bdev_io->bdev->max_copy != 0 &&
2920 	    bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
2921 		return true;
2922 	}
2923 
2924 	return false;
2925 }
2926 
2927 static bool
2928 bdev_io_should_split(struct spdk_bdev_io *bdev_io)
2929 {
2930 	switch (bdev_io->type) {
2931 	case SPDK_BDEV_IO_TYPE_READ:
2932 	case SPDK_BDEV_IO_TYPE_WRITE:
2933 		return bdev_rw_should_split(bdev_io);
2934 	case SPDK_BDEV_IO_TYPE_UNMAP:
2935 		return bdev_unmap_should_split(bdev_io);
2936 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2937 		return bdev_write_zeroes_should_split(bdev_io);
2938 	case SPDK_BDEV_IO_TYPE_COPY:
2939 		return bdev_copy_should_split(bdev_io);
2940 	default:
2941 		return false;
2942 	}
2943 }
2944 
2945 static uint32_t
2946 _to_next_boundary(uint64_t offset, uint32_t boundary)
2947 {
2948 	return (boundary - (offset % boundary));
2949 }
2950 
2951 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
2952 
2953 static void _bdev_rw_split(void *_bdev_io);
2954 
2955 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
2956 
2957 static void
2958 _bdev_unmap_split(void *_bdev_io)
2959 {
2960 	return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
2961 }
2962 
2963 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
2964 
2965 static void
2966 _bdev_write_zeroes_split(void *_bdev_io)
2967 {
2968 	return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
2969 }
2970 
2971 static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
2972 
2973 static void
2974 _bdev_copy_split(void *_bdev_io)
2975 {
2976 	return bdev_copy_split((struct spdk_bdev_io *)_bdev_io);
2977 }
2978 
2979 static int
2980 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
2981 		     uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
2982 {
2983 	int rc;
2984 	uint64_t current_offset, current_remaining, current_src_offset;
2985 	spdk_bdev_io_wait_cb io_wait_fn;
2986 
2987 	current_offset = *offset;
2988 	current_remaining = *remaining;
2989 
2990 	bdev_io->u.bdev.split_outstanding++;
2991 
2992 	io_wait_fn = _bdev_rw_split;
2993 	switch (bdev_io->type) {
2994 	case SPDK_BDEV_IO_TYPE_READ:
2995 		assert(bdev_io->u.bdev.accel_sequence == NULL);
2996 		rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
2997 					       spdk_io_channel_from_ctx(bdev_io->internal.ch),
2998 					       iov, iovcnt, md_buf, current_offset,
2999 					       num_blocks, bdev_io->internal.memory_domain,
3000 					       bdev_io->internal.memory_domain_ctx, NULL,
3001 					       bdev_io_split_done, bdev_io);
3002 		break;
3003 	case SPDK_BDEV_IO_TYPE_WRITE:
3004 		assert(bdev_io->u.bdev.accel_sequence == NULL);
3005 		rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
3006 						spdk_io_channel_from_ctx(bdev_io->internal.ch),
3007 						iov, iovcnt, md_buf, current_offset,
3008 						num_blocks, bdev_io->internal.memory_domain,
3009 						bdev_io->internal.memory_domain_ctx, NULL,
3010 						bdev_io_split_done, bdev_io);
3011 		break;
3012 	case SPDK_BDEV_IO_TYPE_UNMAP:
3013 		io_wait_fn = _bdev_unmap_split;
3014 		rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
3015 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
3016 					    current_offset, num_blocks,
3017 					    bdev_io_split_done, bdev_io);
3018 		break;
3019 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3020 		io_wait_fn = _bdev_write_zeroes_split;
3021 		rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
3022 						   spdk_io_channel_from_ctx(bdev_io->internal.ch),
3023 						   current_offset, num_blocks,
3024 						   bdev_io_split_done, bdev_io);
3025 		break;
3026 	case SPDK_BDEV_IO_TYPE_COPY:
3027 		io_wait_fn = _bdev_copy_split;
3028 		current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
3029 				     (current_offset - bdev_io->u.bdev.offset_blocks);
3030 		rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
3031 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
3032 					   current_offset, current_src_offset, num_blocks,
3033 					   bdev_io_split_done, bdev_io);
3034 		break;
3035 	default:
3036 		assert(false);
3037 		rc = -EINVAL;
3038 		break;
3039 	}
3040 
3041 	if (rc == 0) {
3042 		current_offset += num_blocks;
3043 		current_remaining -= num_blocks;
3044 		bdev_io->u.bdev.split_current_offset_blocks = current_offset;
3045 		bdev_io->u.bdev.split_remaining_num_blocks = current_remaining;
3046 		*offset = current_offset;
3047 		*remaining = current_remaining;
3048 	} else {
3049 		bdev_io->u.bdev.split_outstanding--;
3050 		if (rc == -ENOMEM) {
3051 			if (bdev_io->u.bdev.split_outstanding == 0) {
3052 				/* No I/O is outstanding. Hence we should wait here. */
3053 				bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
3054 			}
3055 		} else {
3056 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3057 			if (bdev_io->u.bdev.split_outstanding == 0) {
3058 				spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
3059 				TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
3060 				bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3061 			}
3062 		}
3063 	}
3064 
3065 	return rc;
3066 }
3067 
3068 static void
3069 _bdev_rw_split(void *_bdev_io)
3070 {
3071 	struct iovec *parent_iov, *iov;
3072 	struct spdk_bdev_io *bdev_io = _bdev_io;
3073 	struct spdk_bdev *bdev = bdev_io->bdev;
3074 	uint64_t parent_offset, current_offset, remaining;
3075 	uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
3076 	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
3077 	uint32_t iovcnt, iov_len, child_iovsize;
3078 	uint32_t blocklen = bdev->blocklen;
3079 	uint32_t io_boundary;
3080 	uint32_t max_segment_size = bdev->max_segment_size;
3081 	uint32_t max_child_iovcnt = bdev->max_num_segments;
3082 	uint32_t max_size = bdev->max_rw_size;
3083 	void *md_buf = NULL;
3084 	int rc;
3085 
3086 	max_size = max_size ? max_size : UINT32_MAX;
3087 	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
3088 	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
3089 			   SPDK_BDEV_IO_NUM_CHILD_IOV;
3090 
3091 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3092 		io_boundary = bdev->write_unit_size;
3093 	} else if (bdev->split_on_optimal_io_boundary) {
3094 		io_boundary = bdev->optimal_io_boundary;
3095 	} else {
3096 		io_boundary = UINT32_MAX;
3097 	}
3098 
3099 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3100 	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
3101 	parent_offset = bdev_io->u.bdev.offset_blocks;
3102 	parent_iov_offset = (current_offset - parent_offset) * blocklen;
3103 	parent_iovcnt = bdev_io->u.bdev.iovcnt;
3104 
3105 	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
3106 		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3107 		if (parent_iov_offset < parent_iov->iov_len) {
3108 			break;
3109 		}
3110 		parent_iov_offset -= parent_iov->iov_len;
3111 	}
3112 
3113 	child_iovcnt = 0;
3114 	while (remaining > 0 && parent_iovpos < parent_iovcnt &&
3115 	       child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
3116 		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
3117 		to_next_boundary = spdk_min(remaining, to_next_boundary);
3118 		to_next_boundary = spdk_min(max_size, to_next_boundary);
3119 		to_next_boundary_bytes = to_next_boundary * blocklen;
3120 
3121 		iov = &bdev_io->child_iov[child_iovcnt];
3122 		iovcnt = 0;
3123 
3124 		if (bdev_io->u.bdev.md_buf) {
3125 			md_buf = (char *)bdev_io->u.bdev.md_buf +
3126 				 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
3127 		}
3128 
3129 		child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
3130 		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
3131 		       iovcnt < child_iovsize) {
3132 			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3133 			iov_len = parent_iov->iov_len - parent_iov_offset;
3134 
3135 			iov_len = spdk_min(iov_len, max_segment_size);
3136 			iov_len = spdk_min(iov_len, to_next_boundary_bytes);
3137 			to_next_boundary_bytes -= iov_len;
3138 
3139 			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3140 			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3141 
3142 			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3143 				parent_iov_offset += iov_len;
3144 			} else {
3145 				parent_iovpos++;
3146 				parent_iov_offset = 0;
3147 			}
3148 			child_iovcnt++;
3149 			iovcnt++;
3150 		}
3151 
3152 		if (to_next_boundary_bytes > 0) {
3153 			/* We had to stop this child I/O early because we ran out of
3154 			 * child_iov space or were limited by max_num_segments.
3155 			 * Ensure the iovs to be aligned with block size and
3156 			 * then adjust to_next_boundary before starting the
3157 			 * child I/O.
3158 			 */
3159 			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3160 			       iovcnt == child_iovsize);
3161 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
3162 			if (to_last_block_bytes != 0) {
3163 				uint32_t child_iovpos = child_iovcnt - 1;
3164 				/* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV
3165 				 * so the loop will naturally end
3166 				 */
3167 
3168 				to_last_block_bytes = blocklen - to_last_block_bytes;
3169 				to_next_boundary_bytes += to_last_block_bytes;
3170 				while (to_last_block_bytes > 0 && iovcnt > 0) {
3171 					iov_len = spdk_min(to_last_block_bytes,
3172 							   bdev_io->child_iov[child_iovpos].iov_len);
3173 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3174 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3175 						child_iovpos--;
3176 						if (--iovcnt == 0) {
3177 							/* If the child IO is less than a block size just return.
3178 							 * If the first child IO of any split round is less than
3179 							 * a block size, an error exit.
3180 							 */
3181 							if (bdev_io->u.bdev.split_outstanding == 0) {
3182 								SPDK_ERRLOG("The first child io was less than a block size\n");
3183 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3184 								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
3185 								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
3186 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3187 							}
3188 
3189 							return;
3190 						}
3191 					}
3192 
3193 					to_last_block_bytes -= iov_len;
3194 
3195 					if (parent_iov_offset == 0) {
3196 						parent_iovpos--;
3197 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3198 					}
3199 					parent_iov_offset -= iov_len;
3200 				}
3201 
3202 				assert(to_last_block_bytes == 0);
3203 			}
3204 			to_next_boundary -= to_next_boundary_bytes / blocklen;
3205 		}
3206 
3207 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3208 					  &current_offset, &remaining);
3209 		if (spdk_unlikely(rc)) {
3210 			return;
3211 		}
3212 	}
3213 }
3214 
3215 static void
3216 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3217 {
3218 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3219 	uint32_t num_children_reqs = 0;
3220 	int rc;
3221 
3222 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3223 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3224 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3225 
3226 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3227 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3228 
3229 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3230 					  &offset, &remaining);
3231 		if (spdk_likely(rc == 0)) {
3232 			num_children_reqs++;
3233 		} else {
3234 			return;
3235 		}
3236 	}
3237 }
3238 
3239 static void
3240 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3241 {
3242 	uint64_t offset, write_zeroes_blocks, remaining;
3243 	uint32_t num_children_reqs = 0;
3244 	int rc;
3245 
3246 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3247 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3248 
3249 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3250 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3251 
3252 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3253 					  &offset, &remaining);
3254 		if (spdk_likely(rc == 0)) {
3255 			num_children_reqs++;
3256 		} else {
3257 			return;
3258 		}
3259 	}
3260 }
3261 
3262 static void
3263 bdev_copy_split(struct spdk_bdev_io *bdev_io)
3264 {
3265 	uint64_t offset, copy_blocks, remaining;
3266 	uint32_t num_children_reqs = 0;
3267 	int rc;
3268 
3269 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3270 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3271 
3272 	assert(bdev_io->bdev->max_copy != 0);
3273 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
3274 		copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
3275 
3276 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
3277 					  &offset, &remaining);
3278 		if (spdk_likely(rc == 0)) {
3279 			num_children_reqs++;
3280 		} else {
3281 			return;
3282 		}
3283 	}
3284 }
3285 
3286 static void
3287 parent_bdev_io_complete(void *ctx, int rc)
3288 {
3289 	struct spdk_bdev_io *parent_io = ctx;
3290 
3291 	if (rc) {
3292 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3293 	}
3294 
3295 	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
3296 			       parent_io->internal.caller_ctx);
3297 }
3298 
3299 static void
3300 bdev_io_complete_parent_sequence_cb(void *ctx, int status)
3301 {
3302 	struct spdk_bdev_io *bdev_io = ctx;
3303 
3304 	/* u.bdev.accel_sequence should have already been cleared at this point */
3305 	assert(bdev_io->u.bdev.accel_sequence == NULL);
3306 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
3307 	bdev_io->internal.accel_sequence = NULL;
3308 
3309 	if (spdk_unlikely(status != 0)) {
3310 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
3311 	}
3312 
3313 	parent_bdev_io_complete(bdev_io, status);
3314 }
3315 
3316 static void
3317 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3318 {
3319 	struct spdk_bdev_io *parent_io = cb_arg;
3320 
3321 	spdk_bdev_free_io(bdev_io);
3322 
3323 	if (!success) {
3324 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3325 		/* If any child I/O failed, stop further splitting process. */
3326 		parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
3327 		parent_io->u.bdev.split_remaining_num_blocks = 0;
3328 	}
3329 	parent_io->u.bdev.split_outstanding--;
3330 	if (parent_io->u.bdev.split_outstanding != 0) {
3331 		return;
3332 	}
3333 
3334 	/*
3335 	 * Parent I/O finishes when all blocks are consumed.
3336 	 */
3337 	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
3338 		assert(parent_io->internal.cb != bdev_io_split_done);
3339 		spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx);
3340 		TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link);
3341 
3342 		if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
3343 			if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) {
3344 				bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb);
3345 				return;
3346 			} else if (parent_io->internal.orig_iovcnt != 0 &&
3347 				   !bdev_io_use_accel_sequence(bdev_io)) {
3348 				/* bdev IO will be completed in the callback */
3349 				_bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
3350 				return;
3351 			}
3352 		}
3353 
3354 		parent_bdev_io_complete(parent_io, 0);
3355 		return;
3356 	}
3357 
3358 	/*
3359 	 * Continue with the splitting process.  This function will complete the parent I/O if the
3360 	 * splitting is done.
3361 	 */
3362 	switch (parent_io->type) {
3363 	case SPDK_BDEV_IO_TYPE_READ:
3364 	case SPDK_BDEV_IO_TYPE_WRITE:
3365 		_bdev_rw_split(parent_io);
3366 		break;
3367 	case SPDK_BDEV_IO_TYPE_UNMAP:
3368 		bdev_unmap_split(parent_io);
3369 		break;
3370 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3371 		bdev_write_zeroes_split(parent_io);
3372 		break;
3373 	case SPDK_BDEV_IO_TYPE_COPY:
3374 		bdev_copy_split(parent_io);
3375 		break;
3376 	default:
3377 		assert(false);
3378 		break;
3379 	}
3380 }
3381 
3382 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3383 				     bool success);
3384 
3385 static void
3386 bdev_io_split(struct spdk_bdev_io *bdev_io)
3387 {
3388 	assert(bdev_io_should_split(bdev_io));
3389 
3390 	bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
3391 	bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
3392 	bdev_io->u.bdev.split_outstanding = 0;
3393 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
3394 
3395 	switch (bdev_io->type) {
3396 	case SPDK_BDEV_IO_TYPE_READ:
3397 	case SPDK_BDEV_IO_TYPE_WRITE:
3398 		if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
3399 			_bdev_rw_split(bdev_io);
3400 		} else {
3401 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3402 			spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
3403 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3404 		}
3405 		break;
3406 	case SPDK_BDEV_IO_TYPE_UNMAP:
3407 		bdev_unmap_split(bdev_io);
3408 		break;
3409 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3410 		bdev_write_zeroes_split(bdev_io);
3411 		break;
3412 	case SPDK_BDEV_IO_TYPE_COPY:
3413 		bdev_copy_split(bdev_io);
3414 		break;
3415 	default:
3416 		assert(false);
3417 		break;
3418 	}
3419 }
3420 
3421 static void
3422 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
3423 {
3424 	if (!success) {
3425 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3426 		return;
3427 	}
3428 
3429 	_bdev_rw_split(bdev_io);
3430 }
3431 
3432 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't
3433  *  be inlined, at least on some compilers.
3434  */
3435 static inline void
3436 _bdev_io_submit(void *ctx)
3437 {
3438 	struct spdk_bdev_io *bdev_io = ctx;
3439 	struct spdk_bdev *bdev = bdev_io->bdev;
3440 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3441 
3442 	if (spdk_likely(bdev_ch->flags == 0)) {
3443 		bdev_io_do_submit(bdev_ch, bdev_io);
3444 		return;
3445 	}
3446 
3447 	if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
3448 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3449 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
3450 		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
3451 		    bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) {
3452 			_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3453 		} else {
3454 			TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link);
3455 			bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3456 		}
3457 	} else {
3458 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
3459 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3460 	}
3461 }
3462 
3463 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
3464 
3465 bool
3466 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
3467 {
3468 	if (range1->length == 0 || range2->length == 0) {
3469 		return false;
3470 	}
3471 
3472 	if (range1->offset + range1->length <= range2->offset) {
3473 		return false;
3474 	}
3475 
3476 	if (range2->offset + range2->length <= range1->offset) {
3477 		return false;
3478 	}
3479 
3480 	return true;
3481 }
3482 
3483 static bool
3484 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
3485 {
3486 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3487 	struct lba_range r;
3488 
3489 	switch (bdev_io->type) {
3490 	case SPDK_BDEV_IO_TYPE_NVME_IO:
3491 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3492 		/* Don't try to decode the NVMe command - just assume worst-case and that
3493 		 * it overlaps a locked range.
3494 		 */
3495 		return true;
3496 	case SPDK_BDEV_IO_TYPE_READ:
3497 		if (!range->quiesce) {
3498 			return false;
3499 		}
3500 	/* fallthrough */
3501 	case SPDK_BDEV_IO_TYPE_WRITE:
3502 	case SPDK_BDEV_IO_TYPE_UNMAP:
3503 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3504 	case SPDK_BDEV_IO_TYPE_ZCOPY:
3505 	case SPDK_BDEV_IO_TYPE_COPY:
3506 		r.offset = bdev_io->u.bdev.offset_blocks;
3507 		r.length = bdev_io->u.bdev.num_blocks;
3508 		if (!bdev_lba_range_overlapped(range, &r)) {
3509 			/* This I/O doesn't overlap the specified LBA range. */
3510 			return false;
3511 		} else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
3512 			/* This I/O overlaps, but the I/O is on the same channel that locked this
3513 			 * range, and the caller_ctx is the same as the locked_ctx.  This means
3514 			 * that this I/O is associated with the lock, and is allowed to execute.
3515 			 */
3516 			return false;
3517 		} else {
3518 			return true;
3519 		}
3520 	default:
3521 		return false;
3522 	}
3523 }
3524 
3525 void
3526 bdev_io_submit(struct spdk_bdev_io *bdev_io)
3527 {
3528 	struct spdk_bdev *bdev = bdev_io->bdev;
3529 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3530 
3531 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3532 
3533 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3534 		struct lba_range *range;
3535 
3536 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3537 			if (bdev_io_range_is_locked(bdev_io, range)) {
3538 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3539 				return;
3540 			}
3541 		}
3542 	}
3543 
3544 	TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link);
3545 
3546 	bdev_io->internal.submit_tsc = spdk_get_ticks();
3547 	spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0,
3548 			      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3549 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
3550 			      spdk_bdev_get_name(bdev));
3551 
3552 	if (bdev_io->internal.split) {
3553 		bdev_io_split(bdev_io);
3554 		return;
3555 	}
3556 
3557 	_bdev_io_submit(bdev_io);
3558 }
3559 
3560 static inline void
3561 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3562 {
3563 	/* bdev doesn't support memory domains, thereby buffers in this IO request can't
3564 	 * be accessed directly. It is needed to allocate buffers before issuing IO operation.
3565 	 * For write operation we need to pull buffers from memory domain before submitting IO.
3566 	 * Once read operation completes, we need to use memory_domain push functionality to
3567 	 * update data in original memory domain IO buffer
3568 	 * This IO request will go through a regular IO flow, so clear memory domains pointers */
3569 	bdev_io->u.bdev.memory_domain = NULL;
3570 	bdev_io->u.bdev.memory_domain_ctx = NULL;
3571 	_bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3572 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3573 }
3574 
3575 static inline void
3576 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3577 {
3578 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3579 	bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io);
3580 
3581 	if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3582 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3583 		bdev_io_complete_unsubmitted(bdev_io);
3584 		return;
3585 	}
3586 
3587 	/* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does
3588 	 * support them, but we need to execute an accel sequence and the data buffer is from accel
3589 	 * memory domain (to avoid doing a push/pull from that domain).
3590 	 */
3591 	if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) ||
3592 	    (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) {
3593 		_bdev_io_ext_use_bounce_buffer(bdev_io);
3594 		return;
3595 	}
3596 
3597 	if (needs_exec) {
3598 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3599 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3600 			return;
3601 		}
3602 		/* For reads we'll execute the sequence after the data is read, so, for now, only
3603 		 * clear out accel_sequence pointer and submit the IO */
3604 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3605 		bdev_io->u.bdev.accel_sequence = NULL;
3606 	}
3607 
3608 	bdev_io_submit(bdev_io);
3609 }
3610 
3611 static void
3612 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
3613 {
3614 	struct spdk_bdev *bdev = bdev_io->bdev;
3615 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3616 	struct spdk_io_channel *ch = bdev_ch->channel;
3617 
3618 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3619 
3620 	bdev_io->internal.in_submit_request = true;
3621 	bdev_submit_request(bdev, ch, bdev_io);
3622 	bdev_io->internal.in_submit_request = false;
3623 }
3624 
3625 void
3626 bdev_io_init(struct spdk_bdev_io *bdev_io,
3627 	     struct spdk_bdev *bdev, void *cb_arg,
3628 	     spdk_bdev_io_completion_cb cb)
3629 {
3630 	bdev_io->bdev = bdev;
3631 	bdev_io->internal.caller_ctx = cb_arg;
3632 	bdev_io->internal.cb = cb;
3633 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3634 	bdev_io->internal.in_submit_request = false;
3635 	bdev_io->internal.buf = NULL;
3636 	bdev_io->internal.orig_iovs = NULL;
3637 	bdev_io->internal.orig_iovcnt = 0;
3638 	bdev_io->internal.orig_md_iov.iov_base = NULL;
3639 	bdev_io->internal.error.nvme.cdw0 = 0;
3640 	bdev_io->num_retries = 0;
3641 	bdev_io->internal.get_buf_cb = NULL;
3642 	bdev_io->internal.get_aux_buf_cb = NULL;
3643 	bdev_io->internal.memory_domain = NULL;
3644 	bdev_io->internal.memory_domain_ctx = NULL;
3645 	bdev_io->internal.data_transfer_cpl = NULL;
3646 	bdev_io->internal.split = bdev_io_should_split(bdev_io);
3647 	bdev_io->internal.accel_sequence = NULL;
3648 	bdev_io->internal.has_accel_sequence = false;
3649 }
3650 
3651 static bool
3652 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3653 {
3654 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
3655 }
3656 
3657 bool
3658 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3659 {
3660 	bool supported;
3661 
3662 	supported = bdev_io_type_supported(bdev, io_type);
3663 
3664 	if (!supported) {
3665 		switch (io_type) {
3666 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3667 			/* The bdev layer will emulate write zeroes as long as write is supported. */
3668 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3669 			break;
3670 		default:
3671 			break;
3672 		}
3673 	}
3674 
3675 	return supported;
3676 }
3677 
3678 uint64_t
3679 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3680 {
3681 	return bdev_io->internal.submit_tsc;
3682 }
3683 
3684 int
3685 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3686 {
3687 	if (bdev->fn_table->dump_info_json) {
3688 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3689 	}
3690 
3691 	return 0;
3692 }
3693 
3694 static void
3695 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3696 {
3697 	uint32_t max_per_timeslice = 0;
3698 	int i;
3699 
3700 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3701 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3702 			qos->rate_limits[i].max_per_timeslice = 0;
3703 			continue;
3704 		}
3705 
3706 		max_per_timeslice = qos->rate_limits[i].limit *
3707 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3708 
3709 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3710 							qos->rate_limits[i].min_per_timeslice);
3711 
3712 		__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
3713 				 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
3714 	}
3715 
3716 	bdev_qos_set_ops(qos);
3717 }
3718 
3719 static void
3720 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3721 			   struct spdk_io_channel *io_ch, void *ctx)
3722 {
3723 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3724 	int status;
3725 
3726 	bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3727 
3728 	/* if all IOs were sent then continue the iteration, otherwise - stop it */
3729 	/* TODO: channels round robing */
3730 	status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
3731 
3732 	spdk_bdev_for_each_channel_continue(i, status);
3733 }
3734 
3735 
3736 static void
3737 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
3738 {
3739 
3740 }
3741 
3742 static int
3743 bdev_channel_poll_qos(void *arg)
3744 {
3745 	struct spdk_bdev *bdev = arg;
3746 	struct spdk_bdev_qos *qos = bdev->internal.qos;
3747 	uint64_t now = spdk_get_ticks();
3748 	int i;
3749 	int64_t remaining_last_timeslice;
3750 
3751 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3752 		/* We received our callback earlier than expected - return
3753 		 *  immediately and wait to do accounting until at least one
3754 		 *  timeslice has actually expired.  This should never happen
3755 		 *  with a well-behaved timer implementation.
3756 		 */
3757 		return SPDK_POLLER_IDLE;
3758 	}
3759 
3760 	/* Reset for next round of rate limiting */
3761 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3762 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3763 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3764 		 * here, we'll account for the overrun so that the next timeslice will
3765 		 * be appropriately reduced.
3766 		 */
3767 		remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
3768 					   0, __ATOMIC_RELAXED);
3769 		if (remaining_last_timeslice < 0) {
3770 			/* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
3771 			 * potentially use 2 atomic ops each, so they can intertwine.
3772 			 * This race can potentialy cause the limits to be a little fuzzy but won't cause any real damage.
3773 			 */
3774 			__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
3775 					 remaining_last_timeslice, __ATOMIC_RELAXED);
3776 		}
3777 	}
3778 
3779 	while (now >= (qos->last_timeslice + qos->timeslice_size)) {
3780 		qos->last_timeslice += qos->timeslice_size;
3781 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3782 			__atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice,
3783 					   qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED);
3784 		}
3785 	}
3786 
3787 	spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos,
3788 				   bdev_channel_submit_qos_io_done);
3789 
3790 	return SPDK_POLLER_BUSY;
3791 }
3792 
3793 static void
3794 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
3795 {
3796 	struct spdk_bdev_shared_resource *shared_resource;
3797 	struct lba_range *range;
3798 
3799 	bdev_free_io_stat(ch->stat);
3800 #ifdef SPDK_CONFIG_VTUNE
3801 	bdev_free_io_stat(ch->prev_stat);
3802 #endif
3803 
3804 	while (!TAILQ_EMPTY(&ch->locked_ranges)) {
3805 		range = TAILQ_FIRST(&ch->locked_ranges);
3806 		TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
3807 		free(range);
3808 	}
3809 
3810 	spdk_put_io_channel(ch->channel);
3811 	spdk_put_io_channel(ch->accel_channel);
3812 
3813 	shared_resource = ch->shared_resource;
3814 
3815 	assert(TAILQ_EMPTY(&ch->io_locked));
3816 	assert(TAILQ_EMPTY(&ch->io_submitted));
3817 	assert(TAILQ_EMPTY(&ch->io_accel_exec));
3818 	assert(TAILQ_EMPTY(&ch->io_memory_domain));
3819 	assert(ch->io_outstanding == 0);
3820 	assert(shared_resource->ref > 0);
3821 	shared_resource->ref--;
3822 	if (shared_resource->ref == 0) {
3823 		assert(shared_resource->io_outstanding == 0);
3824 		TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
3825 		spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
3826 		spdk_poller_unregister(&shared_resource->nomem_poller);
3827 		free(shared_resource);
3828 	}
3829 }
3830 
3831 static void
3832 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
3833 {
3834 	struct spdk_bdev_qos	*qos = bdev->internal.qos;
3835 	int			i;
3836 
3837 	assert(spdk_spin_held(&bdev->internal.spinlock));
3838 
3839 	/* Rate limiting on this bdev enabled */
3840 	if (qos) {
3841 		if (qos->ch == NULL) {
3842 			struct spdk_io_channel *io_ch;
3843 
3844 			SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
3845 				      bdev->name, spdk_get_thread());
3846 
3847 			/* No qos channel has been selected, so set one up */
3848 
3849 			/* Take another reference to ch */
3850 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
3851 			assert(io_ch != NULL);
3852 			qos->ch = ch;
3853 
3854 			qos->thread = spdk_io_channel_get_thread(io_ch);
3855 
3856 			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3857 				if (bdev_qos_is_iops_rate_limit(i) == true) {
3858 					qos->rate_limits[i].min_per_timeslice =
3859 						SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
3860 				} else {
3861 					qos->rate_limits[i].min_per_timeslice =
3862 						SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
3863 				}
3864 
3865 				if (qos->rate_limits[i].limit == 0) {
3866 					qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
3867 				}
3868 			}
3869 			bdev_qos_update_max_quota_per_timeslice(qos);
3870 			qos->timeslice_size =
3871 				SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
3872 			qos->last_timeslice = spdk_get_ticks();
3873 			qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
3874 							   bdev,
3875 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
3876 		}
3877 
3878 		ch->flags |= BDEV_CH_QOS_ENABLED;
3879 	}
3880 }
3881 
3882 struct poll_timeout_ctx {
3883 	struct spdk_bdev_desc	*desc;
3884 	uint64_t		timeout_in_sec;
3885 	spdk_bdev_io_timeout_cb	cb_fn;
3886 	void			*cb_arg;
3887 };
3888 
3889 static void
3890 bdev_desc_free(struct spdk_bdev_desc *desc)
3891 {
3892 	spdk_spin_destroy(&desc->spinlock);
3893 	free(desc->media_events_buffer);
3894 	free(desc);
3895 }
3896 
3897 static void
3898 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
3899 {
3900 	struct poll_timeout_ctx *ctx  = _ctx;
3901 	struct spdk_bdev_desc *desc = ctx->desc;
3902 
3903 	free(ctx);
3904 
3905 	spdk_spin_lock(&desc->spinlock);
3906 	desc->refs--;
3907 	if (desc->closed == true && desc->refs == 0) {
3908 		spdk_spin_unlock(&desc->spinlock);
3909 		bdev_desc_free(desc);
3910 		return;
3911 	}
3912 	spdk_spin_unlock(&desc->spinlock);
3913 }
3914 
3915 static void
3916 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3917 			     struct spdk_io_channel *io_ch, void *_ctx)
3918 {
3919 	struct poll_timeout_ctx *ctx  = _ctx;
3920 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3921 	struct spdk_bdev_desc *desc = ctx->desc;
3922 	struct spdk_bdev_io *bdev_io;
3923 	uint64_t now;
3924 
3925 	spdk_spin_lock(&desc->spinlock);
3926 	if (desc->closed == true) {
3927 		spdk_spin_unlock(&desc->spinlock);
3928 		spdk_bdev_for_each_channel_continue(i, -1);
3929 		return;
3930 	}
3931 	spdk_spin_unlock(&desc->spinlock);
3932 
3933 	now = spdk_get_ticks();
3934 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
3935 		/* Exclude any I/O that are generated via splitting. */
3936 		if (bdev_io->internal.cb == bdev_io_split_done) {
3937 			continue;
3938 		}
3939 
3940 		/* Once we find an I/O that has not timed out, we can immediately
3941 		 * exit the loop.
3942 		 */
3943 		if (now < (bdev_io->internal.submit_tsc +
3944 			   ctx->timeout_in_sec * spdk_get_ticks_hz())) {
3945 			goto end;
3946 		}
3947 
3948 		if (bdev_io->internal.desc == desc) {
3949 			ctx->cb_fn(ctx->cb_arg, bdev_io);
3950 		}
3951 	}
3952 
3953 end:
3954 	spdk_bdev_for_each_channel_continue(i, 0);
3955 }
3956 
3957 static int
3958 bdev_poll_timeout_io(void *arg)
3959 {
3960 	struct spdk_bdev_desc *desc = arg;
3961 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
3962 	struct poll_timeout_ctx *ctx;
3963 
3964 	ctx = calloc(1, sizeof(struct poll_timeout_ctx));
3965 	if (!ctx) {
3966 		SPDK_ERRLOG("failed to allocate memory\n");
3967 		return SPDK_POLLER_BUSY;
3968 	}
3969 	ctx->desc = desc;
3970 	ctx->cb_arg = desc->cb_arg;
3971 	ctx->cb_fn = desc->cb_fn;
3972 	ctx->timeout_in_sec = desc->timeout_in_sec;
3973 
3974 	/* Take a ref on the descriptor in case it gets closed while we are checking
3975 	 * all of the channels.
3976 	 */
3977 	spdk_spin_lock(&desc->spinlock);
3978 	desc->refs++;
3979 	spdk_spin_unlock(&desc->spinlock);
3980 
3981 	spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
3982 				   bdev_channel_poll_timeout_io_done);
3983 
3984 	return SPDK_POLLER_BUSY;
3985 }
3986 
3987 int
3988 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
3989 		      spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
3990 {
3991 	assert(desc->thread == spdk_get_thread());
3992 
3993 	spdk_poller_unregister(&desc->io_timeout_poller);
3994 
3995 	if (timeout_in_sec) {
3996 		assert(cb_fn != NULL);
3997 		desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
3998 					  desc,
3999 					  SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
4000 					  1000);
4001 		if (desc->io_timeout_poller == NULL) {
4002 			SPDK_ERRLOG("can not register the desc timeout IO poller\n");
4003 			return -1;
4004 		}
4005 	}
4006 
4007 	desc->cb_fn = cb_fn;
4008 	desc->cb_arg = cb_arg;
4009 	desc->timeout_in_sec = timeout_in_sec;
4010 
4011 	return 0;
4012 }
4013 
4014 static int
4015 bdev_channel_create(void *io_device, void *ctx_buf)
4016 {
4017 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
4018 	struct spdk_bdev_channel	*ch = ctx_buf;
4019 	struct spdk_io_channel		*mgmt_io_ch;
4020 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
4021 	struct spdk_bdev_shared_resource *shared_resource;
4022 	struct lba_range		*range;
4023 
4024 	ch->bdev = bdev;
4025 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
4026 	if (!ch->channel) {
4027 		return -1;
4028 	}
4029 
4030 	ch->accel_channel = spdk_accel_get_io_channel();
4031 	if (!ch->accel_channel) {
4032 		spdk_put_io_channel(ch->channel);
4033 		return -1;
4034 	}
4035 
4036 	spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name,
4037 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4038 
4039 	assert(ch->histogram == NULL);
4040 	if (bdev->internal.histogram_enabled) {
4041 		ch->histogram = spdk_histogram_data_alloc();
4042 		if (ch->histogram == NULL) {
4043 			SPDK_ERRLOG("Could not allocate histogram\n");
4044 		}
4045 	}
4046 
4047 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
4048 	if (!mgmt_io_ch) {
4049 		spdk_put_io_channel(ch->channel);
4050 		spdk_put_io_channel(ch->accel_channel);
4051 		return -1;
4052 	}
4053 
4054 	mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
4055 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
4056 		if (shared_resource->shared_ch == ch->channel) {
4057 			spdk_put_io_channel(mgmt_io_ch);
4058 			shared_resource->ref++;
4059 			break;
4060 		}
4061 	}
4062 
4063 	if (shared_resource == NULL) {
4064 		shared_resource = calloc(1, sizeof(*shared_resource));
4065 		if (shared_resource == NULL) {
4066 			spdk_put_io_channel(ch->channel);
4067 			spdk_put_io_channel(ch->accel_channel);
4068 			spdk_put_io_channel(mgmt_io_ch);
4069 			return -1;
4070 		}
4071 
4072 		shared_resource->mgmt_ch = mgmt_ch;
4073 		shared_resource->io_outstanding = 0;
4074 		TAILQ_INIT(&shared_resource->nomem_io);
4075 		shared_resource->nomem_threshold = 0;
4076 		shared_resource->shared_ch = ch->channel;
4077 		shared_resource->ref = 1;
4078 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
4079 	}
4080 
4081 	ch->io_outstanding = 0;
4082 	TAILQ_INIT(&ch->queued_resets);
4083 	TAILQ_INIT(&ch->locked_ranges);
4084 	TAILQ_INIT(&ch->qos_queued_io);
4085 	ch->flags = 0;
4086 	ch->shared_resource = shared_resource;
4087 
4088 	TAILQ_INIT(&ch->io_submitted);
4089 	TAILQ_INIT(&ch->io_locked);
4090 	TAILQ_INIT(&ch->io_accel_exec);
4091 	TAILQ_INIT(&ch->io_memory_domain);
4092 
4093 	ch->stat = bdev_alloc_io_stat(false);
4094 	if (ch->stat == NULL) {
4095 		bdev_channel_destroy_resource(ch);
4096 		return -1;
4097 	}
4098 
4099 	ch->stat->ticks_rate = spdk_get_ticks_hz();
4100 
4101 #ifdef SPDK_CONFIG_VTUNE
4102 	{
4103 		char *name;
4104 		__itt_init_ittlib(NULL, 0);
4105 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
4106 		if (!name) {
4107 			bdev_channel_destroy_resource(ch);
4108 			return -1;
4109 		}
4110 		ch->handle = __itt_string_handle_create(name);
4111 		free(name);
4112 		ch->start_tsc = spdk_get_ticks();
4113 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
4114 		ch->prev_stat = bdev_alloc_io_stat(false);
4115 		if (ch->prev_stat == NULL) {
4116 			bdev_channel_destroy_resource(ch);
4117 			return -1;
4118 		}
4119 	}
4120 #endif
4121 
4122 	spdk_spin_lock(&bdev->internal.spinlock);
4123 	bdev_enable_qos(bdev, ch);
4124 
4125 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
4126 		struct lba_range *new_range;
4127 
4128 		new_range = calloc(1, sizeof(*new_range));
4129 		if (new_range == NULL) {
4130 			spdk_spin_unlock(&bdev->internal.spinlock);
4131 			bdev_channel_destroy_resource(ch);
4132 			return -1;
4133 		}
4134 		new_range->length = range->length;
4135 		new_range->offset = range->offset;
4136 		new_range->locked_ctx = range->locked_ctx;
4137 		TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
4138 	}
4139 
4140 	spdk_spin_unlock(&bdev->internal.spinlock);
4141 
4142 	return 0;
4143 }
4144 
4145 static int
4146 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
4147 			 void *cb_ctx)
4148 {
4149 	struct spdk_bdev_channel *bdev_ch = cb_ctx;
4150 	struct spdk_bdev_io *bdev_io;
4151 	uint64_t buf_len;
4152 
4153 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4154 	if (bdev_io->internal.ch == bdev_ch) {
4155 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
4156 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4157 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4158 	}
4159 
4160 	return 0;
4161 }
4162 
4163 /*
4164  * Abort I/O that are waiting on a data buffer.
4165  */
4166 static void
4167 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
4168 {
4169 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4170 				  bdev_abort_all_buf_io_cb, ch);
4171 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4172 				  bdev_abort_all_buf_io_cb, ch);
4173 }
4174 
4175 /*
4176  * Abort I/O that are queued waiting for submission.  These types of I/O are
4177  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
4178  */
4179 static void
4180 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
4181 {
4182 	struct spdk_bdev_io *bdev_io, *tmp;
4183 
4184 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
4185 		if (bdev_io->internal.ch == ch) {
4186 			TAILQ_REMOVE(queue, bdev_io, internal.link);
4187 			/*
4188 			 * spdk_bdev_io_complete() assumes that the completed I/O had
4189 			 *  been submitted to the bdev module.  Since in this case it
4190 			 *  hadn't, bump io_outstanding to account for the decrement
4191 			 *  that spdk_bdev_io_complete() will do.
4192 			 */
4193 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
4194 				bdev_io_increment_outstanding(ch, ch->shared_resource);
4195 			}
4196 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4197 		}
4198 	}
4199 }
4200 
4201 static bool
4202 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
4203 {
4204 	struct spdk_bdev_io *bdev_io;
4205 
4206 	TAILQ_FOREACH(bdev_io, queue, internal.link) {
4207 		if (bdev_io == bio_to_abort) {
4208 			TAILQ_REMOVE(queue, bio_to_abort, internal.link);
4209 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4210 			return true;
4211 		}
4212 	}
4213 
4214 	return false;
4215 }
4216 
4217 static int
4218 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
4219 {
4220 	struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
4221 	uint64_t buf_len;
4222 
4223 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4224 	if (bdev_io == bio_to_abort) {
4225 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
4226 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4227 		spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4228 		return 1;
4229 	}
4230 
4231 	return 0;
4232 }
4233 
4234 static bool
4235 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
4236 {
4237 	int rc;
4238 
4239 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4240 				       bdev_abort_buf_io_cb, bio_to_abort);
4241 	if (rc == 1) {
4242 		return true;
4243 	}
4244 
4245 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4246 				       bdev_abort_buf_io_cb, bio_to_abort);
4247 	return rc == 1;
4248 }
4249 
4250 static void
4251 bdev_qos_channel_destroy(void *cb_arg)
4252 {
4253 	struct spdk_bdev_qos *qos = cb_arg;
4254 
4255 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
4256 	spdk_poller_unregister(&qos->poller);
4257 
4258 	SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
4259 
4260 	free(qos);
4261 }
4262 
4263 static int
4264 bdev_qos_destroy(struct spdk_bdev *bdev)
4265 {
4266 	int i;
4267 
4268 	/*
4269 	 * Cleanly shutting down the QoS poller is tricky, because
4270 	 * during the asynchronous operation the user could open
4271 	 * a new descriptor and create a new channel, spawning
4272 	 * a new QoS poller.
4273 	 *
4274 	 * The strategy is to create a new QoS structure here and swap it
4275 	 * in. The shutdown path then continues to refer to the old one
4276 	 * until it completes and then releases it.
4277 	 */
4278 	struct spdk_bdev_qos *new_qos, *old_qos;
4279 
4280 	old_qos = bdev->internal.qos;
4281 
4282 	new_qos = calloc(1, sizeof(*new_qos));
4283 	if (!new_qos) {
4284 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
4285 		return -ENOMEM;
4286 	}
4287 
4288 	/* Copy the old QoS data into the newly allocated structure */
4289 	memcpy(new_qos, old_qos, sizeof(*new_qos));
4290 
4291 	/* Zero out the key parts of the QoS structure */
4292 	new_qos->ch = NULL;
4293 	new_qos->thread = NULL;
4294 	new_qos->poller = NULL;
4295 	/*
4296 	 * The limit member of spdk_bdev_qos_limit structure is not zeroed.
4297 	 * It will be used later for the new QoS structure.
4298 	 */
4299 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4300 		new_qos->rate_limits[i].remaining_this_timeslice = 0;
4301 		new_qos->rate_limits[i].min_per_timeslice = 0;
4302 		new_qos->rate_limits[i].max_per_timeslice = 0;
4303 	}
4304 
4305 	bdev->internal.qos = new_qos;
4306 
4307 	if (old_qos->thread == NULL) {
4308 		free(old_qos);
4309 	} else {
4310 		spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
4311 	}
4312 
4313 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
4314 	 * been destroyed yet. The destruction path will end up waiting for the final
4315 	 * channel to be put before it releases resources. */
4316 
4317 	return 0;
4318 }
4319 
4320 void
4321 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
4322 {
4323 	total->bytes_read += add->bytes_read;
4324 	total->num_read_ops += add->num_read_ops;
4325 	total->bytes_written += add->bytes_written;
4326 	total->num_write_ops += add->num_write_ops;
4327 	total->bytes_unmapped += add->bytes_unmapped;
4328 	total->num_unmap_ops += add->num_unmap_ops;
4329 	total->bytes_copied += add->bytes_copied;
4330 	total->num_copy_ops += add->num_copy_ops;
4331 	total->read_latency_ticks += add->read_latency_ticks;
4332 	total->write_latency_ticks += add->write_latency_ticks;
4333 	total->unmap_latency_ticks += add->unmap_latency_ticks;
4334 	total->copy_latency_ticks += add->copy_latency_ticks;
4335 	if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
4336 		total->max_read_latency_ticks = add->max_read_latency_ticks;
4337 	}
4338 	if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
4339 		total->min_read_latency_ticks = add->min_read_latency_ticks;
4340 	}
4341 	if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
4342 		total->max_write_latency_ticks = add->max_write_latency_ticks;
4343 	}
4344 	if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
4345 		total->min_write_latency_ticks = add->min_write_latency_ticks;
4346 	}
4347 	if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
4348 		total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
4349 	}
4350 	if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
4351 		total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
4352 	}
4353 	if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
4354 		total->max_copy_latency_ticks = add->max_copy_latency_ticks;
4355 	}
4356 	if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
4357 		total->min_copy_latency_ticks = add->min_copy_latency_ticks;
4358 	}
4359 }
4360 
4361 static void
4362 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
4363 {
4364 	memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error));
4365 
4366 	if (to_stat->io_error != NULL && from_stat->io_error != NULL) {
4367 		memcpy(to_stat->io_error, from_stat->io_error,
4368 		       sizeof(struct spdk_bdev_io_error_stat));
4369 	}
4370 }
4371 
4372 void
4373 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode)
4374 {
4375 	stat->max_read_latency_ticks = 0;
4376 	stat->min_read_latency_ticks = UINT64_MAX;
4377 	stat->max_write_latency_ticks = 0;
4378 	stat->min_write_latency_ticks = UINT64_MAX;
4379 	stat->max_unmap_latency_ticks = 0;
4380 	stat->min_unmap_latency_ticks = UINT64_MAX;
4381 	stat->max_copy_latency_ticks = 0;
4382 	stat->min_copy_latency_ticks = UINT64_MAX;
4383 
4384 	if (mode != SPDK_BDEV_RESET_STAT_ALL) {
4385 		return;
4386 	}
4387 
4388 	stat->bytes_read = 0;
4389 	stat->num_read_ops = 0;
4390 	stat->bytes_written = 0;
4391 	stat->num_write_ops = 0;
4392 	stat->bytes_unmapped = 0;
4393 	stat->num_unmap_ops = 0;
4394 	stat->bytes_copied = 0;
4395 	stat->num_copy_ops = 0;
4396 	stat->read_latency_ticks = 0;
4397 	stat->write_latency_ticks = 0;
4398 	stat->unmap_latency_ticks = 0;
4399 	stat->copy_latency_ticks = 0;
4400 
4401 	if (stat->io_error != NULL) {
4402 		memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat));
4403 	}
4404 }
4405 
4406 struct spdk_bdev_io_stat *
4407 bdev_alloc_io_stat(bool io_error_stat)
4408 {
4409 	struct spdk_bdev_io_stat *stat;
4410 
4411 	stat = malloc(sizeof(struct spdk_bdev_io_stat));
4412 	if (stat == NULL) {
4413 		return NULL;
4414 	}
4415 
4416 	if (io_error_stat) {
4417 		stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat));
4418 		if (stat->io_error == NULL) {
4419 			free(stat);
4420 			return NULL;
4421 		}
4422 	} else {
4423 		stat->io_error = NULL;
4424 	}
4425 
4426 	spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL);
4427 
4428 	return stat;
4429 }
4430 
4431 void
4432 bdev_free_io_stat(struct spdk_bdev_io_stat *stat)
4433 {
4434 	if (stat != NULL) {
4435 		free(stat->io_error);
4436 		free(stat);
4437 	}
4438 }
4439 
4440 void
4441 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
4442 {
4443 	int i;
4444 
4445 	spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
4446 	spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
4447 	spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
4448 	spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
4449 	spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
4450 	spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
4451 	spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
4452 	spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
4453 	spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
4454 	spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
4455 	spdk_json_write_named_uint64(w, "min_read_latency_ticks",
4456 				     stat->min_read_latency_ticks != UINT64_MAX ?
4457 				     stat->min_read_latency_ticks : 0);
4458 	spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
4459 	spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
4460 	spdk_json_write_named_uint64(w, "min_write_latency_ticks",
4461 				     stat->min_write_latency_ticks != UINT64_MAX ?
4462 				     stat->min_write_latency_ticks : 0);
4463 	spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
4464 	spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
4465 	spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
4466 				     stat->min_unmap_latency_ticks != UINT64_MAX ?
4467 				     stat->min_unmap_latency_ticks : 0);
4468 	spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
4469 	spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
4470 	spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
4471 				     stat->min_copy_latency_ticks != UINT64_MAX ?
4472 				     stat->min_copy_latency_ticks : 0);
4473 
4474 	if (stat->io_error != NULL) {
4475 		spdk_json_write_named_object_begin(w, "io_error");
4476 		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
4477 			if (stat->io_error->error_status[i] != 0) {
4478 				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
4479 							     stat->io_error->error_status[i]);
4480 			}
4481 		}
4482 		spdk_json_write_object_end(w);
4483 	}
4484 }
4485 
4486 static void
4487 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
4488 {
4489 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4490 	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
4491 
4492 	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
4493 	bdev_abort_all_buf_io(mgmt_ch, ch);
4494 }
4495 
4496 static void
4497 bdev_channel_destroy(void *io_device, void *ctx_buf)
4498 {
4499 	struct spdk_bdev_channel *ch = ctx_buf;
4500 
4501 	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
4502 		      spdk_get_thread());
4503 
4504 	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
4505 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4506 
4507 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
4508 	spdk_spin_lock(&ch->bdev->internal.spinlock);
4509 	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
4510 	spdk_spin_unlock(&ch->bdev->internal.spinlock);
4511 
4512 	bdev_abort_all_queued_io(&ch->queued_resets, ch);
4513 
4514 	bdev_channel_abort_queued_ios(ch);
4515 
4516 	if (ch->histogram) {
4517 		spdk_histogram_data_free(ch->histogram);
4518 	}
4519 
4520 	bdev_channel_destroy_resource(ch);
4521 }
4522 
4523 /*
4524  * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
4525  * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4526  */
4527 static int
4528 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
4529 {
4530 	struct spdk_bdev_name *tmp;
4531 
4532 	bdev_name->name = strdup(name);
4533 	if (bdev_name->name == NULL) {
4534 		SPDK_ERRLOG("Unable to allocate bdev name\n");
4535 		return -ENOMEM;
4536 	}
4537 
4538 	bdev_name->bdev = bdev;
4539 
4540 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4541 	tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4542 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4543 
4544 	if (tmp != NULL) {
4545 		SPDK_ERRLOG("Bdev name %s already exists\n", name);
4546 		free(bdev_name->name);
4547 		return -EEXIST;
4548 	}
4549 
4550 	return 0;
4551 }
4552 
4553 static void
4554 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
4555 {
4556 	RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4557 	free(bdev_name->name);
4558 }
4559 
4560 static void
4561 bdev_name_del(struct spdk_bdev_name *bdev_name)
4562 {
4563 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4564 	bdev_name_del_unsafe(bdev_name);
4565 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4566 }
4567 
4568 int
4569 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
4570 {
4571 	struct spdk_bdev_alias *tmp;
4572 	int ret;
4573 
4574 	if (alias == NULL) {
4575 		SPDK_ERRLOG("Empty alias passed\n");
4576 		return -EINVAL;
4577 	}
4578 
4579 	tmp = calloc(1, sizeof(*tmp));
4580 	if (tmp == NULL) {
4581 		SPDK_ERRLOG("Unable to allocate alias\n");
4582 		return -ENOMEM;
4583 	}
4584 
4585 	ret = bdev_name_add(&tmp->alias, bdev, alias);
4586 	if (ret != 0) {
4587 		free(tmp);
4588 		return ret;
4589 	}
4590 
4591 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
4592 
4593 	return 0;
4594 }
4595 
4596 static int
4597 bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
4598 	       void (*alias_del_fn)(struct spdk_bdev_name *n))
4599 {
4600 	struct spdk_bdev_alias *tmp;
4601 
4602 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
4603 		if (strcmp(alias, tmp->alias.name) == 0) {
4604 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
4605 			alias_del_fn(&tmp->alias);
4606 			free(tmp);
4607 			return 0;
4608 		}
4609 	}
4610 
4611 	return -ENOENT;
4612 }
4613 
4614 int
4615 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
4616 {
4617 	int rc;
4618 
4619 	rc = bdev_alias_del(bdev, alias, bdev_name_del);
4620 	if (rc == -ENOENT) {
4621 		SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
4622 	}
4623 
4624 	return rc;
4625 }
4626 
4627 void
4628 spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
4629 {
4630 	struct spdk_bdev_alias *p, *tmp;
4631 
4632 	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
4633 		TAILQ_REMOVE(&bdev->aliases, p, tailq);
4634 		bdev_name_del(&p->alias);
4635 		free(p);
4636 	}
4637 }
4638 
4639 struct spdk_io_channel *
4640 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
4641 {
4642 	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
4643 }
4644 
4645 void *
4646 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
4647 {
4648 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4649 	void *ctx = NULL;
4650 
4651 	if (bdev->fn_table->get_module_ctx) {
4652 		ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
4653 	}
4654 
4655 	return ctx;
4656 }
4657 
4658 const char *
4659 spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
4660 {
4661 	return bdev->module->name;
4662 }
4663 
4664 const char *
4665 spdk_bdev_get_name(const struct spdk_bdev *bdev)
4666 {
4667 	return bdev->name;
4668 }
4669 
4670 const char *
4671 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
4672 {
4673 	return bdev->product_name;
4674 }
4675 
4676 const struct spdk_bdev_aliases_list *
4677 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
4678 {
4679 	return &bdev->aliases;
4680 }
4681 
4682 uint32_t
4683 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
4684 {
4685 	return bdev->blocklen;
4686 }
4687 
4688 uint32_t
4689 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
4690 {
4691 	return bdev->write_unit_size;
4692 }
4693 
4694 uint64_t
4695 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
4696 {
4697 	return bdev->blockcnt;
4698 }
4699 
4700 const char *
4701 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
4702 {
4703 	return qos_rpc_type[type];
4704 }
4705 
4706 void
4707 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
4708 {
4709 	int i;
4710 
4711 	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
4712 
4713 	spdk_spin_lock(&bdev->internal.spinlock);
4714 	if (bdev->internal.qos) {
4715 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4716 			if (bdev->internal.qos->rate_limits[i].limit !=
4717 			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4718 				limits[i] = bdev->internal.qos->rate_limits[i].limit;
4719 				if (bdev_qos_is_iops_rate_limit(i) == false) {
4720 					/* Change from Byte to Megabyte which is user visible. */
4721 					limits[i] = limits[i] / 1024 / 1024;
4722 				}
4723 			}
4724 		}
4725 	}
4726 	spdk_spin_unlock(&bdev->internal.spinlock);
4727 }
4728 
4729 size_t
4730 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
4731 {
4732 	return 1 << bdev->required_alignment;
4733 }
4734 
4735 uint32_t
4736 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
4737 {
4738 	return bdev->optimal_io_boundary;
4739 }
4740 
4741 bool
4742 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
4743 {
4744 	return bdev->write_cache;
4745 }
4746 
4747 const struct spdk_uuid *
4748 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
4749 {
4750 	return &bdev->uuid;
4751 }
4752 
4753 uint16_t
4754 spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
4755 {
4756 	return bdev->acwu;
4757 }
4758 
4759 uint32_t
4760 spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
4761 {
4762 	return bdev->md_len;
4763 }
4764 
4765 bool
4766 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
4767 {
4768 	return (bdev->md_len != 0) && bdev->md_interleave;
4769 }
4770 
4771 bool
4772 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
4773 {
4774 	return (bdev->md_len != 0) && !bdev->md_interleave;
4775 }
4776 
4777 bool
4778 spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
4779 {
4780 	return bdev->zoned;
4781 }
4782 
4783 uint32_t
4784 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
4785 {
4786 	if (spdk_bdev_is_md_interleaved(bdev)) {
4787 		return bdev->blocklen - bdev->md_len;
4788 	} else {
4789 		return bdev->blocklen;
4790 	}
4791 }
4792 
4793 uint32_t
4794 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
4795 {
4796 	return bdev->phys_blocklen;
4797 }
4798 
4799 static uint32_t
4800 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
4801 {
4802 	if (!spdk_bdev_is_md_interleaved(bdev)) {
4803 		return bdev->blocklen + bdev->md_len;
4804 	} else {
4805 		return bdev->blocklen;
4806 	}
4807 }
4808 
4809 /* We have to use the typedef in the function declaration to appease astyle. */
4810 typedef enum spdk_dif_type spdk_dif_type_t;
4811 
4812 spdk_dif_type_t
4813 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
4814 {
4815 	if (bdev->md_len != 0) {
4816 		return bdev->dif_type;
4817 	} else {
4818 		return SPDK_DIF_DISABLE;
4819 	}
4820 }
4821 
4822 bool
4823 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
4824 {
4825 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
4826 		return bdev->dif_is_head_of_md;
4827 	} else {
4828 		return false;
4829 	}
4830 }
4831 
4832 bool
4833 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
4834 			       enum spdk_dif_check_type check_type)
4835 {
4836 	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
4837 		return false;
4838 	}
4839 
4840 	switch (check_type) {
4841 	case SPDK_DIF_CHECK_TYPE_REFTAG:
4842 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
4843 	case SPDK_DIF_CHECK_TYPE_APPTAG:
4844 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
4845 	case SPDK_DIF_CHECK_TYPE_GUARD:
4846 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
4847 	default:
4848 		return false;
4849 	}
4850 }
4851 
4852 static uint32_t
4853 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes)
4854 {
4855 	uint64_t aligned_length, max_write_blocks;
4856 
4857 	aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1);
4858 	max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev);
4859 	max_write_blocks -= max_write_blocks % bdev->write_unit_size;
4860 
4861 	return max_write_blocks;
4862 }
4863 
4864 uint32_t
4865 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
4866 {
4867 	return bdev->max_copy;
4868 }
4869 
4870 uint64_t
4871 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
4872 {
4873 	return bdev->internal.measured_queue_depth;
4874 }
4875 
4876 uint64_t
4877 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
4878 {
4879 	return bdev->internal.period;
4880 }
4881 
4882 uint64_t
4883 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
4884 {
4885 	return bdev->internal.weighted_io_time;
4886 }
4887 
4888 uint64_t
4889 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
4890 {
4891 	return bdev->internal.io_time;
4892 }
4893 
4894 static void bdev_update_qd_sampling_period(void *ctx);
4895 
4896 static void
4897 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
4898 {
4899 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
4900 
4901 	if (bdev->internal.measured_queue_depth) {
4902 		bdev->internal.io_time += bdev->internal.period;
4903 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
4904 	}
4905 
4906 	bdev->internal.qd_poll_in_progress = false;
4907 
4908 	bdev_update_qd_sampling_period(bdev);
4909 }
4910 
4911 static void
4912 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4913 		       struct spdk_io_channel *io_ch, void *_ctx)
4914 {
4915 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
4916 
4917 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
4918 	spdk_bdev_for_each_channel_continue(i, 0);
4919 }
4920 
4921 static int
4922 bdev_calculate_measured_queue_depth(void *ctx)
4923 {
4924 	struct spdk_bdev *bdev = ctx;
4925 
4926 	bdev->internal.qd_poll_in_progress = true;
4927 	bdev->internal.temporary_queue_depth = 0;
4928 	spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
4929 	return SPDK_POLLER_BUSY;
4930 }
4931 
4932 static void
4933 bdev_update_qd_sampling_period(void *ctx)
4934 {
4935 	struct spdk_bdev *bdev = ctx;
4936 
4937 	if (bdev->internal.period == bdev->internal.new_period) {
4938 		return;
4939 	}
4940 
4941 	if (bdev->internal.qd_poll_in_progress) {
4942 		return;
4943 	}
4944 
4945 	bdev->internal.period = bdev->internal.new_period;
4946 
4947 	spdk_poller_unregister(&bdev->internal.qd_poller);
4948 	if (bdev->internal.period != 0) {
4949 		bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4950 					   bdev, bdev->internal.period);
4951 	} else {
4952 		spdk_bdev_close(bdev->internal.qd_desc);
4953 		bdev->internal.qd_desc = NULL;
4954 	}
4955 }
4956 
4957 static void
4958 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
4959 {
4960 	SPDK_NOTICELOG("Unexpected event type: %d\n", type);
4961 }
4962 
4963 void
4964 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
4965 {
4966 	int rc;
4967 
4968 	if (bdev->internal.new_period == period) {
4969 		return;
4970 	}
4971 
4972 	bdev->internal.new_period = period;
4973 
4974 	if (bdev->internal.qd_desc != NULL) {
4975 		assert(bdev->internal.period != 0);
4976 
4977 		spdk_thread_send_msg(bdev->internal.qd_desc->thread,
4978 				     bdev_update_qd_sampling_period, bdev);
4979 		return;
4980 	}
4981 
4982 	assert(bdev->internal.period == 0);
4983 
4984 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
4985 				NULL, &bdev->internal.qd_desc);
4986 	if (rc != 0) {
4987 		return;
4988 	}
4989 
4990 	bdev->internal.period = period;
4991 	bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4992 				   bdev, period);
4993 }
4994 
4995 struct bdev_get_current_qd_ctx {
4996 	uint64_t current_qd;
4997 	spdk_bdev_get_current_qd_cb cb_fn;
4998 	void *cb_arg;
4999 };
5000 
5001 static void
5002 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
5003 {
5004 	struct bdev_get_current_qd_ctx *ctx = _ctx;
5005 
5006 	ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
5007 
5008 	free(ctx);
5009 }
5010 
5011 static void
5012 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5013 		    struct spdk_io_channel *io_ch, void *_ctx)
5014 {
5015 	struct bdev_get_current_qd_ctx *ctx = _ctx;
5016 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
5017 
5018 	ctx->current_qd += bdev_ch->io_outstanding;
5019 
5020 	spdk_bdev_for_each_channel_continue(i, 0);
5021 }
5022 
5023 void
5024 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
5025 			 void *cb_arg)
5026 {
5027 	struct bdev_get_current_qd_ctx *ctx;
5028 
5029 	assert(cb_fn != NULL);
5030 
5031 	ctx = calloc(1, sizeof(*ctx));
5032 	if (ctx == NULL) {
5033 		cb_fn(bdev, 0, cb_arg, -ENOMEM);
5034 		return;
5035 	}
5036 
5037 	ctx->cb_fn = cb_fn;
5038 	ctx->cb_arg = cb_arg;
5039 
5040 	spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
5041 }
5042 
5043 static void
5044 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type)
5045 {
5046 	assert(desc->thread == spdk_get_thread());
5047 
5048 	spdk_spin_lock(&desc->spinlock);
5049 	desc->refs--;
5050 	if (!desc->closed) {
5051 		spdk_spin_unlock(&desc->spinlock);
5052 		desc->callback.event_fn(type,
5053 					desc->bdev,
5054 					desc->callback.ctx);
5055 		return;
5056 	} else if (desc->refs == 0) {
5057 		/* This descriptor was closed after this event_notify message was sent.
5058 		 * spdk_bdev_close() could not free the descriptor since this message was
5059 		 * in flight, so we free it now using bdev_desc_free().
5060 		 */
5061 		spdk_spin_unlock(&desc->spinlock);
5062 		bdev_desc_free(desc);
5063 		return;
5064 	}
5065 	spdk_spin_unlock(&desc->spinlock);
5066 }
5067 
5068 static void
5069 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn)
5070 {
5071 	spdk_spin_lock(&desc->spinlock);
5072 	desc->refs++;
5073 	spdk_thread_send_msg(desc->thread, event_notify_fn, desc);
5074 	spdk_spin_unlock(&desc->spinlock);
5075 }
5076 
5077 static void
5078 _resize_notify(void *ctx)
5079 {
5080 	struct spdk_bdev_desc *desc = ctx;
5081 
5082 	_event_notify(desc, SPDK_BDEV_EVENT_RESIZE);
5083 }
5084 
5085 int
5086 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
5087 {
5088 	struct spdk_bdev_desc *desc;
5089 	int ret;
5090 
5091 	if (size == bdev->blockcnt) {
5092 		return 0;
5093 	}
5094 
5095 	spdk_spin_lock(&bdev->internal.spinlock);
5096 
5097 	/* bdev has open descriptors */
5098 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
5099 	    bdev->blockcnt > size) {
5100 		ret = -EBUSY;
5101 	} else {
5102 		bdev->blockcnt = size;
5103 		TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
5104 			event_notify(desc, _resize_notify);
5105 		}
5106 		ret = 0;
5107 	}
5108 
5109 	spdk_spin_unlock(&bdev->internal.spinlock);
5110 
5111 	return ret;
5112 }
5113 
5114 /*
5115  * Convert I/O offset and length from bytes to blocks.
5116  *
5117  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
5118  */
5119 static uint64_t
5120 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
5121 		     uint64_t num_bytes, uint64_t *num_blocks)
5122 {
5123 	uint32_t block_size = bdev->blocklen;
5124 	uint8_t shift_cnt;
5125 
5126 	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
5127 	if (spdk_likely(spdk_u32_is_pow2(block_size))) {
5128 		shift_cnt = spdk_u32log2(block_size);
5129 		*offset_blocks = offset_bytes >> shift_cnt;
5130 		*num_blocks = num_bytes >> shift_cnt;
5131 		return (offset_bytes - (*offset_blocks << shift_cnt)) |
5132 		       (num_bytes - (*num_blocks << shift_cnt));
5133 	} else {
5134 		*offset_blocks = offset_bytes / block_size;
5135 		*num_blocks = num_bytes / block_size;
5136 		return (offset_bytes % block_size) | (num_bytes % block_size);
5137 	}
5138 }
5139 
5140 static bool
5141 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
5142 {
5143 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
5144 	 * has been an overflow and hence the offset has been wrapped around */
5145 	if (offset_blocks + num_blocks < offset_blocks) {
5146 		return false;
5147 	}
5148 
5149 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
5150 	if (offset_blocks + num_blocks > bdev->blockcnt) {
5151 		return false;
5152 	}
5153 
5154 	return true;
5155 }
5156 
5157 static void
5158 bdev_seek_complete_cb(void *ctx)
5159 {
5160 	struct spdk_bdev_io *bdev_io = ctx;
5161 
5162 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5163 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
5164 }
5165 
5166 static int
5167 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5168 	  uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
5169 	  spdk_bdev_io_completion_cb cb, void *cb_arg)
5170 {
5171 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5172 	struct spdk_bdev_io *bdev_io;
5173 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5174 
5175 	assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
5176 
5177 	/* Check if offset_blocks is valid looking at the validity of one block */
5178 	if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
5179 		return -EINVAL;
5180 	}
5181 
5182 	bdev_io = bdev_channel_get_io(channel);
5183 	if (!bdev_io) {
5184 		return -ENOMEM;
5185 	}
5186 
5187 	bdev_io->internal.ch = channel;
5188 	bdev_io->internal.desc = desc;
5189 	bdev_io->type = io_type;
5190 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5191 	bdev_io->u.bdev.memory_domain = NULL;
5192 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5193 	bdev_io->u.bdev.accel_sequence = NULL;
5194 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5195 
5196 	if (!spdk_bdev_io_type_supported(bdev, io_type)) {
5197 		/* In case bdev doesn't support seek to next data/hole offset,
5198 		 * it is assumed that only data and no holes are present */
5199 		if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
5200 			bdev_io->u.bdev.seek.offset = offset_blocks;
5201 		} else {
5202 			bdev_io->u.bdev.seek.offset = UINT64_MAX;
5203 		}
5204 
5205 		spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
5206 		return 0;
5207 	}
5208 
5209 	bdev_io_submit(bdev_io);
5210 	return 0;
5211 }
5212 
5213 int
5214 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5215 		    uint64_t offset_blocks,
5216 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5217 {
5218 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
5219 }
5220 
5221 int
5222 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5223 		    uint64_t offset_blocks,
5224 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5225 {
5226 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
5227 }
5228 
5229 uint64_t
5230 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
5231 {
5232 	return bdev_io->u.bdev.seek.offset;
5233 }
5234 
5235 static int
5236 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
5237 			 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5238 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5239 {
5240 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5241 	struct spdk_bdev_io *bdev_io;
5242 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5243 
5244 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5245 		return -EINVAL;
5246 	}
5247 
5248 	bdev_io = bdev_channel_get_io(channel);
5249 	if (!bdev_io) {
5250 		return -ENOMEM;
5251 	}
5252 
5253 	bdev_io->internal.ch = channel;
5254 	bdev_io->internal.desc = desc;
5255 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5256 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5257 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5258 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5259 	bdev_io->u.bdev.iovcnt = 1;
5260 	bdev_io->u.bdev.md_buf = md_buf;
5261 	bdev_io->u.bdev.num_blocks = num_blocks;
5262 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5263 	bdev_io->u.bdev.memory_domain = NULL;
5264 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5265 	bdev_io->u.bdev.accel_sequence = NULL;
5266 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5267 
5268 	bdev_io_submit(bdev_io);
5269 	return 0;
5270 }
5271 
5272 int
5273 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5274 	       void *buf, uint64_t offset, uint64_t nbytes,
5275 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
5276 {
5277 	uint64_t offset_blocks, num_blocks;
5278 
5279 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5280 				 nbytes, &num_blocks) != 0) {
5281 		return -EINVAL;
5282 	}
5283 
5284 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5285 }
5286 
5287 int
5288 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5289 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5290 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5291 {
5292 	return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
5293 }
5294 
5295 int
5296 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5297 			      void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5298 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5299 {
5300 	struct iovec iov = {
5301 		.iov_base = buf,
5302 	};
5303 
5304 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5305 		return -EINVAL;
5306 	}
5307 
5308 	if (md_buf && !_is_buf_allocated(&iov)) {
5309 		return -EINVAL;
5310 	}
5311 
5312 	return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5313 					cb, cb_arg);
5314 }
5315 
5316 int
5317 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5318 		struct iovec *iov, int iovcnt,
5319 		uint64_t offset, uint64_t nbytes,
5320 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5321 {
5322 	uint64_t offset_blocks, num_blocks;
5323 
5324 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5325 				 nbytes, &num_blocks) != 0) {
5326 		return -EINVAL;
5327 	}
5328 
5329 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5330 }
5331 
5332 static int
5333 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5334 			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
5335 			  uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5336 			  struct spdk_accel_sequence *seq,
5337 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5338 {
5339 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5340 	struct spdk_bdev_io *bdev_io;
5341 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5342 
5343 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5344 		return -EINVAL;
5345 	}
5346 
5347 	bdev_io = bdev_channel_get_io(channel);
5348 	if (!bdev_io) {
5349 		return -ENOMEM;
5350 	}
5351 
5352 	bdev_io->internal.ch = channel;
5353 	bdev_io->internal.desc = desc;
5354 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5355 	bdev_io->u.bdev.iovs = iov;
5356 	bdev_io->u.bdev.iovcnt = iovcnt;
5357 	bdev_io->u.bdev.md_buf = md_buf;
5358 	bdev_io->u.bdev.num_blocks = num_blocks;
5359 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5360 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5361 	bdev_io->internal.memory_domain = domain;
5362 	bdev_io->internal.memory_domain_ctx = domain_ctx;
5363 	bdev_io->internal.accel_sequence = seq;
5364 	bdev_io->internal.has_accel_sequence = seq != NULL;
5365 	bdev_io->u.bdev.memory_domain = domain;
5366 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5367 	bdev_io->u.bdev.accel_sequence = seq;
5368 
5369 	_bdev_io_submit_ext(desc, bdev_io);
5370 
5371 	return 0;
5372 }
5373 
5374 int
5375 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5376 		       struct iovec *iov, int iovcnt,
5377 		       uint64_t offset_blocks, uint64_t num_blocks,
5378 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5379 {
5380 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5381 					 num_blocks, NULL, NULL, NULL, cb, cb_arg);
5382 }
5383 
5384 int
5385 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5386 			       struct iovec *iov, int iovcnt, void *md_buf,
5387 			       uint64_t offset_blocks, uint64_t num_blocks,
5388 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5389 {
5390 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5391 		return -EINVAL;
5392 	}
5393 
5394 	if (md_buf && !_is_buf_allocated(iov)) {
5395 		return -EINVAL;
5396 	}
5397 
5398 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5399 					 num_blocks, NULL, NULL, NULL, cb, cb_arg);
5400 }
5401 
5402 static inline bool
5403 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5404 {
5405 	/*
5406 	 * We check if opts size is at least of size when we first introduced
5407 	 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members
5408 	 * are not checked internal.
5409 	 */
5410 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5411 	       sizeof(opts->metadata) &&
5412 	       opts->size <= sizeof(*opts) &&
5413 	       /* When memory domain is used, the user must provide data buffers */
5414 	       (!opts->memory_domain || (iov && iov[0].iov_base));
5415 }
5416 
5417 int
5418 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5419 			   struct iovec *iov, int iovcnt,
5420 			   uint64_t offset_blocks, uint64_t num_blocks,
5421 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
5422 			   struct spdk_bdev_ext_io_opts *opts)
5423 {
5424 	void *md = NULL;
5425 
5426 	if (opts) {
5427 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5428 			return -EINVAL;
5429 		}
5430 		md = opts->metadata;
5431 	}
5432 
5433 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5434 		return -EINVAL;
5435 	}
5436 
5437 	if (md && !_is_buf_allocated(iov)) {
5438 		return -EINVAL;
5439 	}
5440 
5441 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5442 					 num_blocks,
5443 					 bdev_get_ext_io_opt(opts, memory_domain, NULL),
5444 					 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL),
5445 					 bdev_get_ext_io_opt(opts, accel_sequence, NULL),
5446 					 cb, cb_arg);
5447 }
5448 
5449 static int
5450 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5451 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5452 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5453 {
5454 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5455 	struct spdk_bdev_io *bdev_io;
5456 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5457 
5458 	if (!desc->write) {
5459 		return -EBADF;
5460 	}
5461 
5462 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5463 		return -EINVAL;
5464 	}
5465 
5466 	bdev_io = bdev_channel_get_io(channel);
5467 	if (!bdev_io) {
5468 		return -ENOMEM;
5469 	}
5470 
5471 	bdev_io->internal.ch = channel;
5472 	bdev_io->internal.desc = desc;
5473 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5474 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5475 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5476 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5477 	bdev_io->u.bdev.iovcnt = 1;
5478 	bdev_io->u.bdev.md_buf = md_buf;
5479 	bdev_io->u.bdev.num_blocks = num_blocks;
5480 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5481 	bdev_io->u.bdev.memory_domain = NULL;
5482 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5483 	bdev_io->u.bdev.accel_sequence = NULL;
5484 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5485 
5486 	bdev_io_submit(bdev_io);
5487 	return 0;
5488 }
5489 
5490 int
5491 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5492 		void *buf, uint64_t offset, uint64_t nbytes,
5493 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5494 {
5495 	uint64_t offset_blocks, num_blocks;
5496 
5497 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5498 				 nbytes, &num_blocks) != 0) {
5499 		return -EINVAL;
5500 	}
5501 
5502 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5503 }
5504 
5505 int
5506 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5507 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5508 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5509 {
5510 	return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5511 					 cb, cb_arg);
5512 }
5513 
5514 int
5515 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5516 			       void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5517 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5518 {
5519 	struct iovec iov = {
5520 		.iov_base = buf,
5521 	};
5522 
5523 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5524 		return -EINVAL;
5525 	}
5526 
5527 	if (md_buf && !_is_buf_allocated(&iov)) {
5528 		return -EINVAL;
5529 	}
5530 
5531 	return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5532 					 cb, cb_arg);
5533 }
5534 
5535 static int
5536 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5537 			   struct iovec *iov, int iovcnt, void *md_buf,
5538 			   uint64_t offset_blocks, uint64_t num_blocks,
5539 			   struct spdk_memory_domain *domain, void *domain_ctx,
5540 			   struct spdk_accel_sequence *seq,
5541 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
5542 {
5543 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5544 	struct spdk_bdev_io *bdev_io;
5545 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5546 
5547 	if (!desc->write) {
5548 		return -EBADF;
5549 	}
5550 
5551 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5552 		return -EINVAL;
5553 	}
5554 
5555 	bdev_io = bdev_channel_get_io(channel);
5556 	if (!bdev_io) {
5557 		return -ENOMEM;
5558 	}
5559 
5560 	bdev_io->internal.ch = channel;
5561 	bdev_io->internal.desc = desc;
5562 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5563 	bdev_io->u.bdev.iovs = iov;
5564 	bdev_io->u.bdev.iovcnt = iovcnt;
5565 	bdev_io->u.bdev.md_buf = md_buf;
5566 	bdev_io->u.bdev.num_blocks = num_blocks;
5567 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5568 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5569 	bdev_io->internal.memory_domain = domain;
5570 	bdev_io->internal.memory_domain_ctx = domain_ctx;
5571 	bdev_io->internal.accel_sequence = seq;
5572 	bdev_io->internal.has_accel_sequence = seq != NULL;
5573 	bdev_io->u.bdev.memory_domain = domain;
5574 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5575 	bdev_io->u.bdev.accel_sequence = seq;
5576 
5577 	_bdev_io_submit_ext(desc, bdev_io);
5578 
5579 	return 0;
5580 }
5581 
5582 int
5583 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5584 		 struct iovec *iov, int iovcnt,
5585 		 uint64_t offset, uint64_t len,
5586 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
5587 {
5588 	uint64_t offset_blocks, num_blocks;
5589 
5590 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5591 				 len, &num_blocks) != 0) {
5592 		return -EINVAL;
5593 	}
5594 
5595 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5596 }
5597 
5598 int
5599 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5600 			struct iovec *iov, int iovcnt,
5601 			uint64_t offset_blocks, uint64_t num_blocks,
5602 			spdk_bdev_io_completion_cb cb, void *cb_arg)
5603 {
5604 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5605 					  num_blocks, NULL, NULL, NULL, cb, cb_arg);
5606 }
5607 
5608 int
5609 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5610 				struct iovec *iov, int iovcnt, void *md_buf,
5611 				uint64_t offset_blocks, uint64_t num_blocks,
5612 				spdk_bdev_io_completion_cb cb, void *cb_arg)
5613 {
5614 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5615 		return -EINVAL;
5616 	}
5617 
5618 	if (md_buf && !_is_buf_allocated(iov)) {
5619 		return -EINVAL;
5620 	}
5621 
5622 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5623 					  num_blocks, NULL, NULL, NULL, cb, cb_arg);
5624 }
5625 
5626 int
5627 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5628 			    struct iovec *iov, int iovcnt,
5629 			    uint64_t offset_blocks, uint64_t num_blocks,
5630 			    spdk_bdev_io_completion_cb cb, void *cb_arg,
5631 			    struct spdk_bdev_ext_io_opts *opts)
5632 {
5633 	void *md = NULL;
5634 
5635 	if (opts) {
5636 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5637 			return -EINVAL;
5638 		}
5639 		md = opts->metadata;
5640 	}
5641 
5642 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5643 		return -EINVAL;
5644 	}
5645 
5646 	if (md && !_is_buf_allocated(iov)) {
5647 		return -EINVAL;
5648 	}
5649 
5650 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks,
5651 					  bdev_get_ext_io_opt(opts, memory_domain, NULL),
5652 					  bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL),
5653 					  bdev_get_ext_io_opt(opts, accel_sequence, NULL),
5654 					  cb, cb_arg);
5655 }
5656 
5657 static void
5658 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5659 {
5660 	struct spdk_bdev_io *parent_io = cb_arg;
5661 	struct spdk_bdev *bdev = parent_io->bdev;
5662 	uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
5663 	int i, rc = 0;
5664 
5665 	if (!success) {
5666 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5667 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5668 		spdk_bdev_free_io(bdev_io);
5669 		return;
5670 	}
5671 
5672 	for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
5673 		rc = memcmp(read_buf,
5674 			    parent_io->u.bdev.iovs[i].iov_base,
5675 			    parent_io->u.bdev.iovs[i].iov_len);
5676 		if (rc) {
5677 			break;
5678 		}
5679 		read_buf += parent_io->u.bdev.iovs[i].iov_len;
5680 	}
5681 
5682 	if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
5683 		rc = memcmp(bdev_io->u.bdev.md_buf,
5684 			    parent_io->u.bdev.md_buf,
5685 			    spdk_bdev_get_md_size(bdev));
5686 	}
5687 
5688 	spdk_bdev_free_io(bdev_io);
5689 
5690 	if (rc == 0) {
5691 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5692 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
5693 	} else {
5694 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
5695 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5696 	}
5697 }
5698 
5699 static void
5700 bdev_compare_do_read(void *_bdev_io)
5701 {
5702 	struct spdk_bdev_io *bdev_io = _bdev_io;
5703 	int rc;
5704 
5705 	rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
5706 				   spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
5707 				   bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5708 				   bdev_compare_do_read_done, bdev_io);
5709 
5710 	if (rc == -ENOMEM) {
5711 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
5712 	} else if (rc != 0) {
5713 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5714 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5715 	}
5716 }
5717 
5718 static int
5719 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5720 			     struct iovec *iov, int iovcnt, void *md_buf,
5721 			     uint64_t offset_blocks, uint64_t num_blocks,
5722 			     spdk_bdev_io_completion_cb cb, void *cb_arg)
5723 {
5724 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5725 	struct spdk_bdev_io *bdev_io;
5726 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5727 
5728 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5729 		return -EINVAL;
5730 	}
5731 
5732 	bdev_io = bdev_channel_get_io(channel);
5733 	if (!bdev_io) {
5734 		return -ENOMEM;
5735 	}
5736 
5737 	bdev_io->internal.ch = channel;
5738 	bdev_io->internal.desc = desc;
5739 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5740 	bdev_io->u.bdev.iovs = iov;
5741 	bdev_io->u.bdev.iovcnt = iovcnt;
5742 	bdev_io->u.bdev.md_buf = md_buf;
5743 	bdev_io->u.bdev.num_blocks = num_blocks;
5744 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5745 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5746 	bdev_io->u.bdev.memory_domain = NULL;
5747 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5748 	bdev_io->u.bdev.accel_sequence = NULL;
5749 
5750 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5751 		bdev_io_submit(bdev_io);
5752 		return 0;
5753 	}
5754 
5755 	bdev_compare_do_read(bdev_io);
5756 
5757 	return 0;
5758 }
5759 
5760 int
5761 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5762 			  struct iovec *iov, int iovcnt,
5763 			  uint64_t offset_blocks, uint64_t num_blocks,
5764 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5765 {
5766 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5767 					    num_blocks, cb, cb_arg);
5768 }
5769 
5770 int
5771 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5772 				  struct iovec *iov, int iovcnt, void *md_buf,
5773 				  uint64_t offset_blocks, uint64_t num_blocks,
5774 				  spdk_bdev_io_completion_cb cb, void *cb_arg)
5775 {
5776 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5777 		return -EINVAL;
5778 	}
5779 
5780 	if (md_buf && !_is_buf_allocated(iov)) {
5781 		return -EINVAL;
5782 	}
5783 
5784 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5785 					    num_blocks, cb, cb_arg);
5786 }
5787 
5788 static int
5789 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5790 			    void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5791 			    spdk_bdev_io_completion_cb cb, void *cb_arg)
5792 {
5793 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5794 	struct spdk_bdev_io *bdev_io;
5795 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5796 
5797 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5798 		return -EINVAL;
5799 	}
5800 
5801 	bdev_io = bdev_channel_get_io(channel);
5802 	if (!bdev_io) {
5803 		return -ENOMEM;
5804 	}
5805 
5806 	bdev_io->internal.ch = channel;
5807 	bdev_io->internal.desc = desc;
5808 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5809 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5810 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5811 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5812 	bdev_io->u.bdev.iovcnt = 1;
5813 	bdev_io->u.bdev.md_buf = md_buf;
5814 	bdev_io->u.bdev.num_blocks = num_blocks;
5815 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5816 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5817 	bdev_io->u.bdev.memory_domain = NULL;
5818 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5819 	bdev_io->u.bdev.accel_sequence = NULL;
5820 
5821 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5822 		bdev_io_submit(bdev_io);
5823 		return 0;
5824 	}
5825 
5826 	bdev_compare_do_read(bdev_io);
5827 
5828 	return 0;
5829 }
5830 
5831 int
5832 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5833 			 void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5834 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5835 {
5836 	return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5837 					   cb, cb_arg);
5838 }
5839 
5840 int
5841 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5842 				 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5843 				 spdk_bdev_io_completion_cb cb, void *cb_arg)
5844 {
5845 	struct iovec iov = {
5846 		.iov_base = buf,
5847 	};
5848 
5849 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5850 		return -EINVAL;
5851 	}
5852 
5853 	if (md_buf && !_is_buf_allocated(&iov)) {
5854 		return -EINVAL;
5855 	}
5856 
5857 	return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5858 					   cb, cb_arg);
5859 }
5860 
5861 static void
5862 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status)
5863 {
5864 	struct spdk_bdev_io *bdev_io = ctx;
5865 
5866 	if (unlock_status) {
5867 		SPDK_ERRLOG("LBA range unlock failed\n");
5868 	}
5869 
5870 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
5871 			     false, bdev_io->internal.caller_ctx);
5872 }
5873 
5874 static void
5875 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
5876 {
5877 	bdev_io->internal.status = status;
5878 
5879 	bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
5880 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5881 			      bdev_comparev_and_writev_blocks_unlocked, bdev_io);
5882 }
5883 
5884 static void
5885 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5886 {
5887 	struct spdk_bdev_io *parent_io = cb_arg;
5888 
5889 	if (!success) {
5890 		SPDK_ERRLOG("Compare and write operation failed\n");
5891 	}
5892 
5893 	spdk_bdev_free_io(bdev_io);
5894 
5895 	bdev_comparev_and_writev_blocks_unlock(parent_io,
5896 					       success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
5897 }
5898 
5899 static void
5900 bdev_compare_and_write_do_write(void *_bdev_io)
5901 {
5902 	struct spdk_bdev_io *bdev_io = _bdev_io;
5903 	int rc;
5904 
5905 	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
5906 				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
5907 				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
5908 				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5909 				     bdev_compare_and_write_do_write_done, bdev_io);
5910 
5911 
5912 	if (rc == -ENOMEM) {
5913 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
5914 	} else if (rc != 0) {
5915 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
5916 	}
5917 }
5918 
5919 static void
5920 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5921 {
5922 	struct spdk_bdev_io *parent_io = cb_arg;
5923 
5924 	spdk_bdev_free_io(bdev_io);
5925 
5926 	if (!success) {
5927 		bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
5928 		return;
5929 	}
5930 
5931 	bdev_compare_and_write_do_write(parent_io);
5932 }
5933 
5934 static void
5935 bdev_compare_and_write_do_compare(void *_bdev_io)
5936 {
5937 	struct spdk_bdev_io *bdev_io = _bdev_io;
5938 	int rc;
5939 
5940 	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
5941 				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
5942 				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5943 				       bdev_compare_and_write_do_compare_done, bdev_io);
5944 
5945 	if (rc == -ENOMEM) {
5946 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
5947 	} else if (rc != 0) {
5948 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
5949 	}
5950 }
5951 
5952 static void
5953 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status)
5954 {
5955 	struct spdk_bdev_io *bdev_io = ctx;
5956 
5957 	if (status) {
5958 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
5959 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5960 		return;
5961 	}
5962 
5963 	bdev_compare_and_write_do_compare(bdev_io);
5964 }
5965 
5966 int
5967 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5968 				     struct iovec *compare_iov, int compare_iovcnt,
5969 				     struct iovec *write_iov, int write_iovcnt,
5970 				     uint64_t offset_blocks, uint64_t num_blocks,
5971 				     spdk_bdev_io_completion_cb cb, void *cb_arg)
5972 {
5973 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5974 	struct spdk_bdev_io *bdev_io;
5975 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5976 
5977 	if (!desc->write) {
5978 		return -EBADF;
5979 	}
5980 
5981 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5982 		return -EINVAL;
5983 	}
5984 
5985 	if (num_blocks > bdev->acwu) {
5986 		return -EINVAL;
5987 	}
5988 
5989 	bdev_io = bdev_channel_get_io(channel);
5990 	if (!bdev_io) {
5991 		return -ENOMEM;
5992 	}
5993 
5994 	bdev_io->internal.ch = channel;
5995 	bdev_io->internal.desc = desc;
5996 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
5997 	bdev_io->u.bdev.iovs = compare_iov;
5998 	bdev_io->u.bdev.iovcnt = compare_iovcnt;
5999 	bdev_io->u.bdev.fused_iovs = write_iov;
6000 	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
6001 	bdev_io->u.bdev.md_buf = NULL;
6002 	bdev_io->u.bdev.num_blocks = num_blocks;
6003 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6004 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6005 	bdev_io->u.bdev.memory_domain = NULL;
6006 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6007 	bdev_io->u.bdev.accel_sequence = NULL;
6008 
6009 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
6010 		bdev_io_submit(bdev_io);
6011 		return 0;
6012 	}
6013 
6014 	return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
6015 				   bdev_comparev_and_writev_blocks_locked, bdev_io);
6016 }
6017 
6018 int
6019 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6020 		      struct iovec *iov, int iovcnt,
6021 		      uint64_t offset_blocks, uint64_t num_blocks,
6022 		      bool populate,
6023 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
6024 {
6025 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6026 	struct spdk_bdev_io *bdev_io;
6027 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6028 
6029 	if (!desc->write) {
6030 		return -EBADF;
6031 	}
6032 
6033 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6034 		return -EINVAL;
6035 	}
6036 
6037 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
6038 		return -ENOTSUP;
6039 	}
6040 
6041 	bdev_io = bdev_channel_get_io(channel);
6042 	if (!bdev_io) {
6043 		return -ENOMEM;
6044 	}
6045 
6046 	bdev_io->internal.ch = channel;
6047 	bdev_io->internal.desc = desc;
6048 	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
6049 	bdev_io->u.bdev.num_blocks = num_blocks;
6050 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6051 	bdev_io->u.bdev.iovs = iov;
6052 	bdev_io->u.bdev.iovcnt = iovcnt;
6053 	bdev_io->u.bdev.md_buf = NULL;
6054 	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
6055 	bdev_io->u.bdev.zcopy.commit = 0;
6056 	bdev_io->u.bdev.zcopy.start = 1;
6057 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6058 	bdev_io->u.bdev.memory_domain = NULL;
6059 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6060 	bdev_io->u.bdev.accel_sequence = NULL;
6061 
6062 	bdev_io_submit(bdev_io);
6063 
6064 	return 0;
6065 }
6066 
6067 int
6068 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
6069 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
6070 {
6071 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
6072 		return -EINVAL;
6073 	}
6074 
6075 	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
6076 	bdev_io->u.bdev.zcopy.start = 0;
6077 	bdev_io->internal.caller_ctx = cb_arg;
6078 	bdev_io->internal.cb = cb;
6079 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6080 
6081 	bdev_io_submit(bdev_io);
6082 
6083 	return 0;
6084 }
6085 
6086 int
6087 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6088 		       uint64_t offset, uint64_t len,
6089 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6090 {
6091 	uint64_t offset_blocks, num_blocks;
6092 
6093 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6094 				 len, &num_blocks) != 0) {
6095 		return -EINVAL;
6096 	}
6097 
6098 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6099 }
6100 
6101 int
6102 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6103 			      uint64_t offset_blocks, uint64_t num_blocks,
6104 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6105 {
6106 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6107 	struct spdk_bdev_io *bdev_io;
6108 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6109 
6110 	if (!desc->write) {
6111 		return -EBADF;
6112 	}
6113 
6114 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6115 		return -EINVAL;
6116 	}
6117 
6118 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6119 	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6120 		return -ENOTSUP;
6121 	}
6122 
6123 	bdev_io = bdev_channel_get_io(channel);
6124 
6125 	if (!bdev_io) {
6126 		return -ENOMEM;
6127 	}
6128 
6129 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6130 	bdev_io->internal.ch = channel;
6131 	bdev_io->internal.desc = desc;
6132 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6133 	bdev_io->u.bdev.num_blocks = num_blocks;
6134 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6135 	bdev_io->u.bdev.memory_domain = NULL;
6136 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6137 	bdev_io->u.bdev.accel_sequence = NULL;
6138 
6139 	/* If the write_zeroes size is large and should be split, use the generic split
6140 	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not.
6141 	 *
6142 	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6143 	 * or emulate it using regular write request otherwise.
6144 	 */
6145 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) ||
6146 	    bdev_io->internal.split) {
6147 		bdev_io_submit(bdev_io);
6148 		return 0;
6149 	}
6150 
6151 	assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
6152 
6153 	return bdev_write_zero_buffer(bdev_io);
6154 }
6155 
6156 int
6157 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6158 		uint64_t offset, uint64_t nbytes,
6159 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6160 {
6161 	uint64_t offset_blocks, num_blocks;
6162 
6163 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6164 				 nbytes, &num_blocks) != 0) {
6165 		return -EINVAL;
6166 	}
6167 
6168 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6169 }
6170 
6171 int
6172 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6173 		       uint64_t offset_blocks, uint64_t num_blocks,
6174 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6175 {
6176 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6177 	struct spdk_bdev_io *bdev_io;
6178 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6179 
6180 	if (!desc->write) {
6181 		return -EBADF;
6182 	}
6183 
6184 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6185 		return -EINVAL;
6186 	}
6187 
6188 	if (num_blocks == 0) {
6189 		SPDK_ERRLOG("Can't unmap 0 bytes\n");
6190 		return -EINVAL;
6191 	}
6192 
6193 	bdev_io = bdev_channel_get_io(channel);
6194 	if (!bdev_io) {
6195 		return -ENOMEM;
6196 	}
6197 
6198 	bdev_io->internal.ch = channel;
6199 	bdev_io->internal.desc = desc;
6200 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
6201 
6202 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6203 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
6204 	bdev_io->u.bdev.iovs[0].iov_len = 0;
6205 	bdev_io->u.bdev.iovcnt = 1;
6206 
6207 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6208 	bdev_io->u.bdev.num_blocks = num_blocks;
6209 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6210 	bdev_io->u.bdev.memory_domain = NULL;
6211 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6212 	bdev_io->u.bdev.accel_sequence = NULL;
6213 
6214 	bdev_io_submit(bdev_io);
6215 	return 0;
6216 }
6217 
6218 int
6219 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6220 		uint64_t offset, uint64_t length,
6221 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6222 {
6223 	uint64_t offset_blocks, num_blocks;
6224 
6225 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6226 				 length, &num_blocks) != 0) {
6227 		return -EINVAL;
6228 	}
6229 
6230 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6231 }
6232 
6233 int
6234 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6235 		       uint64_t offset_blocks, uint64_t num_blocks,
6236 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6237 {
6238 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6239 	struct spdk_bdev_io *bdev_io;
6240 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6241 
6242 	if (!desc->write) {
6243 		return -EBADF;
6244 	}
6245 
6246 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6247 		return -EINVAL;
6248 	}
6249 
6250 	bdev_io = bdev_channel_get_io(channel);
6251 	if (!bdev_io) {
6252 		return -ENOMEM;
6253 	}
6254 
6255 	bdev_io->internal.ch = channel;
6256 	bdev_io->internal.desc = desc;
6257 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6258 	bdev_io->u.bdev.iovs = NULL;
6259 	bdev_io->u.bdev.iovcnt = 0;
6260 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6261 	bdev_io->u.bdev.num_blocks = num_blocks;
6262 	bdev_io->u.bdev.memory_domain = NULL;
6263 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6264 	bdev_io->u.bdev.accel_sequence = NULL;
6265 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6266 
6267 	bdev_io_submit(bdev_io);
6268 	return 0;
6269 }
6270 
6271 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6272 
6273 static void
6274 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6275 {
6276 	struct spdk_bdev_channel *ch = _ctx;
6277 	struct spdk_bdev_io *bdev_io;
6278 
6279 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6280 
6281 	if (status == -EBUSY) {
6282 		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6283 			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6284 							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6285 		} else {
6286 			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6287 
6288 			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6289 				/* If outstanding IOs are still present and reset_io_drain_timeout
6290 				 * seconds passed, start the reset. */
6291 				bdev_io_submit_reset(bdev_io);
6292 			} else {
6293 				/* We still have in progress memory domain pull/push or we're
6294 				 * executing accel sequence.  Since we cannot abort either of those
6295 				 * operaions, fail the reset request. */
6296 				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6297 			}
6298 		}
6299 	} else {
6300 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6301 		SPDK_DEBUGLOG(bdev,
6302 			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6303 			      ch->bdev->name);
6304 		/* Mark the completion status as a SUCCESS and complete the reset. */
6305 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6306 	}
6307 }
6308 
6309 static void
6310 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6311 				struct spdk_io_channel *io_ch, void *_ctx)
6312 {
6313 	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6314 	int status = 0;
6315 
6316 	if (cur_ch->io_outstanding > 0 ||
6317 	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6318 	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6319 		/* If a channel has outstanding IO, set status to -EBUSY code. This will stop
6320 		 * further iteration over the rest of the channels and pass non-zero status
6321 		 * to the callback function. */
6322 		status = -EBUSY;
6323 	}
6324 	spdk_bdev_for_each_channel_continue(i, status);
6325 }
6326 
6327 static int
6328 bdev_reset_poll_for_outstanding_io(void *ctx)
6329 {
6330 	struct spdk_bdev_channel *ch = ctx;
6331 	struct spdk_bdev_io *bdev_io;
6332 
6333 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6334 
6335 	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
6336 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6337 				   bdev_reset_check_outstanding_io_done);
6338 
6339 	return SPDK_POLLER_BUSY;
6340 }
6341 
6342 static void
6343 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
6344 {
6345 	struct spdk_bdev_channel *ch = _ctx;
6346 	struct spdk_bdev_io *bdev_io;
6347 
6348 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6349 
6350 	if (bdev->reset_io_drain_timeout == 0) {
6351 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6352 
6353 		bdev_io_submit_reset(bdev_io);
6354 		return;
6355 	}
6356 
6357 	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
6358 			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
6359 
6360 	/* In case bdev->reset_io_drain_timeout is not equal to zero,
6361 	 * submit the reset to the underlying module only if outstanding I/O
6362 	 * remain after reset_io_drain_timeout seconds have passed. */
6363 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6364 				   bdev_reset_check_outstanding_io_done);
6365 }
6366 
6367 static void
6368 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6369 			  struct spdk_io_channel *ch, void *_ctx)
6370 {
6371 	struct spdk_bdev_channel	*channel;
6372 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
6373 	struct spdk_bdev_shared_resource *shared_resource;
6374 	bdev_io_tailq_t			tmp_queued;
6375 
6376 	TAILQ_INIT(&tmp_queued);
6377 
6378 	channel = __io_ch_to_bdev_ch(ch);
6379 	shared_resource = channel->shared_resource;
6380 	mgmt_channel = shared_resource->mgmt_ch;
6381 
6382 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
6383 
6384 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
6385 		TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link);
6386 	}
6387 
6388 	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
6389 	bdev_abort_all_buf_io(mgmt_channel, channel);
6390 	bdev_abort_all_queued_io(&tmp_queued, channel);
6391 
6392 	spdk_bdev_for_each_channel_continue(i, 0);
6393 }
6394 
6395 static void
6396 bdev_start_reset(void *ctx)
6397 {
6398 	struct spdk_bdev_channel *ch = ctx;
6399 
6400 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
6401 				   bdev_reset_freeze_channel_done);
6402 }
6403 
6404 static void
6405 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
6406 {
6407 	struct spdk_bdev *bdev = ch->bdev;
6408 
6409 	assert(!TAILQ_EMPTY(&ch->queued_resets));
6410 
6411 	spdk_spin_lock(&bdev->internal.spinlock);
6412 	if (bdev->internal.reset_in_progress == NULL) {
6413 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
6414 		/*
6415 		 * Take a channel reference for the target bdev for the life of this
6416 		 *  reset.  This guards against the channel getting destroyed while
6417 		 *  spdk_bdev_for_each_channel() calls related to this reset IO are in
6418 		 *  progress.  We will release the reference when this reset is
6419 		 *  completed.
6420 		 */
6421 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
6422 		bdev_start_reset(ch);
6423 	}
6424 	spdk_spin_unlock(&bdev->internal.spinlock);
6425 }
6426 
6427 int
6428 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6429 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6430 {
6431 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6432 	struct spdk_bdev_io *bdev_io;
6433 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6434 
6435 	bdev_io = bdev_channel_get_io(channel);
6436 	if (!bdev_io) {
6437 		return -ENOMEM;
6438 	}
6439 
6440 	bdev_io->internal.ch = channel;
6441 	bdev_io->internal.desc = desc;
6442 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6443 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
6444 	bdev_io->u.reset.ch_ref = NULL;
6445 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6446 
6447 	spdk_spin_lock(&bdev->internal.spinlock);
6448 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
6449 	spdk_spin_unlock(&bdev->internal.spinlock);
6450 
6451 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
6452 			  internal.ch_link);
6453 
6454 	bdev_channel_start_reset(channel);
6455 
6456 	return 0;
6457 }
6458 
6459 void
6460 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6461 		      struct spdk_bdev_io_stat *stat)
6462 {
6463 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6464 
6465 	bdev_get_io_stat(stat, channel->stat);
6466 }
6467 
6468 static void
6469 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6470 {
6471 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6472 
6473 	bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
6474 			    bdev_iostat_ctx->cb_arg, 0);
6475 	free(bdev_iostat_ctx);
6476 }
6477 
6478 static void
6479 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6480 			   struct spdk_io_channel *ch, void *_ctx)
6481 {
6482 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6483 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6484 
6485 	spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat);
6486 	spdk_bdev_for_each_channel_continue(i, 0);
6487 }
6488 
6489 void
6490 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
6491 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
6492 {
6493 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
6494 
6495 	assert(bdev != NULL);
6496 	assert(stat != NULL);
6497 	assert(cb != NULL);
6498 
6499 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
6500 	if (bdev_iostat_ctx == NULL) {
6501 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
6502 		cb(bdev, stat, cb_arg, -ENOMEM);
6503 		return;
6504 	}
6505 
6506 	bdev_iostat_ctx->stat = stat;
6507 	bdev_iostat_ctx->cb = cb;
6508 	bdev_iostat_ctx->cb_arg = cb_arg;
6509 
6510 	/* Start with the statistics from previously deleted channels. */
6511 	spdk_spin_lock(&bdev->internal.spinlock);
6512 	bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat);
6513 	spdk_spin_unlock(&bdev->internal.spinlock);
6514 
6515 	/* Then iterate and add the statistics from each existing channel. */
6516 	spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
6517 				   bdev_get_device_stat_done);
6518 }
6519 
6520 struct bdev_iostat_reset_ctx {
6521 	enum spdk_bdev_reset_stat_mode mode;
6522 	bdev_reset_device_stat_cb cb;
6523 	void *cb_arg;
6524 };
6525 
6526 static void
6527 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6528 {
6529 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6530 
6531 	ctx->cb(bdev, ctx->cb_arg, 0);
6532 
6533 	free(ctx);
6534 }
6535 
6536 static void
6537 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6538 			     struct spdk_io_channel *ch, void *_ctx)
6539 {
6540 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6541 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6542 
6543 	spdk_bdev_reset_io_stat(channel->stat, ctx->mode);
6544 
6545 	spdk_bdev_for_each_channel_continue(i, 0);
6546 }
6547 
6548 void
6549 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode,
6550 		       bdev_reset_device_stat_cb cb, void *cb_arg)
6551 {
6552 	struct bdev_iostat_reset_ctx *ctx;
6553 
6554 	assert(bdev != NULL);
6555 	assert(cb != NULL);
6556 
6557 	ctx = calloc(1, sizeof(*ctx));
6558 	if (ctx == NULL) {
6559 		SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
6560 		cb(bdev, cb_arg, -ENOMEM);
6561 		return;
6562 	}
6563 
6564 	ctx->mode = mode;
6565 	ctx->cb = cb;
6566 	ctx->cb_arg = cb_arg;
6567 
6568 	spdk_spin_lock(&bdev->internal.spinlock);
6569 	spdk_bdev_reset_io_stat(bdev->internal.stat, mode);
6570 	spdk_spin_unlock(&bdev->internal.spinlock);
6571 
6572 	spdk_bdev_for_each_channel(bdev,
6573 				   bdev_reset_each_channel_stat,
6574 				   ctx,
6575 				   bdev_reset_device_stat_done);
6576 }
6577 
6578 int
6579 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6580 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6581 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6582 {
6583 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6584 	struct spdk_bdev_io *bdev_io;
6585 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6586 
6587 	if (!desc->write) {
6588 		return -EBADF;
6589 	}
6590 
6591 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
6592 		return -ENOTSUP;
6593 	}
6594 
6595 	bdev_io = bdev_channel_get_io(channel);
6596 	if (!bdev_io) {
6597 		return -ENOMEM;
6598 	}
6599 
6600 	bdev_io->internal.ch = channel;
6601 	bdev_io->internal.desc = desc;
6602 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
6603 	bdev_io->u.nvme_passthru.cmd = *cmd;
6604 	bdev_io->u.nvme_passthru.buf = buf;
6605 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6606 	bdev_io->u.nvme_passthru.md_buf = NULL;
6607 	bdev_io->u.nvme_passthru.md_len = 0;
6608 
6609 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6610 
6611 	bdev_io_submit(bdev_io);
6612 	return 0;
6613 }
6614 
6615 int
6616 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6617 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6618 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
6619 {
6620 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6621 	struct spdk_bdev_io *bdev_io;
6622 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6623 
6624 	if (!desc->write) {
6625 		/*
6626 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6627 		 *  to easily determine if the command is a read or write, but for now just
6628 		 *  do not allow io_passthru with a read-only descriptor.
6629 		 */
6630 		return -EBADF;
6631 	}
6632 
6633 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
6634 		return -ENOTSUP;
6635 	}
6636 
6637 	bdev_io = bdev_channel_get_io(channel);
6638 	if (!bdev_io) {
6639 		return -ENOMEM;
6640 	}
6641 
6642 	bdev_io->internal.ch = channel;
6643 	bdev_io->internal.desc = desc;
6644 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
6645 	bdev_io->u.nvme_passthru.cmd = *cmd;
6646 	bdev_io->u.nvme_passthru.buf = buf;
6647 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6648 	bdev_io->u.nvme_passthru.md_buf = NULL;
6649 	bdev_io->u.nvme_passthru.md_len = 0;
6650 
6651 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6652 
6653 	bdev_io_submit(bdev_io);
6654 	return 0;
6655 }
6656 
6657 int
6658 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6659 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
6660 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6661 {
6662 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6663 	struct spdk_bdev_io *bdev_io;
6664 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6665 
6666 	if (!desc->write) {
6667 		/*
6668 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6669 		 *  to easily determine if the command is a read or write, but for now just
6670 		 *  do not allow io_passthru with a read-only descriptor.
6671 		 */
6672 		return -EBADF;
6673 	}
6674 
6675 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
6676 		return -ENOTSUP;
6677 	}
6678 
6679 	bdev_io = bdev_channel_get_io(channel);
6680 	if (!bdev_io) {
6681 		return -ENOMEM;
6682 	}
6683 
6684 	bdev_io->internal.ch = channel;
6685 	bdev_io->internal.desc = desc;
6686 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
6687 	bdev_io->u.nvme_passthru.cmd = *cmd;
6688 	bdev_io->u.nvme_passthru.buf = buf;
6689 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6690 	bdev_io->u.nvme_passthru.md_buf = md_buf;
6691 	bdev_io->u.nvme_passthru.md_len = md_len;
6692 
6693 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6694 
6695 	bdev_io_submit(bdev_io);
6696 	return 0;
6697 }
6698 
6699 int
6700 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc,
6701 			       struct spdk_io_channel *ch,
6702 			       const struct spdk_nvme_cmd *cmd,
6703 			       struct iovec *iov, int iovcnt, size_t nbytes,
6704 			       void *md_buf, size_t md_len,
6705 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
6706 {
6707 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6708 	struct spdk_bdev_io *bdev_io;
6709 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6710 
6711 	if (!desc->write) {
6712 		/*
6713 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6714 		 * to easily determine if the command is a read or write, but for now just
6715 		 * do not allow io_passthru with a read-only descriptor.
6716 		 */
6717 		return -EBADF;
6718 	}
6719 
6720 	if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
6721 		return -ENOTSUP;
6722 	} else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
6723 		return -ENOTSUP;
6724 	}
6725 
6726 	bdev_io = bdev_channel_get_io(channel);
6727 	if (!bdev_io) {
6728 		return -ENOMEM;
6729 	}
6730 
6731 	bdev_io->internal.ch = channel;
6732 	bdev_io->internal.desc = desc;
6733 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD;
6734 	bdev_io->u.nvme_passthru.cmd = *cmd;
6735 	bdev_io->u.nvme_passthru.iovs = iov;
6736 	bdev_io->u.nvme_passthru.iovcnt = iovcnt;
6737 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6738 	bdev_io->u.nvme_passthru.md_buf = md_buf;
6739 	bdev_io->u.nvme_passthru.md_len = md_len;
6740 
6741 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6742 
6743 	bdev_io_submit(bdev_io);
6744 	return 0;
6745 }
6746 
6747 static void bdev_abort_retry(void *ctx);
6748 static void bdev_abort(struct spdk_bdev_io *parent_io);
6749 
6750 static void
6751 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6752 {
6753 	struct spdk_bdev_channel *channel = bdev_io->internal.ch;
6754 	struct spdk_bdev_io *parent_io = cb_arg;
6755 	struct spdk_bdev_io *bio_to_abort, *tmp_io;
6756 
6757 	bio_to_abort = bdev_io->u.abort.bio_to_abort;
6758 
6759 	spdk_bdev_free_io(bdev_io);
6760 
6761 	if (!success) {
6762 		/* Check if the target I/O completed in the meantime. */
6763 		TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
6764 			if (tmp_io == bio_to_abort) {
6765 				break;
6766 			}
6767 		}
6768 
6769 		/* If the target I/O still exists, set the parent to failed. */
6770 		if (tmp_io != NULL) {
6771 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6772 		}
6773 	}
6774 
6775 	parent_io->u.bdev.split_outstanding--;
6776 	if (parent_io->u.bdev.split_outstanding == 0) {
6777 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6778 			bdev_abort_retry(parent_io);
6779 		} else {
6780 			bdev_io_complete(parent_io);
6781 		}
6782 	}
6783 }
6784 
6785 static int
6786 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
6787 	      struct spdk_bdev_io *bio_to_abort,
6788 	      spdk_bdev_io_completion_cb cb, void *cb_arg)
6789 {
6790 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6791 	struct spdk_bdev_io *bdev_io;
6792 
6793 	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
6794 	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
6795 		/* TODO: Abort reset or abort request. */
6796 		return -ENOTSUP;
6797 	}
6798 
6799 	bdev_io = bdev_channel_get_io(channel);
6800 	if (bdev_io == NULL) {
6801 		return -ENOMEM;
6802 	}
6803 
6804 	bdev_io->internal.ch = channel;
6805 	bdev_io->internal.desc = desc;
6806 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6807 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6808 
6809 	if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) {
6810 		assert(bdev_io_should_split(bio_to_abort));
6811 		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
6812 
6813 		/* Parent abort request is not submitted directly, but to manage its
6814 		 * execution add it to the submitted list here.
6815 		 */
6816 		bdev_io->internal.submit_tsc = spdk_get_ticks();
6817 		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
6818 
6819 		bdev_abort(bdev_io);
6820 
6821 		return 0;
6822 	}
6823 
6824 	bdev_io->u.abort.bio_to_abort = bio_to_abort;
6825 
6826 	/* Submit the abort request to the underlying bdev module. */
6827 	bdev_io_submit(bdev_io);
6828 
6829 	return 0;
6830 }
6831 
6832 static bool
6833 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
6834 {
6835 	struct spdk_bdev_io *iter;
6836 
6837 	TAILQ_FOREACH(iter, tailq, internal.link) {
6838 		if (iter == bdev_io) {
6839 			return true;
6840 		}
6841 	}
6842 
6843 	return false;
6844 }
6845 
6846 static uint32_t
6847 _bdev_abort(struct spdk_bdev_io *parent_io)
6848 {
6849 	struct spdk_bdev_desc *desc = parent_io->internal.desc;
6850 	struct spdk_bdev_channel *channel = parent_io->internal.ch;
6851 	void *bio_cb_arg;
6852 	struct spdk_bdev_io *bio_to_abort;
6853 	uint32_t matched_ios;
6854 	int rc;
6855 
6856 	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
6857 
6858 	/* matched_ios is returned and will be kept by the caller.
6859 	 *
6860 	 * This function will be used for two cases, 1) the same cb_arg is used for
6861 	 * multiple I/Os, 2) a single large I/O is split into smaller ones.
6862 	 * Incrementing split_outstanding directly here may confuse readers especially
6863 	 * for the 1st case.
6864 	 *
6865 	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
6866 	 * works as expected.
6867 	 */
6868 	matched_ios = 0;
6869 	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6870 
6871 	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
6872 		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
6873 			continue;
6874 		}
6875 
6876 		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
6877 			/* Any I/O which was submitted after this abort command should be excluded. */
6878 			continue;
6879 		}
6880 
6881 		/* We can't abort a request that's being pushed/pulled or executed by accel */
6882 		if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
6883 		    bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
6884 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6885 			break;
6886 		}
6887 
6888 		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
6889 		if (rc != 0) {
6890 			if (rc == -ENOMEM) {
6891 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
6892 			} else {
6893 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6894 			}
6895 			break;
6896 		}
6897 		matched_ios++;
6898 	}
6899 
6900 	return matched_ios;
6901 }
6902 
6903 static void
6904 bdev_abort_retry(void *ctx)
6905 {
6906 	struct spdk_bdev_io *parent_io = ctx;
6907 	uint32_t matched_ios;
6908 
6909 	matched_ios = _bdev_abort(parent_io);
6910 
6911 	if (matched_ios == 0) {
6912 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6913 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6914 		} else {
6915 			/* For retry, the case that no target I/O was found is success
6916 			 * because it means target I/Os completed in the meantime.
6917 			 */
6918 			bdev_io_complete(parent_io);
6919 		}
6920 		return;
6921 	}
6922 
6923 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6924 	parent_io->u.bdev.split_outstanding = matched_ios;
6925 }
6926 
6927 static void
6928 bdev_abort(struct spdk_bdev_io *parent_io)
6929 {
6930 	uint32_t matched_ios;
6931 
6932 	matched_ios = _bdev_abort(parent_io);
6933 
6934 	if (matched_ios == 0) {
6935 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6936 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6937 		} else {
6938 			/* The case the no target I/O was found is failure. */
6939 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6940 			bdev_io_complete(parent_io);
6941 		}
6942 		return;
6943 	}
6944 
6945 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6946 	parent_io->u.bdev.split_outstanding = matched_ios;
6947 }
6948 
6949 int
6950 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6951 		void *bio_cb_arg,
6952 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6953 {
6954 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6955 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6956 	struct spdk_bdev_io *bdev_io;
6957 
6958 	if (bio_cb_arg == NULL) {
6959 		return -EINVAL;
6960 	}
6961 
6962 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
6963 		return -ENOTSUP;
6964 	}
6965 
6966 	bdev_io = bdev_channel_get_io(channel);
6967 	if (bdev_io == NULL) {
6968 		return -ENOMEM;
6969 	}
6970 
6971 	bdev_io->internal.ch = channel;
6972 	bdev_io->internal.desc = desc;
6973 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6974 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6975 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6976 
6977 	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
6978 
6979 	/* Parent abort request is not submitted directly, but to manage its execution,
6980 	 * add it to the submitted list here.
6981 	 */
6982 	TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
6983 
6984 	bdev_abort(bdev_io);
6985 
6986 	return 0;
6987 }
6988 
6989 int
6990 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6991 			struct spdk_bdev_io_wait_entry *entry)
6992 {
6993 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6994 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
6995 
6996 	if (bdev != entry->bdev) {
6997 		SPDK_ERRLOG("bdevs do not match\n");
6998 		return -EINVAL;
6999 	}
7000 
7001 	if (mgmt_ch->per_thread_cache_count > 0) {
7002 		SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
7003 		return -EINVAL;
7004 	}
7005 
7006 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
7007 	return 0;
7008 }
7009 
7010 static inline void
7011 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
7012 {
7013 	enum spdk_bdev_io_status io_status = bdev_io->internal.status;
7014 	struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
7015 	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
7016 	uint32_t blocklen = bdev_io->bdev->blocklen;
7017 
7018 	if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7019 		switch (bdev_io->type) {
7020 		case SPDK_BDEV_IO_TYPE_READ:
7021 			io_stat->bytes_read += num_blocks * blocklen;
7022 			io_stat->num_read_ops++;
7023 			io_stat->read_latency_ticks += tsc_diff;
7024 			if (io_stat->max_read_latency_ticks < tsc_diff) {
7025 				io_stat->max_read_latency_ticks = tsc_diff;
7026 			}
7027 			if (io_stat->min_read_latency_ticks > tsc_diff) {
7028 				io_stat->min_read_latency_ticks = tsc_diff;
7029 			}
7030 			break;
7031 		case SPDK_BDEV_IO_TYPE_WRITE:
7032 			io_stat->bytes_written += num_blocks * blocklen;
7033 			io_stat->num_write_ops++;
7034 			io_stat->write_latency_ticks += tsc_diff;
7035 			if (io_stat->max_write_latency_ticks < tsc_diff) {
7036 				io_stat->max_write_latency_ticks = tsc_diff;
7037 			}
7038 			if (io_stat->min_write_latency_ticks > tsc_diff) {
7039 				io_stat->min_write_latency_ticks = tsc_diff;
7040 			}
7041 			break;
7042 		case SPDK_BDEV_IO_TYPE_UNMAP:
7043 			io_stat->bytes_unmapped += num_blocks * blocklen;
7044 			io_stat->num_unmap_ops++;
7045 			io_stat->unmap_latency_ticks += tsc_diff;
7046 			if (io_stat->max_unmap_latency_ticks < tsc_diff) {
7047 				io_stat->max_unmap_latency_ticks = tsc_diff;
7048 			}
7049 			if (io_stat->min_unmap_latency_ticks > tsc_diff) {
7050 				io_stat->min_unmap_latency_ticks = tsc_diff;
7051 			}
7052 			break;
7053 		case SPDK_BDEV_IO_TYPE_ZCOPY:
7054 			/* Track the data in the start phase only */
7055 			if (bdev_io->u.bdev.zcopy.start) {
7056 				if (bdev_io->u.bdev.zcopy.populate) {
7057 					io_stat->bytes_read += num_blocks * blocklen;
7058 					io_stat->num_read_ops++;
7059 					io_stat->read_latency_ticks += tsc_diff;
7060 					if (io_stat->max_read_latency_ticks < tsc_diff) {
7061 						io_stat->max_read_latency_ticks = tsc_diff;
7062 					}
7063 					if (io_stat->min_read_latency_ticks > tsc_diff) {
7064 						io_stat->min_read_latency_ticks = tsc_diff;
7065 					}
7066 				} else {
7067 					io_stat->bytes_written += num_blocks * blocklen;
7068 					io_stat->num_write_ops++;
7069 					io_stat->write_latency_ticks += tsc_diff;
7070 					if (io_stat->max_write_latency_ticks < tsc_diff) {
7071 						io_stat->max_write_latency_ticks = tsc_diff;
7072 					}
7073 					if (io_stat->min_write_latency_ticks > tsc_diff) {
7074 						io_stat->min_write_latency_ticks = tsc_diff;
7075 					}
7076 				}
7077 			}
7078 			break;
7079 		case SPDK_BDEV_IO_TYPE_COPY:
7080 			io_stat->bytes_copied += num_blocks * blocklen;
7081 			io_stat->num_copy_ops++;
7082 			bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff;
7083 			if (io_stat->max_copy_latency_ticks < tsc_diff) {
7084 				io_stat->max_copy_latency_ticks = tsc_diff;
7085 			}
7086 			if (io_stat->min_copy_latency_ticks > tsc_diff) {
7087 				io_stat->min_copy_latency_ticks = tsc_diff;
7088 			}
7089 			break;
7090 		default:
7091 			break;
7092 		}
7093 	} else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) {
7094 		io_stat = bdev_io->bdev->internal.stat;
7095 		assert(io_stat->io_error != NULL);
7096 
7097 		spdk_spin_lock(&bdev_io->bdev->internal.spinlock);
7098 		io_stat->io_error->error_status[-io_status - 1]++;
7099 		spdk_spin_unlock(&bdev_io->bdev->internal.spinlock);
7100 	}
7101 
7102 #ifdef SPDK_CONFIG_VTUNE
7103 	uint64_t now_tsc = spdk_get_ticks();
7104 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
7105 		uint64_t data[5];
7106 		struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
7107 
7108 		data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
7109 		data[1] = io_stat->bytes_read - prev_stat->bytes_read;
7110 		data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
7111 		data[3] = io_stat->bytes_written - prev_stat->bytes_written;
7112 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
7113 			  bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
7114 
7115 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
7116 				   __itt_metadata_u64, 5, data);
7117 
7118 		memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
7119 		bdev_io->internal.ch->start_tsc = now_tsc;
7120 	}
7121 #endif
7122 }
7123 
7124 static inline void
7125 _bdev_io_complete(void *ctx)
7126 {
7127 	struct spdk_bdev_io *bdev_io = ctx;
7128 
7129 	if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) {
7130 		assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7131 		spdk_accel_sequence_abort(bdev_io->internal.accel_sequence);
7132 	}
7133 
7134 	assert(bdev_io->internal.cb != NULL);
7135 	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
7136 
7137 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
7138 			     bdev_io->internal.caller_ctx);
7139 }
7140 
7141 static inline void
7142 bdev_io_complete(void *ctx)
7143 {
7144 	struct spdk_bdev_io *bdev_io = ctx;
7145 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7146 	uint64_t tsc, tsc_diff;
7147 
7148 	if (spdk_unlikely(bdev_io->internal.in_submit_request)) {
7149 		/*
7150 		 * Defer completion to avoid potential infinite recursion if the
7151 		 * user's completion callback issues a new I/O.
7152 		 */
7153 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7154 				     bdev_io_complete, bdev_io);
7155 		return;
7156 	}
7157 
7158 	tsc = spdk_get_ticks();
7159 	tsc_diff = tsc - bdev_io->internal.submit_tsc;
7160 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io,
7161 			      bdev_io->internal.caller_ctx);
7162 
7163 	TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link);
7164 
7165 	if (bdev_io->internal.ch->histogram) {
7166 		spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff);
7167 	}
7168 
7169 	bdev_io_update_io_stat(bdev_io, tsc_diff);
7170 	_bdev_io_complete(bdev_io);
7171 }
7172 
7173 /* The difference between this function and bdev_io_complete() is that this should be called to
7174  * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the
7175  * io_submitted list and don't have submit_tsc updated.
7176  */
7177 static inline void
7178 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)
7179 {
7180 	/* Since the IO hasn't been submitted it's bound to be failed */
7181 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7182 
7183 	/* At this point we don't know if the IO is completed from submission context or not, but,
7184 	 * since this is an error path, we can always do an spdk_thread_send_msg(). */
7185 	spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7186 			     _bdev_io_complete, bdev_io);
7187 }
7188 
7189 static void bdev_destroy_cb(void *io_device);
7190 
7191 static void
7192 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
7193 {
7194 	struct spdk_bdev_io *bdev_io = _ctx;
7195 
7196 	if (bdev_io->u.reset.ch_ref != NULL) {
7197 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
7198 		bdev_io->u.reset.ch_ref = NULL;
7199 	}
7200 
7201 	bdev_io_complete(bdev_io);
7202 
7203 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
7204 	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
7205 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7206 	}
7207 }
7208 
7209 static void
7210 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7211 		      struct spdk_io_channel *_ch, void *_ctx)
7212 {
7213 	struct spdk_bdev_io *bdev_io = _ctx;
7214 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7215 	struct spdk_bdev_io *queued_reset;
7216 
7217 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
7218 	while (!TAILQ_EMPTY(&ch->queued_resets)) {
7219 		queued_reset = TAILQ_FIRST(&ch->queued_resets);
7220 		TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
7221 		spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
7222 	}
7223 
7224 	spdk_bdev_for_each_channel_continue(i, 0);
7225 }
7226 
7227 static void
7228 bdev_io_complete_sequence_cb(void *ctx, int status)
7229 {
7230 	struct spdk_bdev_io *bdev_io = ctx;
7231 
7232 	/* u.bdev.accel_sequence should have already been cleared at this point */
7233 	assert(bdev_io->u.bdev.accel_sequence == NULL);
7234 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
7235 	bdev_io->internal.accel_sequence = NULL;
7236 
7237 	if (spdk_unlikely(status != 0)) {
7238 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
7239 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7240 	}
7241 
7242 	bdev_io_complete(bdev_io);
7243 }
7244 
7245 void
7246 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
7247 {
7248 	struct spdk_bdev *bdev = bdev_io->bdev;
7249 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7250 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
7251 
7252 	if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) {
7253 		SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n",
7254 			    spdk_bdev_get_module_name(bdev),
7255 			    bdev_io_status_get_string(bdev_io->internal.status));
7256 		assert(false);
7257 	}
7258 	bdev_io->internal.status = status;
7259 
7260 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
7261 		bool unlock_channels = false;
7262 
7263 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
7264 			SPDK_ERRLOG("NOMEM returned for reset\n");
7265 		}
7266 		spdk_spin_lock(&bdev->internal.spinlock);
7267 		if (bdev_io == bdev->internal.reset_in_progress) {
7268 			bdev->internal.reset_in_progress = NULL;
7269 			unlock_channels = true;
7270 		}
7271 		spdk_spin_unlock(&bdev->internal.spinlock);
7272 
7273 		if (unlock_channels) {
7274 			spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
7275 						   bdev_reset_complete);
7276 			return;
7277 		}
7278 	} else {
7279 		bdev_io_decrement_outstanding(bdev_ch, shared_resource);
7280 		if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7281 			if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
7282 				bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
7283 				return;
7284 			} else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 &&
7285 						 !bdev_io_use_accel_sequence(bdev_io))) {
7286 				_bdev_io_push_bounce_data_buffer(bdev_io,
7287 								 _bdev_io_complete_push_bounce_done);
7288 				/* bdev IO will be completed in the callback */
7289 				return;
7290 			}
7291 		}
7292 
7293 		if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) {
7294 			return;
7295 		}
7296 	}
7297 
7298 	bdev_io_complete(bdev_io);
7299 }
7300 
7301 void
7302 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
7303 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
7304 {
7305 	enum spdk_bdev_io_status status;
7306 
7307 	if (sc == SPDK_SCSI_STATUS_GOOD) {
7308 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7309 	} else {
7310 		status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
7311 		bdev_io->internal.error.scsi.sc = sc;
7312 		bdev_io->internal.error.scsi.sk = sk;
7313 		bdev_io->internal.error.scsi.asc = asc;
7314 		bdev_io->internal.error.scsi.ascq = ascq;
7315 	}
7316 
7317 	spdk_bdev_io_complete(bdev_io, status);
7318 }
7319 
7320 void
7321 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
7322 			     int *sc, int *sk, int *asc, int *ascq)
7323 {
7324 	assert(sc != NULL);
7325 	assert(sk != NULL);
7326 	assert(asc != NULL);
7327 	assert(ascq != NULL);
7328 
7329 	switch (bdev_io->internal.status) {
7330 	case SPDK_BDEV_IO_STATUS_SUCCESS:
7331 		*sc = SPDK_SCSI_STATUS_GOOD;
7332 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
7333 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7334 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7335 		break;
7336 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7337 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
7338 		break;
7339 	case SPDK_BDEV_IO_STATUS_MISCOMPARE:
7340 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7341 		*sk = SPDK_SCSI_SENSE_MISCOMPARE;
7342 		*asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
7343 		*ascq = bdev_io->internal.error.scsi.ascq;
7344 		break;
7345 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7346 		*sc = bdev_io->internal.error.scsi.sc;
7347 		*sk = bdev_io->internal.error.scsi.sk;
7348 		*asc = bdev_io->internal.error.scsi.asc;
7349 		*ascq = bdev_io->internal.error.scsi.ascq;
7350 		break;
7351 	default:
7352 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7353 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
7354 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7355 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7356 		break;
7357 	}
7358 }
7359 
7360 void
7361 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
7362 {
7363 	enum spdk_bdev_io_status status;
7364 
7365 	if (aio_result == 0) {
7366 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7367 	} else {
7368 		status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
7369 	}
7370 
7371 	bdev_io->internal.error.aio_result = aio_result;
7372 
7373 	spdk_bdev_io_complete(bdev_io, status);
7374 }
7375 
7376 void
7377 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
7378 {
7379 	assert(aio_result != NULL);
7380 
7381 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
7382 		*aio_result = bdev_io->internal.error.aio_result;
7383 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7384 		*aio_result = 0;
7385 	} else {
7386 		*aio_result = -EIO;
7387 	}
7388 }
7389 
7390 void
7391 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
7392 {
7393 	enum spdk_bdev_io_status status;
7394 
7395 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
7396 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7397 	} else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
7398 		status = SPDK_BDEV_IO_STATUS_ABORTED;
7399 	} else {
7400 		status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
7401 	}
7402 
7403 	bdev_io->internal.error.nvme.cdw0 = cdw0;
7404 	bdev_io->internal.error.nvme.sct = sct;
7405 	bdev_io->internal.error.nvme.sc = sc;
7406 
7407 	spdk_bdev_io_complete(bdev_io, status);
7408 }
7409 
7410 void
7411 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
7412 {
7413 	assert(sct != NULL);
7414 	assert(sc != NULL);
7415 	assert(cdw0 != NULL);
7416 
7417 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
7418 		*sct = SPDK_NVME_SCT_GENERIC;
7419 		*sc = SPDK_NVME_SC_SUCCESS;
7420 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7421 			*cdw0 = 0;
7422 		} else {
7423 			*cdw0 = 1U;
7424 		}
7425 		return;
7426 	}
7427 
7428 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7429 		*sct = bdev_io->internal.error.nvme.sct;
7430 		*sc = bdev_io->internal.error.nvme.sc;
7431 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7432 		*sct = SPDK_NVME_SCT_GENERIC;
7433 		*sc = SPDK_NVME_SC_SUCCESS;
7434 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7435 		*sct = SPDK_NVME_SCT_GENERIC;
7436 		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7437 	} else {
7438 		*sct = SPDK_NVME_SCT_GENERIC;
7439 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7440 	}
7441 
7442 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7443 }
7444 
7445 void
7446 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
7447 				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
7448 {
7449 	assert(first_sct != NULL);
7450 	assert(first_sc != NULL);
7451 	assert(second_sct != NULL);
7452 	assert(second_sc != NULL);
7453 	assert(cdw0 != NULL);
7454 
7455 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7456 		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
7457 		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
7458 			*first_sct = bdev_io->internal.error.nvme.sct;
7459 			*first_sc = bdev_io->internal.error.nvme.sc;
7460 			*second_sct = SPDK_NVME_SCT_GENERIC;
7461 			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7462 		} else {
7463 			*first_sct = SPDK_NVME_SCT_GENERIC;
7464 			*first_sc = SPDK_NVME_SC_SUCCESS;
7465 			*second_sct = bdev_io->internal.error.nvme.sct;
7466 			*second_sc = bdev_io->internal.error.nvme.sc;
7467 		}
7468 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7469 		*first_sct = SPDK_NVME_SCT_GENERIC;
7470 		*first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7471 		*second_sct = SPDK_NVME_SCT_GENERIC;
7472 		*second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7473 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7474 		*first_sct = SPDK_NVME_SCT_GENERIC;
7475 		*first_sc = SPDK_NVME_SC_SUCCESS;
7476 		*second_sct = SPDK_NVME_SCT_GENERIC;
7477 		*second_sc = SPDK_NVME_SC_SUCCESS;
7478 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
7479 		*first_sct = SPDK_NVME_SCT_GENERIC;
7480 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7481 		*second_sct = SPDK_NVME_SCT_GENERIC;
7482 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7483 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
7484 		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
7485 		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
7486 		*second_sct = SPDK_NVME_SCT_GENERIC;
7487 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7488 	} else {
7489 		*first_sct = SPDK_NVME_SCT_GENERIC;
7490 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7491 		*second_sct = SPDK_NVME_SCT_GENERIC;
7492 		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7493 	}
7494 
7495 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7496 }
7497 
7498 struct spdk_thread *
7499 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
7500 {
7501 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
7502 }
7503 
7504 struct spdk_io_channel *
7505 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
7506 {
7507 	return bdev_io->internal.ch->channel;
7508 }
7509 
7510 static int
7511 bdev_register(struct spdk_bdev *bdev)
7512 {
7513 	char *bdev_name;
7514 	char uuid[SPDK_UUID_STRING_LEN];
7515 	struct spdk_iobuf_opts iobuf_opts;
7516 	int ret, i;
7517 
7518 	assert(bdev->module != NULL);
7519 
7520 	if (!bdev->name) {
7521 		SPDK_ERRLOG("Bdev name is NULL\n");
7522 		return -EINVAL;
7523 	}
7524 
7525 	if (!strlen(bdev->name)) {
7526 		SPDK_ERRLOG("Bdev name must not be an empty string\n");
7527 		return -EINVAL;
7528 	}
7529 
7530 	for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
7531 		if (bdev->fn_table->accel_sequence_supported == NULL) {
7532 			continue;
7533 		}
7534 		if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt,
7535 				(enum spdk_bdev_io_type)i)) {
7536 			continue;
7537 		}
7538 
7539 		if (spdk_bdev_is_md_separate(bdev)) {
7540 			SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with "
7541 				    "accel sequence support\n");
7542 			return -EINVAL;
7543 		}
7544 	}
7545 
7546 	/* Users often register their own I/O devices using the bdev name. In
7547 	 * order to avoid conflicts, prepend bdev_. */
7548 	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
7549 	if (!bdev_name) {
7550 		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
7551 		return -ENOMEM;
7552 	}
7553 
7554 	bdev->internal.stat = bdev_alloc_io_stat(true);
7555 	if (!bdev->internal.stat) {
7556 		SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
7557 		free(bdev_name);
7558 		return -ENOMEM;
7559 	}
7560 
7561 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
7562 	bdev->internal.measured_queue_depth = UINT64_MAX;
7563 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
7564 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
7565 	bdev->internal.qd_poller = NULL;
7566 	bdev->internal.qos = NULL;
7567 
7568 	TAILQ_INIT(&bdev->internal.open_descs);
7569 	TAILQ_INIT(&bdev->internal.locked_ranges);
7570 	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
7571 	TAILQ_INIT(&bdev->aliases);
7572 
7573 	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
7574 	if (ret != 0) {
7575 		bdev_free_io_stat(bdev->internal.stat);
7576 		free(bdev_name);
7577 		return ret;
7578 	}
7579 
7580 	/* UUID may be specified by the user or defined by bdev itself.
7581 	 * Otherwise it will be generated here, so this field will never be empty. */
7582 	if (spdk_uuid_is_null(&bdev->uuid)) {
7583 		spdk_uuid_generate(&bdev->uuid);
7584 	}
7585 
7586 	/* Add the UUID alias only if it's different than the name */
7587 	spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7588 	if (strcmp(bdev->name, uuid) != 0) {
7589 		ret = spdk_bdev_alias_add(bdev, uuid);
7590 		if (ret != 0) {
7591 			SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
7592 			bdev_name_del(&bdev->internal.bdev_name);
7593 			bdev_free_io_stat(bdev->internal.stat);
7594 			free(bdev_name);
7595 			return ret;
7596 		}
7597 	}
7598 
7599 	spdk_iobuf_get_opts(&iobuf_opts);
7600 	if (spdk_bdev_get_buf_align(bdev) > 1) {
7601 		bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX,
7602 					     iobuf_opts.large_bufsize / bdev->blocklen);
7603 	}
7604 
7605 	/* If the user didn't specify a write unit size, set it to one. */
7606 	if (bdev->write_unit_size == 0) {
7607 		bdev->write_unit_size = 1;
7608 	}
7609 
7610 	/* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */
7611 	if (bdev->acwu == 0) {
7612 		bdev->acwu = bdev->write_unit_size;
7613 	}
7614 
7615 	if (bdev->phys_blocklen == 0) {
7616 		bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
7617 	}
7618 
7619 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
7620 		bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
7621 	}
7622 
7623 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
7624 		bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE);
7625 	}
7626 
7627 	bdev->internal.reset_in_progress = NULL;
7628 	bdev->internal.qd_poll_in_progress = false;
7629 	bdev->internal.period = 0;
7630 	bdev->internal.new_period = 0;
7631 
7632 	spdk_io_device_register(__bdev_to_io_dev(bdev),
7633 				bdev_channel_create, bdev_channel_destroy,
7634 				sizeof(struct spdk_bdev_channel),
7635 				bdev_name);
7636 
7637 	free(bdev_name);
7638 
7639 	spdk_spin_init(&bdev->internal.spinlock);
7640 
7641 	SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
7642 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
7643 
7644 	return 0;
7645 }
7646 
7647 static void
7648 bdev_destroy_cb(void *io_device)
7649 {
7650 	int			rc;
7651 	struct spdk_bdev	*bdev;
7652 	spdk_bdev_unregister_cb	cb_fn;
7653 	void			*cb_arg;
7654 
7655 	bdev = __bdev_from_io_dev(io_device);
7656 
7657 	if (bdev->internal.unregister_td != spdk_get_thread()) {
7658 		spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device);
7659 		return;
7660 	}
7661 
7662 	cb_fn = bdev->internal.unregister_cb;
7663 	cb_arg = bdev->internal.unregister_ctx;
7664 
7665 	spdk_spin_destroy(&bdev->internal.spinlock);
7666 	free(bdev->internal.qos);
7667 	bdev_free_io_stat(bdev->internal.stat);
7668 
7669 	rc = bdev->fn_table->destruct(bdev->ctxt);
7670 	if (rc < 0) {
7671 		SPDK_ERRLOG("destruct failed\n");
7672 	}
7673 	if (rc <= 0 && cb_fn != NULL) {
7674 		cb_fn(cb_arg, rc);
7675 	}
7676 }
7677 
7678 void
7679 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
7680 {
7681 	if (bdev->internal.unregister_cb != NULL) {
7682 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
7683 	}
7684 }
7685 
7686 static void
7687 _remove_notify(void *arg)
7688 {
7689 	struct spdk_bdev_desc *desc = arg;
7690 
7691 	_event_notify(desc, SPDK_BDEV_EVENT_REMOVE);
7692 }
7693 
7694 /* returns: 0 - bdev removed and ready to be destructed.
7695  *          -EBUSY - bdev can't be destructed yet.  */
7696 static int
7697 bdev_unregister_unsafe(struct spdk_bdev *bdev)
7698 {
7699 	struct spdk_bdev_desc	*desc, *tmp;
7700 	int			rc = 0;
7701 	char			uuid[SPDK_UUID_STRING_LEN];
7702 
7703 	assert(spdk_spin_held(&g_bdev_mgr.spinlock));
7704 	assert(spdk_spin_held(&bdev->internal.spinlock));
7705 
7706 	/* Notify each descriptor about hotremoval */
7707 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
7708 		rc = -EBUSY;
7709 		/*
7710 		 * Defer invocation of the event_cb to a separate message that will
7711 		 *  run later on its thread.  This ensures this context unwinds and
7712 		 *  we don't recursively unregister this bdev again if the event_cb
7713 		 *  immediately closes its descriptor.
7714 		 */
7715 		event_notify(desc, _remove_notify);
7716 	}
7717 
7718 	/* If there are no descriptors, proceed removing the bdev */
7719 	if (rc == 0) {
7720 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
7721 		SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
7722 
7723 		/* Delete the name and the UUID alias */
7724 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7725 		bdev_name_del_unsafe(&bdev->internal.bdev_name);
7726 		bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
7727 
7728 		spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
7729 
7730 		if (bdev->internal.reset_in_progress != NULL) {
7731 			/* If reset is in progress, let the completion callback for reset
7732 			 * unregister the bdev.
7733 			 */
7734 			rc = -EBUSY;
7735 		}
7736 	}
7737 
7738 	return rc;
7739 }
7740 
7741 static void
7742 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7743 			      struct spdk_io_channel *io_ch, void *_ctx)
7744 {
7745 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
7746 
7747 	bdev_channel_abort_queued_ios(bdev_ch);
7748 	spdk_bdev_for_each_channel_continue(i, 0);
7749 }
7750 
7751 static void
7752 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
7753 {
7754 	int rc;
7755 
7756 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7757 	spdk_spin_lock(&bdev->internal.spinlock);
7758 	/*
7759 	 * Set the status to REMOVING after completing to abort channels. Otherwise,
7760 	 * the last spdk_bdev_close() may call spdk_io_device_unregister() while
7761 	 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister()
7762 	 * may fail.
7763 	 */
7764 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
7765 	rc = bdev_unregister_unsafe(bdev);
7766 	spdk_spin_unlock(&bdev->internal.spinlock);
7767 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7768 
7769 	if (rc == 0) {
7770 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7771 	}
7772 }
7773 
7774 void
7775 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
7776 {
7777 	struct spdk_thread	*thread;
7778 
7779 	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
7780 
7781 	thread = spdk_get_thread();
7782 	if (!thread) {
7783 		/* The user called this from a non-SPDK thread. */
7784 		if (cb_fn != NULL) {
7785 			cb_fn(cb_arg, -ENOTSUP);
7786 		}
7787 		return;
7788 	}
7789 
7790 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7791 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
7792 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
7793 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7794 		if (cb_fn) {
7795 			cb_fn(cb_arg, -EBUSY);
7796 		}
7797 		return;
7798 	}
7799 
7800 	spdk_spin_lock(&bdev->internal.spinlock);
7801 	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
7802 	bdev->internal.unregister_cb = cb_fn;
7803 	bdev->internal.unregister_ctx = cb_arg;
7804 	bdev->internal.unregister_td = thread;
7805 	spdk_spin_unlock(&bdev->internal.spinlock);
7806 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7807 
7808 	spdk_bdev_set_qd_sampling_period(bdev, 0);
7809 
7810 	spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
7811 				   bdev_unregister);
7812 }
7813 
7814 int
7815 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
7816 			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
7817 {
7818 	struct spdk_bdev_desc *desc;
7819 	struct spdk_bdev *bdev;
7820 	int rc;
7821 
7822 	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
7823 	if (rc != 0) {
7824 		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
7825 		return rc;
7826 	}
7827 
7828 	bdev = spdk_bdev_desc_get_bdev(desc);
7829 
7830 	if (bdev->module != module) {
7831 		spdk_bdev_close(desc);
7832 		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
7833 			    bdev_name);
7834 		return -ENODEV;
7835 	}
7836 
7837 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
7838 
7839 	spdk_bdev_close(desc);
7840 
7841 	return 0;
7842 }
7843 
7844 static int
7845 bdev_start_qos(struct spdk_bdev *bdev)
7846 {
7847 	struct set_qos_limit_ctx *ctx;
7848 
7849 	/* Enable QoS */
7850 	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
7851 		ctx = calloc(1, sizeof(*ctx));
7852 		if (ctx == NULL) {
7853 			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
7854 			return -ENOMEM;
7855 		}
7856 		ctx->bdev = bdev;
7857 		spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
7858 	}
7859 
7860 	return 0;
7861 }
7862 
7863 static void
7864 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail,
7865 		    struct spdk_bdev *bdev)
7866 {
7867 	enum spdk_bdev_claim_type type;
7868 	const char *typename, *modname;
7869 	extern struct spdk_log_flag SPDK_LOG_bdev;
7870 
7871 	assert(spdk_spin_held(&bdev->internal.spinlock));
7872 
7873 	if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) {
7874 		return;
7875 	}
7876 
7877 	type = bdev->internal.claim_type;
7878 	typename = spdk_bdev_claim_get_name(type);
7879 
7880 	if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) {
7881 		modname = bdev->internal.claim.v1.module->name;
7882 		spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
7883 			 bdev->name, detail, typename, modname);
7884 		return;
7885 	}
7886 
7887 	if (claim_type_is_v2(type)) {
7888 		struct spdk_bdev_module_claim *claim;
7889 
7890 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
7891 			modname = claim->module->name;
7892 			spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
7893 				 bdev->name, detail, typename, modname);
7894 		}
7895 		return;
7896 	}
7897 
7898 	assert(false);
7899 }
7900 
7901 static int
7902 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
7903 {
7904 	struct spdk_thread *thread;
7905 	int rc = 0;
7906 
7907 	thread = spdk_get_thread();
7908 	if (!thread) {
7909 		SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
7910 		return -ENOTSUP;
7911 	}
7912 
7913 	SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
7914 		      spdk_get_thread());
7915 
7916 	desc->bdev = bdev;
7917 	desc->thread = thread;
7918 	desc->write = write;
7919 
7920 	spdk_spin_lock(&bdev->internal.spinlock);
7921 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
7922 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
7923 		spdk_spin_unlock(&bdev->internal.spinlock);
7924 		return -ENODEV;
7925 	}
7926 
7927 	if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
7928 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
7929 		spdk_spin_unlock(&bdev->internal.spinlock);
7930 		return -EPERM;
7931 	}
7932 
7933 	rc = bdev_start_qos(bdev);
7934 	if (rc != 0) {
7935 		SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
7936 		spdk_spin_unlock(&bdev->internal.spinlock);
7937 		return rc;
7938 	}
7939 
7940 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
7941 
7942 	spdk_spin_unlock(&bdev->internal.spinlock);
7943 
7944 	return 0;
7945 }
7946 
7947 static int
7948 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
7949 		struct spdk_bdev_desc **_desc)
7950 {
7951 	struct spdk_bdev_desc *desc;
7952 	unsigned int i;
7953 
7954 	desc = calloc(1, sizeof(*desc));
7955 	if (desc == NULL) {
7956 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
7957 		return -ENOMEM;
7958 	}
7959 
7960 	TAILQ_INIT(&desc->pending_media_events);
7961 	TAILQ_INIT(&desc->free_media_events);
7962 
7963 	desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
7964 	desc->callback.event_fn = event_cb;
7965 	desc->callback.ctx = event_ctx;
7966 	spdk_spin_init(&desc->spinlock);
7967 
7968 	if (bdev->media_events) {
7969 		desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
7970 						   sizeof(*desc->media_events_buffer));
7971 		if (desc->media_events_buffer == NULL) {
7972 			SPDK_ERRLOG("Failed to initialize media event pool\n");
7973 			bdev_desc_free(desc);
7974 			return -ENOMEM;
7975 		}
7976 
7977 		for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) {
7978 			TAILQ_INSERT_TAIL(&desc->free_media_events,
7979 					  &desc->media_events_buffer[i], tailq);
7980 		}
7981 	}
7982 
7983 	if (bdev->fn_table->accel_sequence_supported != NULL) {
7984 		for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
7985 			desc->accel_sequence_supported[i] =
7986 				bdev->fn_table->accel_sequence_supported(bdev->ctxt,
7987 						(enum spdk_bdev_io_type)i);
7988 		}
7989 	}
7990 
7991 	*_desc = desc;
7992 
7993 	return 0;
7994 }
7995 
7996 static int
7997 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
7998 	      void *event_ctx, struct spdk_bdev_desc **_desc)
7999 {
8000 	struct spdk_bdev_desc *desc;
8001 	struct spdk_bdev *bdev;
8002 	int rc;
8003 
8004 	bdev = bdev_get_by_name(bdev_name);
8005 
8006 	if (bdev == NULL) {
8007 		SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
8008 		return -ENODEV;
8009 	}
8010 
8011 	rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc);
8012 	if (rc != 0) {
8013 		return rc;
8014 	}
8015 
8016 	rc = bdev_open(bdev, write, desc);
8017 	if (rc != 0) {
8018 		bdev_desc_free(desc);
8019 		desc = NULL;
8020 	}
8021 
8022 	*_desc = desc;
8023 
8024 	return rc;
8025 }
8026 
8027 int
8028 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8029 		   void *event_ctx, struct spdk_bdev_desc **_desc)
8030 {
8031 	int rc;
8032 
8033 	if (event_cb == NULL) {
8034 		SPDK_ERRLOG("Missing event callback function\n");
8035 		return -EINVAL;
8036 	}
8037 
8038 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8039 	rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc);
8040 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8041 
8042 	return rc;
8043 }
8044 
8045 struct spdk_bdev_open_async_ctx {
8046 	char					*bdev_name;
8047 	spdk_bdev_event_cb_t			event_cb;
8048 	void					*event_ctx;
8049 	bool					write;
8050 	int					rc;
8051 	spdk_bdev_open_async_cb_t		cb_fn;
8052 	void					*cb_arg;
8053 	struct spdk_bdev_desc			*desc;
8054 	struct spdk_bdev_open_async_opts	opts;
8055 	uint64_t				start_ticks;
8056 	struct spdk_thread			*orig_thread;
8057 	struct spdk_poller			*poller;
8058 	TAILQ_ENTRY(spdk_bdev_open_async_ctx)	tailq;
8059 };
8060 
8061 static void
8062 bdev_open_async_done(void *arg)
8063 {
8064 	struct spdk_bdev_open_async_ctx *ctx = arg;
8065 
8066 	ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg);
8067 
8068 	free(ctx->bdev_name);
8069 	free(ctx);
8070 }
8071 
8072 static void
8073 bdev_open_async_cancel(void *arg)
8074 {
8075 	struct spdk_bdev_open_async_ctx *ctx = arg;
8076 
8077 	assert(ctx->rc == -ESHUTDOWN);
8078 
8079 	spdk_poller_unregister(&ctx->poller);
8080 
8081 	bdev_open_async_done(ctx);
8082 }
8083 
8084 /* This is called when the bdev library finishes at shutdown. */
8085 static void
8086 bdev_open_async_fini(void)
8087 {
8088 	struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx;
8089 
8090 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8091 	TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) {
8092 		TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8093 		/*
8094 		 * We have to move to ctx->orig_thread to unregister ctx->poller.
8095 		 * However, there is a chance that ctx->poller is executed before
8096 		 * message is executed, which could result in bdev_open_async_done()
8097 		 * being called twice. To avoid such race condition, set ctx->rc to
8098 		 * -ESHUTDOWN.
8099 		 */
8100 		ctx->rc = -ESHUTDOWN;
8101 		spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx);
8102 	}
8103 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8104 }
8105 
8106 static int bdev_open_async(void *arg);
8107 
8108 static void
8109 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx)
8110 {
8111 	uint64_t timeout_ticks;
8112 
8113 	if (ctx->rc == -ESHUTDOWN) {
8114 		/* This context is being canceled. Do nothing. */
8115 		return;
8116 	}
8117 
8118 	ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx,
8119 				&ctx->desc);
8120 	if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) {
8121 		goto exit;
8122 	}
8123 
8124 	timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull;
8125 	if (spdk_get_ticks() >= timeout_ticks) {
8126 		SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name);
8127 		ctx->rc = -ETIMEDOUT;
8128 		goto exit;
8129 	}
8130 
8131 	return;
8132 
8133 exit:
8134 	spdk_poller_unregister(&ctx->poller);
8135 	TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8136 
8137 	/* Completion callback is processed after stack unwinding. */
8138 	spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
8139 }
8140 
8141 static int
8142 bdev_open_async(void *arg)
8143 {
8144 	struct spdk_bdev_open_async_ctx *ctx = arg;
8145 
8146 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8147 
8148 	_bdev_open_async(ctx);
8149 
8150 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8151 
8152 	return SPDK_POLLER_BUSY;
8153 }
8154 
8155 static void
8156 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
8157 			  struct spdk_bdev_open_async_opts *opts_src,
8158 			  size_t size)
8159 {
8160 	assert(opts);
8161 	assert(opts_src);
8162 
8163 	opts->size = size;
8164 
8165 #define SET_FIELD(field) \
8166 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8167 		opts->field = opts_src->field; \
8168 	} \
8169 
8170 	SET_FIELD(timeout_ms);
8171 
8172 	/* Do not remove this statement, you should always update this statement when you adding a new field,
8173 	 * and do not forget to add the SET_FIELD statement for your added field. */
8174 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size");
8175 
8176 #undef SET_FIELD
8177 }
8178 
8179 static void
8180 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size)
8181 {
8182 	assert(opts);
8183 
8184 	opts->size = size;
8185 
8186 #define SET_FIELD(field, value) \
8187 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8188 		opts->field = value; \
8189 	} \
8190 
8191 	SET_FIELD(timeout_ms, 0);
8192 
8193 #undef SET_FIELD
8194 }
8195 
8196 int
8197 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8198 		     void *event_ctx, struct spdk_bdev_open_async_opts *opts,
8199 		     spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg)
8200 {
8201 	struct spdk_bdev_open_async_ctx *ctx;
8202 
8203 	if (event_cb == NULL) {
8204 		SPDK_ERRLOG("Missing event callback function\n");
8205 		return -EINVAL;
8206 	}
8207 
8208 	if (open_cb == NULL) {
8209 		SPDK_ERRLOG("Missing open callback function\n");
8210 		return -EINVAL;
8211 	}
8212 
8213 	if (opts != NULL && opts->size == 0) {
8214 		SPDK_ERRLOG("size in the options structure should not be zero\n");
8215 		return -EINVAL;
8216 	}
8217 
8218 	ctx = calloc(1, sizeof(*ctx));
8219 	if (ctx == NULL) {
8220 		SPDK_ERRLOG("Failed to allocate open context\n");
8221 		return -ENOMEM;
8222 	}
8223 
8224 	ctx->bdev_name = strdup(bdev_name);
8225 	if (ctx->bdev_name == NULL) {
8226 		SPDK_ERRLOG("Failed to duplicate bdev_name\n");
8227 		free(ctx);
8228 		return -ENOMEM;
8229 	}
8230 
8231 	ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000);
8232 	if (ctx->poller == NULL) {
8233 		SPDK_ERRLOG("Failed to register bdev_open_async poller\n");
8234 		free(ctx->bdev_name);
8235 		free(ctx);
8236 		return -ENOMEM;
8237 	}
8238 
8239 	ctx->cb_fn = open_cb;
8240 	ctx->cb_arg = open_cb_arg;
8241 	ctx->write = write;
8242 	ctx->event_cb = event_cb;
8243 	ctx->event_ctx = event_ctx;
8244 	ctx->orig_thread = spdk_get_thread();
8245 	ctx->start_ticks = spdk_get_ticks();
8246 
8247 	bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts));
8248 	if (opts != NULL) {
8249 		bdev_open_async_opts_copy(&ctx->opts, opts, opts->size);
8250 	}
8251 
8252 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8253 
8254 	TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8255 	_bdev_open_async(ctx);
8256 
8257 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8258 
8259 	return 0;
8260 }
8261 
8262 static void
8263 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
8264 {
8265 	int rc;
8266 
8267 	spdk_spin_lock(&bdev->internal.spinlock);
8268 	spdk_spin_lock(&desc->spinlock);
8269 
8270 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
8271 
8272 	desc->closed = true;
8273 
8274 	if (desc->claim != NULL) {
8275 		bdev_desc_release_claims(desc);
8276 	}
8277 
8278 	if (0 == desc->refs) {
8279 		spdk_spin_unlock(&desc->spinlock);
8280 		bdev_desc_free(desc);
8281 	} else {
8282 		spdk_spin_unlock(&desc->spinlock);
8283 	}
8284 
8285 	/* If no more descriptors, kill QoS channel */
8286 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8287 		SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
8288 			      bdev->name, spdk_get_thread());
8289 
8290 		if (bdev_qos_destroy(bdev)) {
8291 			/* There isn't anything we can do to recover here. Just let the
8292 			 * old QoS poller keep running. The QoS handling won't change
8293 			 * cores when the user allocates a new channel, but it won't break. */
8294 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
8295 		}
8296 	}
8297 
8298 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8299 		rc = bdev_unregister_unsafe(bdev);
8300 		spdk_spin_unlock(&bdev->internal.spinlock);
8301 
8302 		if (rc == 0) {
8303 			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
8304 		}
8305 	} else {
8306 		spdk_spin_unlock(&bdev->internal.spinlock);
8307 	}
8308 }
8309 
8310 void
8311 spdk_bdev_close(struct spdk_bdev_desc *desc)
8312 {
8313 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8314 
8315 	SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
8316 		      spdk_get_thread());
8317 
8318 	assert(desc->thread == spdk_get_thread());
8319 
8320 	spdk_poller_unregister(&desc->io_timeout_poller);
8321 
8322 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8323 
8324 	bdev_close(bdev, desc);
8325 
8326 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8327 }
8328 
8329 static void
8330 bdev_register_finished(void *arg)
8331 {
8332 	struct spdk_bdev_desc *desc = arg;
8333 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8334 
8335 	spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
8336 
8337 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8338 
8339 	bdev_close(bdev, desc);
8340 
8341 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8342 }
8343 
8344 int
8345 spdk_bdev_register(struct spdk_bdev *bdev)
8346 {
8347 	struct spdk_bdev_desc *desc;
8348 	struct spdk_thread *thread = spdk_get_thread();
8349 	int rc;
8350 
8351 	if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) {
8352 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread,
8353 			    thread ? spdk_thread_get_name(thread) : "null");
8354 		return -EINVAL;
8355 	}
8356 
8357 	rc = bdev_register(bdev);
8358 	if (rc != 0) {
8359 		return rc;
8360 	}
8361 
8362 	/* A descriptor is opened to prevent bdev deletion during examination */
8363 	rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8364 	if (rc != 0) {
8365 		spdk_bdev_unregister(bdev, NULL, NULL);
8366 		return rc;
8367 	}
8368 
8369 	rc = bdev_open(bdev, false, desc);
8370 	if (rc != 0) {
8371 		bdev_desc_free(desc);
8372 		spdk_bdev_unregister(bdev, NULL, NULL);
8373 		return rc;
8374 	}
8375 
8376 	/* Examine configuration before initializing I/O */
8377 	bdev_examine(bdev);
8378 
8379 	rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
8380 	if (rc != 0) {
8381 		bdev_close(bdev, desc);
8382 		spdk_bdev_unregister(bdev, NULL, NULL);
8383 	}
8384 
8385 	return rc;
8386 }
8387 
8388 int
8389 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
8390 			    struct spdk_bdev_module *module)
8391 {
8392 	spdk_spin_lock(&bdev->internal.spinlock);
8393 
8394 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8395 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8396 		spdk_spin_unlock(&bdev->internal.spinlock);
8397 		return -EPERM;
8398 	}
8399 
8400 	if (desc && !desc->write) {
8401 		desc->write = true;
8402 	}
8403 
8404 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE;
8405 	bdev->internal.claim.v1.module = module;
8406 
8407 	spdk_spin_unlock(&bdev->internal.spinlock);
8408 	return 0;
8409 }
8410 
8411 void
8412 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
8413 {
8414 	spdk_spin_lock(&bdev->internal.spinlock);
8415 
8416 	assert(bdev->internal.claim.v1.module != NULL);
8417 	assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE);
8418 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8419 	bdev->internal.claim.v1.module = NULL;
8420 
8421 	spdk_spin_unlock(&bdev->internal.spinlock);
8422 }
8423 
8424 /*
8425  * Start claims v2
8426  */
8427 
8428 const char *
8429 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type)
8430 {
8431 	switch (type) {
8432 	case SPDK_BDEV_CLAIM_NONE:
8433 		return "not_claimed";
8434 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
8435 		return "exclusive_write";
8436 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8437 		return "read_many_write_one";
8438 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8439 		return "read_many_write_none";
8440 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8441 		return "read_many_write_many";
8442 	default:
8443 		break;
8444 	}
8445 	return "invalid_claim";
8446 }
8447 
8448 static bool
8449 claim_type_is_v2(enum spdk_bdev_claim_type type)
8450 {
8451 	switch (type) {
8452 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8453 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8454 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8455 		return true;
8456 	default:
8457 		break;
8458 	}
8459 	return false;
8460 }
8461 
8462 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */
8463 static bool
8464 claim_type_promotes_to_write(enum spdk_bdev_claim_type type)
8465 {
8466 	switch (type) {
8467 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8468 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8469 		return true;
8470 	default:
8471 		break;
8472 	}
8473 	return false;
8474 }
8475 
8476 void
8477 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size)
8478 {
8479 	if (opts == NULL) {
8480 		SPDK_ERRLOG("opts should not be NULL\n");
8481 		assert(opts != NULL);
8482 		return;
8483 	}
8484 	if (size == 0) {
8485 		SPDK_ERRLOG("size should not be zero\n");
8486 		assert(size != 0);
8487 		return;
8488 	}
8489 
8490 	memset(opts, 0, size);
8491 	opts->opts_size = size;
8492 
8493 #define FIELD_OK(field) \
8494         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size
8495 
8496 #define SET_FIELD(field, value) \
8497         if (FIELD_OK(field)) { \
8498                 opts->field = value; \
8499         } \
8500 
8501 	SET_FIELD(shared_claim_key, 0);
8502 
8503 #undef FIELD_OK
8504 #undef SET_FIELD
8505 }
8506 
8507 static int
8508 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst)
8509 {
8510 	if (src->opts_size == 0) {
8511 		SPDK_ERRLOG("size should not be zero\n");
8512 		return -1;
8513 	}
8514 
8515 	memset(dst, 0, sizeof(*dst));
8516 	dst->opts_size = src->opts_size;
8517 
8518 #define FIELD_OK(field) \
8519         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size
8520 
8521 #define SET_FIELD(field) \
8522         if (FIELD_OK(field)) { \
8523                 dst->field = src->field; \
8524         } \
8525 
8526 	if (FIELD_OK(name)) {
8527 		snprintf(dst->name, sizeof(dst->name), "%s", src->name);
8528 	}
8529 
8530 	SET_FIELD(shared_claim_key);
8531 
8532 	/* You should not remove this statement, but need to update the assert statement
8533 	 * if you add a new field, and also add a corresponding SET_FIELD statement */
8534 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size");
8535 
8536 #undef FIELD_OK
8537 #undef SET_FIELD
8538 	return 0;
8539 }
8540 
8541 /* Returns 0 if a read-write-once claim can be taken. */
8542 static int
8543 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8544 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8545 {
8546 	struct spdk_bdev *bdev = desc->bdev;
8547 	struct spdk_bdev_desc *open_desc;
8548 
8549 	assert(spdk_spin_held(&bdev->internal.spinlock));
8550 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);
8551 
8552 	if (opts->shared_claim_key != 0) {
8553 		SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
8554 			    bdev->name);
8555 		return -EINVAL;
8556 	}
8557 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8558 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8559 		return -EPERM;
8560 	}
8561 	if (desc->claim != NULL) {
8562 		SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
8563 			       bdev->name, desc->claim->module->name);
8564 		return -EPERM;
8565 	}
8566 	TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8567 		if (desc != open_desc && open_desc->write) {
8568 			SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
8569 				       "another descriptor is open for writing\n",
8570 				       bdev->name);
8571 			return -EPERM;
8572 		}
8573 	}
8574 
8575 	return 0;
8576 }
8577 
8578 /* Returns 0 if a read-only-many claim can be taken. */
8579 static int
8580 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8581 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8582 {
8583 	struct spdk_bdev *bdev = desc->bdev;
8584 	struct spdk_bdev_desc *open_desc;
8585 
8586 	assert(spdk_spin_held(&bdev->internal.spinlock));
8587 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
8588 	assert(desc->claim == NULL);
8589 
8590 	if (desc->write) {
8591 		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
8592 			    bdev->name);
8593 		return -EINVAL;
8594 	}
8595 	if (opts->shared_claim_key != 0) {
8596 		SPDK_ERRLOG("%s: key option not supported with read-only-may claims\n", bdev->name);
8597 		return -EINVAL;
8598 	}
8599 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8600 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8601 			if (open_desc->write) {
8602 				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
8603 					       "another descriptor is open for writing\n",
8604 					       bdev->name);
8605 				return -EPERM;
8606 			}
8607 		}
8608 	}
8609 
8610 	return 0;
8611 }
8612 
8613 /* Returns 0 if a read-write-many claim can be taken. */
8614 static int
8615 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8616 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8617 {
8618 	struct spdk_bdev *bdev = desc->bdev;
8619 	struct spdk_bdev_desc *open_desc;
8620 
8621 	assert(spdk_spin_held(&bdev->internal.spinlock));
8622 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
8623 	assert(desc->claim == NULL);
8624 
8625 	if (opts->shared_claim_key == 0) {
8626 		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-may claims\n",
8627 			    bdev->name);
8628 		return -EINVAL;
8629 	}
8630 	switch (bdev->internal.claim_type) {
8631 	case SPDK_BDEV_CLAIM_NONE:
8632 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8633 			if (open_desc == desc) {
8634 				continue;
8635 			}
8636 			if (open_desc->write) {
8637 				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
8638 					       "another descriptor is open for writing without a "
8639 					       "claim\n", bdev->name);
8640 				return -EPERM;
8641 			}
8642 		}
8643 		break;
8644 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8645 		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
8646 			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
8647 			return -EPERM;
8648 		}
8649 		break;
8650 	default:
8651 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8652 		return -EBUSY;
8653 	}
8654 
8655 	return 0;
8656 }
8657 
8658 /* Updates desc and its bdev with a v2 claim. */
8659 static int
8660 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8661 	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8662 {
8663 	struct spdk_bdev *bdev = desc->bdev;
8664 	struct spdk_bdev_module_claim *claim;
8665 
8666 	assert(spdk_spin_held(&bdev->internal.spinlock));
8667 	assert(claim_type_is_v2(type));
8668 	assert(desc->claim == NULL);
8669 
8670 	claim = calloc(1, sizeof(*desc->claim));
8671 	if (claim == NULL) {
8672 		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
8673 		return -ENOMEM;
8674 	}
8675 	claim->module = module;
8676 	claim->desc = desc;
8677 	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
8678 	memcpy(claim->name, opts->name, sizeof(claim->name));
8679 	desc->claim = claim;
8680 
8681 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8682 		bdev->internal.claim_type = type;
8683 		TAILQ_INIT(&bdev->internal.claim.v2.claims);
8684 		bdev->internal.claim.v2.key = opts->shared_claim_key;
8685 	}
8686 	assert(type == bdev->internal.claim_type);
8687 
8688 	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);
8689 
8690 	if (!desc->write && claim_type_promotes_to_write(type)) {
8691 		desc->write = true;
8692 	}
8693 
8694 	return 0;
8695 }
8696 
8697 int
8698 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8699 				 struct spdk_bdev_claim_opts *_opts,
8700 				 struct spdk_bdev_module *module)
8701 {
8702 	struct spdk_bdev *bdev;
8703 	struct spdk_bdev_claim_opts opts;
8704 	int rc = 0;
8705 
8706 	if (desc == NULL) {
8707 		SPDK_ERRLOG("descriptor must not be NULL\n");
8708 		return -EINVAL;
8709 	}
8710 
8711 	bdev = desc->bdev;
8712 
8713 	if (_opts == NULL) {
8714 		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
8715 	} else if (claim_opts_copy(_opts, &opts) != 0) {
8716 		return -EINVAL;
8717 	}
8718 
8719 	spdk_spin_lock(&bdev->internal.spinlock);
8720 
8721 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
8722 	    bdev->internal.claim_type != type) {
8723 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8724 		spdk_spin_unlock(&bdev->internal.spinlock);
8725 		return -EPERM;
8726 	}
8727 
8728 	if (claim_type_is_v2(type) && desc->claim != NULL) {
8729 		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
8730 			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
8731 		spdk_spin_unlock(&bdev->internal.spinlock);
8732 		return -EPERM;
8733 	}
8734 
8735 	switch (type) {
8736 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
8737 		spdk_spin_unlock(&bdev->internal.spinlock);
8738 		return spdk_bdev_module_claim_bdev(bdev, desc, module);
8739 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8740 		rc = claim_verify_rwo(desc, type, &opts, module);
8741 		break;
8742 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8743 		rc = claim_verify_rom(desc, type, &opts, module);
8744 		break;
8745 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8746 		rc = claim_verify_rwm(desc, type, &opts, module);
8747 		break;
8748 	default:
8749 		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
8750 		rc = -ENOTSUP;
8751 	}
8752 
8753 	if (rc == 0) {
8754 		rc = claim_bdev(desc, type, &opts, module);
8755 	}
8756 
8757 	spdk_spin_unlock(&bdev->internal.spinlock);
8758 	return rc;
8759 }
8760 
8761 static void
8762 claim_reset(struct spdk_bdev *bdev)
8763 {
8764 	assert(spdk_spin_held(&bdev->internal.spinlock));
8765 	assert(claim_type_is_v2(bdev->internal.claim_type));
8766 	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));
8767 
8768 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
8769 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8770 }
8771 
8772 static void
8773 bdev_desc_release_claims(struct spdk_bdev_desc *desc)
8774 {
8775 	struct spdk_bdev *bdev = desc->bdev;
8776 
8777 	assert(spdk_spin_held(&bdev->internal.spinlock));
8778 	assert(claim_type_is_v2(bdev->internal.claim_type));
8779 
8780 	if (bdev->internal.examine_in_progress == 0) {
8781 		TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
8782 		free(desc->claim);
8783 		if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
8784 			claim_reset(bdev);
8785 		}
8786 	} else {
8787 		/* This is a dead claim that will be cleaned up when bdev_examine() is done. */
8788 		desc->claim->module = NULL;
8789 		desc->claim->desc = NULL;
8790 	}
8791 	desc->claim = NULL;
8792 }
8793 
8794 /*
8795  * End claims v2
8796  */
8797 
8798 struct spdk_bdev *
8799 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
8800 {
8801 	assert(desc != NULL);
8802 	return desc->bdev;
8803 }
8804 
8805 int
8806 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
8807 {
8808 	struct spdk_bdev *bdev, *tmp;
8809 	struct spdk_bdev_desc *desc;
8810 	int rc = 0;
8811 
8812 	assert(fn != NULL);
8813 
8814 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8815 	bdev = spdk_bdev_first();
8816 	while (bdev != NULL) {
8817 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8818 		if (rc != 0) {
8819 			break;
8820 		}
8821 		rc = bdev_open(bdev, false, desc);
8822 		if (rc != 0) {
8823 			bdev_desc_free(desc);
8824 			if (rc == -ENODEV) {
8825 				/* Ignore the error and move to the next bdev. */
8826 				rc = 0;
8827 				bdev = spdk_bdev_next(bdev);
8828 				continue;
8829 			}
8830 			break;
8831 		}
8832 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8833 
8834 		rc = fn(ctx, bdev);
8835 
8836 		spdk_spin_lock(&g_bdev_mgr.spinlock);
8837 		tmp = spdk_bdev_next(bdev);
8838 		bdev_close(bdev, desc);
8839 		if (rc != 0) {
8840 			break;
8841 		}
8842 		bdev = tmp;
8843 	}
8844 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8845 
8846 	return rc;
8847 }
8848 
8849 int
8850 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
8851 {
8852 	struct spdk_bdev *bdev, *tmp;
8853 	struct spdk_bdev_desc *desc;
8854 	int rc = 0;
8855 
8856 	assert(fn != NULL);
8857 
8858 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8859 	bdev = spdk_bdev_first_leaf();
8860 	while (bdev != NULL) {
8861 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8862 		if (rc != 0) {
8863 			break;
8864 		}
8865 		rc = bdev_open(bdev, false, desc);
8866 		if (rc != 0) {
8867 			bdev_desc_free(desc);
8868 			if (rc == -ENODEV) {
8869 				/* Ignore the error and move to the next bdev. */
8870 				rc = 0;
8871 				bdev = spdk_bdev_next_leaf(bdev);
8872 				continue;
8873 			}
8874 			break;
8875 		}
8876 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8877 
8878 		rc = fn(ctx, bdev);
8879 
8880 		spdk_spin_lock(&g_bdev_mgr.spinlock);
8881 		tmp = spdk_bdev_next_leaf(bdev);
8882 		bdev_close(bdev, desc);
8883 		if (rc != 0) {
8884 			break;
8885 		}
8886 		bdev = tmp;
8887 	}
8888 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8889 
8890 	return rc;
8891 }
8892 
8893 void
8894 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
8895 {
8896 	struct iovec *iovs;
8897 	int iovcnt;
8898 
8899 	if (bdev_io == NULL) {
8900 		return;
8901 	}
8902 
8903 	switch (bdev_io->type) {
8904 	case SPDK_BDEV_IO_TYPE_READ:
8905 	case SPDK_BDEV_IO_TYPE_WRITE:
8906 	case SPDK_BDEV_IO_TYPE_ZCOPY:
8907 		iovs = bdev_io->u.bdev.iovs;
8908 		iovcnt = bdev_io->u.bdev.iovcnt;
8909 		break;
8910 	default:
8911 		iovs = NULL;
8912 		iovcnt = 0;
8913 		break;
8914 	}
8915 
8916 	if (iovp) {
8917 		*iovp = iovs;
8918 	}
8919 	if (iovcntp) {
8920 		*iovcntp = iovcnt;
8921 	}
8922 }
8923 
8924 void *
8925 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
8926 {
8927 	if (bdev_io == NULL) {
8928 		return NULL;
8929 	}
8930 
8931 	if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
8932 		return NULL;
8933 	}
8934 
8935 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
8936 	    bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
8937 		return bdev_io->u.bdev.md_buf;
8938 	}
8939 
8940 	return NULL;
8941 }
8942 
8943 void *
8944 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
8945 {
8946 	if (bdev_io == NULL) {
8947 		assert(false);
8948 		return NULL;
8949 	}
8950 
8951 	return bdev_io->internal.caller_ctx;
8952 }
8953 
8954 void
8955 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
8956 {
8957 
8958 	if (spdk_bdev_module_list_find(bdev_module->name)) {
8959 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
8960 		assert(false);
8961 	}
8962 
8963 	spdk_spin_init(&bdev_module->internal.spinlock);
8964 	TAILQ_INIT(&bdev_module->internal.quiesced_ranges);
8965 
8966 	/*
8967 	 * Modules with examine callbacks must be initialized first, so they are
8968 	 *  ready to handle examine callbacks from later modules that will
8969 	 *  register physical bdevs.
8970 	 */
8971 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
8972 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
8973 	} else {
8974 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
8975 	}
8976 }
8977 
8978 struct spdk_bdev_module *
8979 spdk_bdev_module_list_find(const char *name)
8980 {
8981 	struct spdk_bdev_module *bdev_module;
8982 
8983 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
8984 		if (strcmp(name, bdev_module->name) == 0) {
8985 			break;
8986 		}
8987 	}
8988 
8989 	return bdev_module;
8990 }
8991 
8992 static int
8993 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io)
8994 {
8995 	uint64_t num_blocks;
8996 	void *md_buf = NULL;
8997 
8998 	num_blocks = bdev_io->u.bdev.num_blocks;
8999 
9000 	if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
9001 		md_buf = (char *)g_bdev_mgr.zero_buffer +
9002 			 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
9003 	}
9004 
9005 	return bdev_write_blocks_with_md(bdev_io->internal.desc,
9006 					 spdk_io_channel_from_ctx(bdev_io->internal.ch),
9007 					 g_bdev_mgr.zero_buffer, md_buf,
9008 					 bdev_io->u.bdev.offset_blocks, num_blocks,
9009 					 bdev_write_zero_buffer_done, bdev_io);
9010 }
9011 
9012 static void
9013 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9014 {
9015 	struct spdk_bdev_io *parent_io = cb_arg;
9016 
9017 	spdk_bdev_free_io(bdev_io);
9018 
9019 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
9020 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
9021 }
9022 
9023 static void
9024 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
9025 {
9026 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
9027 	ctx->bdev->internal.qos_mod_in_progress = false;
9028 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9029 
9030 	if (ctx->cb_fn) {
9031 		ctx->cb_fn(ctx->cb_arg, status);
9032 	}
9033 	free(ctx);
9034 }
9035 
9036 static void
9037 bdev_disable_qos_done(void *cb_arg)
9038 {
9039 	struct set_qos_limit_ctx *ctx = cb_arg;
9040 	struct spdk_bdev *bdev = ctx->bdev;
9041 	struct spdk_bdev_qos *qos;
9042 
9043 	spdk_spin_lock(&bdev->internal.spinlock);
9044 	qos = bdev->internal.qos;
9045 	bdev->internal.qos = NULL;
9046 	spdk_spin_unlock(&bdev->internal.spinlock);
9047 
9048 	if (qos->thread != NULL) {
9049 		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
9050 		spdk_poller_unregister(&qos->poller);
9051 	}
9052 
9053 	free(qos);
9054 
9055 	bdev_set_qos_limit_done(ctx, 0);
9056 }
9057 
9058 static void
9059 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
9060 {
9061 	struct set_qos_limit_ctx *ctx = _ctx;
9062 	struct spdk_thread *thread;
9063 
9064 	spdk_spin_lock(&bdev->internal.spinlock);
9065 	thread = bdev->internal.qos->thread;
9066 	spdk_spin_unlock(&bdev->internal.spinlock);
9067 
9068 	if (thread != NULL) {
9069 		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
9070 	} else {
9071 		bdev_disable_qos_done(ctx);
9072 	}
9073 }
9074 
9075 static void
9076 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9077 		     struct spdk_io_channel *ch, void *_ctx)
9078 {
9079 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9080 	struct spdk_bdev_io *bdev_io;
9081 
9082 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
9083 
9084 	while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) {
9085 		/* Re-submit the queued I/O. */
9086 		bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io);
9087 		TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link);
9088 		_bdev_io_submit(bdev_io);
9089 	}
9090 
9091 	spdk_bdev_for_each_channel_continue(i, 0);
9092 }
9093 
9094 static void
9095 bdev_update_qos_rate_limit_msg(void *cb_arg)
9096 {
9097 	struct set_qos_limit_ctx *ctx = cb_arg;
9098 	struct spdk_bdev *bdev = ctx->bdev;
9099 
9100 	spdk_spin_lock(&bdev->internal.spinlock);
9101 	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
9102 	spdk_spin_unlock(&bdev->internal.spinlock);
9103 
9104 	bdev_set_qos_limit_done(ctx, 0);
9105 }
9106 
9107 static void
9108 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9109 		    struct spdk_io_channel *ch, void *_ctx)
9110 {
9111 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9112 
9113 	spdk_spin_lock(&bdev->internal.spinlock);
9114 	bdev_enable_qos(bdev, bdev_ch);
9115 	spdk_spin_unlock(&bdev->internal.spinlock);
9116 	spdk_bdev_for_each_channel_continue(i, 0);
9117 }
9118 
9119 static void
9120 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
9121 {
9122 	struct set_qos_limit_ctx *ctx = _ctx;
9123 
9124 	bdev_set_qos_limit_done(ctx, status);
9125 }
9126 
9127 static void
9128 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
9129 {
9130 	int i;
9131 
9132 	assert(bdev->internal.qos != NULL);
9133 
9134 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9135 		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9136 			bdev->internal.qos->rate_limits[i].limit = limits[i];
9137 
9138 			if (limits[i] == 0) {
9139 				bdev->internal.qos->rate_limits[i].limit =
9140 					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
9141 			}
9142 		}
9143 	}
9144 }
9145 
9146 void
9147 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
9148 			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
9149 {
9150 	struct set_qos_limit_ctx	*ctx;
9151 	uint32_t			limit_set_complement;
9152 	uint64_t			min_limit_per_sec;
9153 	int				i;
9154 	bool				disable_rate_limit = true;
9155 
9156 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9157 		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9158 			continue;
9159 		}
9160 
9161 		if (limits[i] > 0) {
9162 			disable_rate_limit = false;
9163 		}
9164 
9165 		if (bdev_qos_is_iops_rate_limit(i) == true) {
9166 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
9167 		} else {
9168 			/* Change from megabyte to byte rate limit */
9169 			limits[i] = limits[i] * 1024 * 1024;
9170 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
9171 		}
9172 
9173 		limit_set_complement = limits[i] % min_limit_per_sec;
9174 		if (limit_set_complement) {
9175 			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
9176 				    limits[i], min_limit_per_sec);
9177 			limits[i] += min_limit_per_sec - limit_set_complement;
9178 			SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
9179 		}
9180 	}
9181 
9182 	ctx = calloc(1, sizeof(*ctx));
9183 	if (ctx == NULL) {
9184 		cb_fn(cb_arg, -ENOMEM);
9185 		return;
9186 	}
9187 
9188 	ctx->cb_fn = cb_fn;
9189 	ctx->cb_arg = cb_arg;
9190 	ctx->bdev = bdev;
9191 
9192 	spdk_spin_lock(&bdev->internal.spinlock);
9193 	if (bdev->internal.qos_mod_in_progress) {
9194 		spdk_spin_unlock(&bdev->internal.spinlock);
9195 		free(ctx);
9196 		cb_fn(cb_arg, -EAGAIN);
9197 		return;
9198 	}
9199 	bdev->internal.qos_mod_in_progress = true;
9200 
9201 	if (disable_rate_limit == true && bdev->internal.qos) {
9202 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9203 			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
9204 			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
9205 			     bdev->internal.qos->rate_limits[i].limit !=
9206 			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
9207 				disable_rate_limit = false;
9208 				break;
9209 			}
9210 		}
9211 	}
9212 
9213 	if (disable_rate_limit == false) {
9214 		if (bdev->internal.qos == NULL) {
9215 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
9216 			if (!bdev->internal.qos) {
9217 				spdk_spin_unlock(&bdev->internal.spinlock);
9218 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
9219 				bdev_set_qos_limit_done(ctx, -ENOMEM);
9220 				return;
9221 			}
9222 		}
9223 
9224 		if (bdev->internal.qos->thread == NULL) {
9225 			/* Enabling */
9226 			bdev_set_qos_rate_limits(bdev, limits);
9227 
9228 			spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
9229 						   bdev_enable_qos_done);
9230 		} else {
9231 			/* Updating */
9232 			bdev_set_qos_rate_limits(bdev, limits);
9233 
9234 			spdk_thread_send_msg(bdev->internal.qos->thread,
9235 					     bdev_update_qos_rate_limit_msg, ctx);
9236 		}
9237 	} else {
9238 		if (bdev->internal.qos != NULL) {
9239 			bdev_set_qos_rate_limits(bdev, limits);
9240 
9241 			/* Disabling */
9242 			spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
9243 						   bdev_disable_qos_msg_done);
9244 		} else {
9245 			spdk_spin_unlock(&bdev->internal.spinlock);
9246 			bdev_set_qos_limit_done(ctx, 0);
9247 			return;
9248 		}
9249 	}
9250 
9251 	spdk_spin_unlock(&bdev->internal.spinlock);
9252 }
9253 
9254 struct spdk_bdev_histogram_ctx {
9255 	spdk_bdev_histogram_status_cb cb_fn;
9256 	void *cb_arg;
9257 	struct spdk_bdev *bdev;
9258 	int status;
9259 };
9260 
9261 static void
9262 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9263 {
9264 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
9265 
9266 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
9267 	ctx->bdev->internal.histogram_in_progress = false;
9268 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9269 	ctx->cb_fn(ctx->cb_arg, ctx->status);
9270 	free(ctx);
9271 }
9272 
9273 static void
9274 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9275 			       struct spdk_io_channel *_ch, void *_ctx)
9276 {
9277 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9278 
9279 	if (ch->histogram != NULL) {
9280 		spdk_histogram_data_free(ch->histogram);
9281 		ch->histogram = NULL;
9282 	}
9283 	spdk_bdev_for_each_channel_continue(i, 0);
9284 }
9285 
9286 static void
9287 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9288 {
9289 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
9290 
9291 	if (status != 0) {
9292 		ctx->status = status;
9293 		ctx->bdev->internal.histogram_enabled = false;
9294 		spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
9295 					   bdev_histogram_disable_channel_cb);
9296 	} else {
9297 		spdk_spin_lock(&ctx->bdev->internal.spinlock);
9298 		ctx->bdev->internal.histogram_in_progress = false;
9299 		spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9300 		ctx->cb_fn(ctx->cb_arg, ctx->status);
9301 		free(ctx);
9302 	}
9303 }
9304 
9305 static void
9306 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9307 			      struct spdk_io_channel *_ch, void *_ctx)
9308 {
9309 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9310 	int status = 0;
9311 
9312 	if (ch->histogram == NULL) {
9313 		ch->histogram = spdk_histogram_data_alloc();
9314 		if (ch->histogram == NULL) {
9315 			status = -ENOMEM;
9316 		}
9317 	}
9318 
9319 	spdk_bdev_for_each_channel_continue(i, status);
9320 }
9321 
9322 void
9323 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
9324 			   void *cb_arg, bool enable)
9325 {
9326 	struct spdk_bdev_histogram_ctx *ctx;
9327 
9328 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
9329 	if (ctx == NULL) {
9330 		cb_fn(cb_arg, -ENOMEM);
9331 		return;
9332 	}
9333 
9334 	ctx->bdev = bdev;
9335 	ctx->status = 0;
9336 	ctx->cb_fn = cb_fn;
9337 	ctx->cb_arg = cb_arg;
9338 
9339 	spdk_spin_lock(&bdev->internal.spinlock);
9340 	if (bdev->internal.histogram_in_progress) {
9341 		spdk_spin_unlock(&bdev->internal.spinlock);
9342 		free(ctx);
9343 		cb_fn(cb_arg, -EAGAIN);
9344 		return;
9345 	}
9346 
9347 	bdev->internal.histogram_in_progress = true;
9348 	spdk_spin_unlock(&bdev->internal.spinlock);
9349 
9350 	bdev->internal.histogram_enabled = enable;
9351 
9352 	if (enable) {
9353 		/* Allocate histogram for each channel */
9354 		spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
9355 					   bdev_histogram_enable_channel_cb);
9356 	} else {
9357 		spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
9358 					   bdev_histogram_disable_channel_cb);
9359 	}
9360 }
9361 
9362 struct spdk_bdev_histogram_data_ctx {
9363 	spdk_bdev_histogram_data_cb cb_fn;
9364 	void *cb_arg;
9365 	struct spdk_bdev *bdev;
9366 	/** merged histogram data from all channels */
9367 	struct spdk_histogram_data	*histogram;
9368 };
9369 
9370 static void
9371 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9372 {
9373 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
9374 
9375 	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
9376 	free(ctx);
9377 }
9378 
9379 static void
9380 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9381 			   struct spdk_io_channel *_ch, void *_ctx)
9382 {
9383 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9384 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
9385 	int status = 0;
9386 
9387 	if (ch->histogram == NULL) {
9388 		status = -EFAULT;
9389 	} else {
9390 		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
9391 	}
9392 
9393 	spdk_bdev_for_each_channel_continue(i, status);
9394 }
9395 
9396 void
9397 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
9398 			spdk_bdev_histogram_data_cb cb_fn,
9399 			void *cb_arg)
9400 {
9401 	struct spdk_bdev_histogram_data_ctx *ctx;
9402 
9403 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
9404 	if (ctx == NULL) {
9405 		cb_fn(cb_arg, -ENOMEM, NULL);
9406 		return;
9407 	}
9408 
9409 	ctx->bdev = bdev;
9410 	ctx->cb_fn = cb_fn;
9411 	ctx->cb_arg = cb_arg;
9412 
9413 	ctx->histogram = histogram;
9414 
9415 	spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
9416 				   bdev_histogram_get_channel_cb);
9417 }
9418 
9419 void
9420 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
9421 				void *cb_arg)
9422 {
9423 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9424 	int status = 0;
9425 
9426 	assert(cb_fn != NULL);
9427 
9428 	if (bdev_ch->histogram == NULL) {
9429 		status = -EFAULT;
9430 	}
9431 	cb_fn(cb_arg, status, bdev_ch->histogram);
9432 }
9433 
9434 size_t
9435 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
9436 			   size_t max_events)
9437 {
9438 	struct media_event_entry *entry;
9439 	size_t num_events = 0;
9440 
9441 	for (; num_events < max_events; ++num_events) {
9442 		entry = TAILQ_FIRST(&desc->pending_media_events);
9443 		if (entry == NULL) {
9444 			break;
9445 		}
9446 
9447 		events[num_events] = entry->event;
9448 		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
9449 		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
9450 	}
9451 
9452 	return num_events;
9453 }
9454 
9455 int
9456 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
9457 			    size_t num_events)
9458 {
9459 	struct spdk_bdev_desc *desc;
9460 	struct media_event_entry *entry;
9461 	size_t event_id;
9462 	int rc = 0;
9463 
9464 	assert(bdev->media_events);
9465 
9466 	spdk_spin_lock(&bdev->internal.spinlock);
9467 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9468 		if (desc->write) {
9469 			break;
9470 		}
9471 	}
9472 
9473 	if (desc == NULL || desc->media_events_buffer == NULL) {
9474 		rc = -ENODEV;
9475 		goto out;
9476 	}
9477 
9478 	for (event_id = 0; event_id < num_events; ++event_id) {
9479 		entry = TAILQ_FIRST(&desc->free_media_events);
9480 		if (entry == NULL) {
9481 			break;
9482 		}
9483 
9484 		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
9485 		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
9486 		entry->event = events[event_id];
9487 	}
9488 
9489 	rc = event_id;
9490 out:
9491 	spdk_spin_unlock(&bdev->internal.spinlock);
9492 	return rc;
9493 }
9494 
9495 static void
9496 _media_management_notify(void *arg)
9497 {
9498 	struct spdk_bdev_desc *desc = arg;
9499 
9500 	_event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
9501 }
9502 
9503 void
9504 spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
9505 {
9506 	struct spdk_bdev_desc *desc;
9507 
9508 	spdk_spin_lock(&bdev->internal.spinlock);
9509 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9510 		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
9511 			event_notify(desc, _media_management_notify);
9512 		}
9513 	}
9514 	spdk_spin_unlock(&bdev->internal.spinlock);
9515 }
9516 
9517 struct locked_lba_range_ctx {
9518 	struct lba_range		range;
9519 	struct lba_range		*current_range;
9520 	struct lba_range		*owner_range;
9521 	struct spdk_poller		*poller;
9522 	lock_range_cb			cb_fn;
9523 	void				*cb_arg;
9524 };
9525 
9526 static void
9527 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9528 {
9529 	struct locked_lba_range_ctx *ctx = _ctx;
9530 
9531 	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
9532 	free(ctx);
9533 }
9534 
9535 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
9536 		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
9537 
9538 static void
9539 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9540 {
9541 	struct locked_lba_range_ctx *ctx = _ctx;
9542 
9543 	if (status == -ENOMEM) {
9544 		/* One of the channels could not allocate a range object.
9545 		 * So we have to go back and clean up any ranges that were
9546 		 * allocated successfully before we return error status to
9547 		 * the caller.  We can reuse the unlock function to do that
9548 		 * clean up.
9549 		 */
9550 		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9551 					   bdev_lock_error_cleanup_cb);
9552 		return;
9553 	}
9554 
9555 	/* All channels have locked this range and no I/O overlapping the range
9556 	 * are outstanding!  Set the owner_ch for the range object for the
9557 	 * locking channel, so that this channel will know that it is allowed
9558 	 * to write to this range.
9559 	 */
9560 	if (ctx->owner_range != NULL) {
9561 		ctx->owner_range->owner_ch = ctx->range.owner_ch;
9562 	}
9563 
9564 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9565 
9566 	/* Don't free the ctx here.  Its range is in the bdev's global list of
9567 	 * locked ranges still, and will be removed and freed when this range
9568 	 * is later unlocked.
9569 	 */
9570 }
9571 
9572 static int
9573 bdev_lock_lba_range_check_io(void *_i)
9574 {
9575 	struct spdk_bdev_channel_iter *i = _i;
9576 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
9577 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9578 	struct locked_lba_range_ctx *ctx = i->ctx;
9579 	struct lba_range *range = ctx->current_range;
9580 	struct spdk_bdev_io *bdev_io;
9581 
9582 	spdk_poller_unregister(&ctx->poller);
9583 
9584 	/* The range is now in the locked_ranges, so no new IO can be submitted to this
9585 	 * range.  But we need to wait until any outstanding IO overlapping with this range
9586 	 * are completed.
9587 	 */
9588 	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
9589 		if (bdev_io_range_is_locked(bdev_io, range)) {
9590 			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
9591 			return SPDK_POLLER_BUSY;
9592 		}
9593 	}
9594 
9595 	spdk_bdev_for_each_channel_continue(i, 0);
9596 	return SPDK_POLLER_BUSY;
9597 }
9598 
9599 static void
9600 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9601 				struct spdk_io_channel *_ch, void *_ctx)
9602 {
9603 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9604 	struct locked_lba_range_ctx *ctx = _ctx;
9605 	struct lba_range *range;
9606 
9607 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9608 		if (range->length == ctx->range.length &&
9609 		    range->offset == ctx->range.offset &&
9610 		    range->locked_ctx == ctx->range.locked_ctx) {
9611 			/* This range already exists on this channel, so don't add
9612 			 * it again.  This can happen when a new channel is created
9613 			 * while the for_each_channel operation is in progress.
9614 			 * Do not check for outstanding I/O in that case, since the
9615 			 * range was locked before any I/O could be submitted to the
9616 			 * new channel.
9617 			 */
9618 			spdk_bdev_for_each_channel_continue(i, 0);
9619 			return;
9620 		}
9621 	}
9622 
9623 	range = calloc(1, sizeof(*range));
9624 	if (range == NULL) {
9625 		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
9626 		return;
9627 	}
9628 
9629 	range->length = ctx->range.length;
9630 	range->offset = ctx->range.offset;
9631 	range->locked_ctx = ctx->range.locked_ctx;
9632 	range->quiesce = ctx->range.quiesce;
9633 	ctx->current_range = range;
9634 	if (ctx->range.owner_ch == ch) {
9635 		/* This is the range object for the channel that will hold
9636 		 * the lock.  Store it in the ctx object so that we can easily
9637 		 * set its owner_ch after the lock is finally acquired.
9638 		 */
9639 		ctx->owner_range = range;
9640 	}
9641 	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
9642 	bdev_lock_lba_range_check_io(i);
9643 }
9644 
9645 static void
9646 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
9647 {
9648 	assert(spdk_get_thread() == ctx->range.owner_thread);
9649 	assert(ctx->range.owner_ch == NULL ||
9650 	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);
9651 
9652 	/* We will add a copy of this range to each channel now. */
9653 	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
9654 				   bdev_lock_lba_range_cb);
9655 }
9656 
9657 static bool
9658 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
9659 {
9660 	struct lba_range *r;
9661 
9662 	TAILQ_FOREACH(r, tailq, tailq) {
9663 		if (bdev_lba_range_overlapped(range, r)) {
9664 			return true;
9665 		}
9666 	}
9667 	return false;
9668 }
9669 
9670 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status);
9671 
9672 static int
9673 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
9674 		     uint64_t offset, uint64_t length,
9675 		     lock_range_cb cb_fn, void *cb_arg)
9676 {
9677 	struct locked_lba_range_ctx *ctx;
9678 
9679 	ctx = calloc(1, sizeof(*ctx));
9680 	if (ctx == NULL) {
9681 		return -ENOMEM;
9682 	}
9683 
9684 	ctx->range.offset = offset;
9685 	ctx->range.length = length;
9686 	ctx->range.owner_thread = spdk_get_thread();
9687 	ctx->range.owner_ch = ch;
9688 	ctx->range.locked_ctx = cb_arg;
9689 	ctx->range.bdev = bdev;
9690 	ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked);
9691 	ctx->cb_fn = cb_fn;
9692 	ctx->cb_arg = cb_arg;
9693 
9694 	spdk_spin_lock(&bdev->internal.spinlock);
9695 	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
9696 		/* There is an active lock overlapping with this range.
9697 		 * Put it on the pending list until this range no
9698 		 * longer overlaps with another.
9699 		 */
9700 		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
9701 	} else {
9702 		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
9703 		bdev_lock_lba_range_ctx(bdev, ctx);
9704 	}
9705 	spdk_spin_unlock(&bdev->internal.spinlock);
9706 	return 0;
9707 }
9708 
9709 static int
9710 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
9711 		    uint64_t offset, uint64_t length,
9712 		    lock_range_cb cb_fn, void *cb_arg)
9713 {
9714 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9715 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9716 
9717 	if (cb_arg == NULL) {
9718 		SPDK_ERRLOG("cb_arg must not be NULL\n");
9719 		return -EINVAL;
9720 	}
9721 
9722 	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
9723 }
9724 
9725 static void
9726 bdev_lock_lba_range_ctx_msg(void *_ctx)
9727 {
9728 	struct locked_lba_range_ctx *ctx = _ctx;
9729 
9730 	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
9731 }
9732 
9733 static void
9734 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9735 {
9736 	struct locked_lba_range_ctx *ctx = _ctx;
9737 	struct locked_lba_range_ctx *pending_ctx;
9738 	struct lba_range *range, *tmp;
9739 
9740 	spdk_spin_lock(&bdev->internal.spinlock);
9741 	/* Check if there are any pending locked ranges that overlap with this range
9742 	 * that was just unlocked.  If there are, check that it doesn't overlap with any
9743 	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
9744 	 * the lock process.
9745 	 */
9746 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
9747 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
9748 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
9749 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
9750 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9751 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
9752 			spdk_thread_send_msg(pending_ctx->range.owner_thread,
9753 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
9754 		}
9755 	}
9756 	spdk_spin_unlock(&bdev->internal.spinlock);
9757 
9758 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9759 	free(ctx);
9760 }
9761 
9762 static void
9763 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9764 				  struct spdk_io_channel *_ch, void *_ctx)
9765 {
9766 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9767 	struct locked_lba_range_ctx *ctx = _ctx;
9768 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
9769 	struct spdk_bdev_io *bdev_io;
9770 	struct lba_range *range;
9771 
9772 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9773 		if (ctx->range.offset == range->offset &&
9774 		    ctx->range.length == range->length &&
9775 		    ctx->range.locked_ctx == range->locked_ctx) {
9776 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
9777 			free(range);
9778 			break;
9779 		}
9780 	}
9781 
9782 	/* Note: we should almost always be able to assert that the range specified
9783 	 * was found.  But there are some very rare corner cases where a new channel
9784 	 * gets created simultaneously with a range unlock, where this function
9785 	 * would execute on that new channel and wouldn't have the range.
9786 	 * We also use this to clean up range allocations when a later allocation
9787 	 * fails in the locking path.
9788 	 * So we can't actually assert() here.
9789 	 */
9790 
9791 	/* Swap the locked IO into a temporary list, and then try to submit them again.
9792 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
9793 	 * with the range that was just unlocked, but this isn't a performance path so
9794 	 * we go for simplicity here.
9795 	 */
9796 	TAILQ_INIT(&io_locked);
9797 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
9798 	while (!TAILQ_EMPTY(&io_locked)) {
9799 		bdev_io = TAILQ_FIRST(&io_locked);
9800 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
9801 		bdev_io_submit(bdev_io);
9802 	}
9803 
9804 	spdk_bdev_for_each_channel_continue(i, 0);
9805 }
9806 
9807 static int
9808 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
9809 		       lock_range_cb cb_fn, void *cb_arg)
9810 {
9811 	struct locked_lba_range_ctx *ctx;
9812 	struct lba_range *range;
9813 
9814 	spdk_spin_lock(&bdev->internal.spinlock);
9815 	/* To start the unlock the process, we find the range in the bdev's locked_ranges
9816 	 * and remove it. This ensures new channels don't inherit the locked range.
9817 	 * Then we will send a message to each channel to remove the range from its
9818 	 * per-channel list.
9819 	 */
9820 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
9821 		if (range->offset == offset && range->length == length &&
9822 		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
9823 			break;
9824 		}
9825 	}
9826 	if (range == NULL) {
9827 		assert(false);
9828 		spdk_spin_unlock(&bdev->internal.spinlock);
9829 		return -EINVAL;
9830 	}
9831 	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
9832 	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9833 	spdk_spin_unlock(&bdev->internal.spinlock);
9834 
9835 	ctx->cb_fn = cb_fn;
9836 	ctx->cb_arg = cb_arg;
9837 
9838 	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9839 				   bdev_unlock_lba_range_cb);
9840 	return 0;
9841 }
9842 
9843 static int
9844 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
9845 		      uint64_t offset, uint64_t length,
9846 		      lock_range_cb cb_fn, void *cb_arg)
9847 {
9848 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9849 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9850 	struct lba_range *range;
9851 	bool range_found = false;
9852 
9853 	/* Let's make sure the specified channel actually has a lock on
9854 	 * the specified range.  Note that the range must match exactly.
9855 	 */
9856 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9857 		if (range->offset == offset && range->length == length &&
9858 		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
9859 			range_found = true;
9860 			break;
9861 		}
9862 	}
9863 
9864 	if (!range_found) {
9865 		return -EINVAL;
9866 	}
9867 
9868 	return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
9869 }
9870 
9871 struct bdev_quiesce_ctx {
9872 	spdk_bdev_quiesce_cb cb_fn;
9873 	void *cb_arg;
9874 };
9875 
9876 static void
9877 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
9878 {
9879 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
9880 
9881 	if (quiesce_ctx->cb_fn != NULL) {
9882 		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9883 	}
9884 
9885 	free(quiesce_ctx);
9886 }
9887 
9888 static void
9889 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
9890 {
9891 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
9892 	struct spdk_bdev_module *module = range->bdev->module;
9893 
9894 	if (status != 0) {
9895 		if (quiesce_ctx->cb_fn != NULL) {
9896 			quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9897 		}
9898 		free(quiesce_ctx);
9899 		return;
9900 	}
9901 
9902 	spdk_spin_lock(&module->internal.spinlock);
9903 	TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
9904 	spdk_spin_unlock(&module->internal.spinlock);
9905 
9906 	if (quiesce_ctx->cb_fn != NULL) {
9907 		/* copy the context in case the range is unlocked by the callback */
9908 		struct bdev_quiesce_ctx tmp = *quiesce_ctx;
9909 
9910 		quiesce_ctx->cb_fn = NULL;
9911 		quiesce_ctx->cb_arg = NULL;
9912 
9913 		tmp.cb_fn(tmp.cb_arg, status);
9914 	}
9915 	/* quiesce_ctx will be freed on unquiesce */
9916 }
9917 
9918 static int
9919 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9920 		   uint64_t offset, uint64_t length,
9921 		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
9922 		   bool unquiesce)
9923 {
9924 	struct bdev_quiesce_ctx *quiesce_ctx;
9925 	int rc;
9926 
9927 	if (module != bdev->module) {
9928 		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
9929 		return -EINVAL;
9930 	}
9931 
9932 	if (!bdev_io_valid_blocks(bdev, offset, length)) {
9933 		return -EINVAL;
9934 	}
9935 
9936 	if (unquiesce) {
9937 		struct lba_range *range;
9938 
9939 		/* Make sure the specified range is actually quiesced in the specified module and
9940 		 * then remove it from the list. Note that the range must match exactly.
9941 		 */
9942 		spdk_spin_lock(&module->internal.spinlock);
9943 		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
9944 			if (range->bdev == bdev && range->offset == offset && range->length == length) {
9945 				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
9946 				break;
9947 			}
9948 		}
9949 		spdk_spin_unlock(&module->internal.spinlock);
9950 
9951 		if (range == NULL) {
9952 			SPDK_ERRLOG("The range to unquiesce was not found.\n");
9953 			return -EINVAL;
9954 		}
9955 
9956 		quiesce_ctx = range->locked_ctx;
9957 		quiesce_ctx->cb_fn = cb_fn;
9958 		quiesce_ctx->cb_arg = cb_arg;
9959 
9960 		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
9961 	} else {
9962 		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
9963 		if (quiesce_ctx == NULL) {
9964 			return -ENOMEM;
9965 		}
9966 
9967 		quiesce_ctx->cb_fn = cb_fn;
9968 		quiesce_ctx->cb_arg = cb_arg;
9969 
9970 		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
9971 		if (rc != 0) {
9972 			free(quiesce_ctx);
9973 		}
9974 	}
9975 
9976 	return rc;
9977 }
9978 
9979 int
9980 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9981 		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9982 {
9983 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
9984 }
9985 
9986 int
9987 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9988 		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9989 {
9990 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
9991 }
9992 
9993 int
9994 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9995 			uint64_t offset, uint64_t length,
9996 			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9997 {
9998 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
9999 }
10000 
10001 int
10002 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10003 			  uint64_t offset, uint64_t length,
10004 			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10005 {
10006 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
10007 }
10008 
10009 int
10010 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
10011 			     int array_size)
10012 {
10013 	if (!bdev) {
10014 		return -EINVAL;
10015 	}
10016 
10017 	if (bdev->fn_table->get_memory_domains) {
10018 		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
10019 	}
10020 
10021 	return 0;
10022 }
10023 
10024 struct spdk_bdev_for_each_io_ctx {
10025 	void *ctx;
10026 	spdk_bdev_io_fn fn;
10027 	spdk_bdev_for_each_io_cb cb;
10028 };
10029 
10030 static void
10031 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10032 			 struct spdk_io_channel *io_ch, void *_ctx)
10033 {
10034 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10035 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
10036 	struct spdk_bdev_io *bdev_io;
10037 	int rc = 0;
10038 
10039 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
10040 		rc = ctx->fn(ctx->ctx, bdev_io);
10041 		if (rc != 0) {
10042 			break;
10043 		}
10044 	}
10045 
10046 	spdk_bdev_for_each_channel_continue(i, rc);
10047 }
10048 
10049 static void
10050 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
10051 {
10052 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10053 
10054 	ctx->cb(ctx->ctx, status);
10055 
10056 	free(ctx);
10057 }
10058 
10059 void
10060 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
10061 			   spdk_bdev_for_each_io_cb cb)
10062 {
10063 	struct spdk_bdev_for_each_io_ctx *ctx;
10064 
10065 	assert(fn != NULL && cb != NULL);
10066 
10067 	ctx = calloc(1, sizeof(*ctx));
10068 	if (ctx == NULL) {
10069 		SPDK_ERRLOG("Failed to allocate context.\n");
10070 		cb(_ctx, -ENOMEM);
10071 		return;
10072 	}
10073 
10074 	ctx->ctx = _ctx;
10075 	ctx->fn = fn;
10076 	ctx->cb = cb;
10077 
10078 	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
10079 				   bdev_for_each_io_done);
10080 }
10081 
10082 void
10083 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
10084 {
10085 	spdk_for_each_channel_continue(iter->i, status);
10086 }
10087 
10088 static struct spdk_bdev *
10089 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
10090 {
10091 	void *io_device = spdk_io_channel_iter_get_io_device(i);
10092 
10093 	return __bdev_from_io_dev(io_device);
10094 }
10095 
10096 static void
10097 bdev_each_channel_msg(struct spdk_io_channel_iter *i)
10098 {
10099 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10100 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10101 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
10102 
10103 	iter->i = i;
10104 	iter->fn(iter, bdev, ch, iter->ctx);
10105 }
10106 
10107 static void
10108 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
10109 {
10110 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10111 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10112 
10113 	iter->i = i;
10114 	iter->cpl(bdev, iter->ctx, status);
10115 
10116 	free(iter);
10117 }
10118 
10119 void
10120 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
10121 			   void *ctx, spdk_bdev_for_each_channel_done cpl)
10122 {
10123 	struct spdk_bdev_channel_iter *iter;
10124 
10125 	assert(bdev != NULL && fn != NULL && ctx != NULL);
10126 
10127 	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
10128 	if (iter == NULL) {
10129 		SPDK_ERRLOG("Unable to allocate iterator\n");
10130 		assert(false);
10131 		return;
10132 	}
10133 
10134 	iter->fn = fn;
10135 	iter->cpl = cpl;
10136 	iter->ctx = ctx;
10137 
10138 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
10139 			      iter, bdev_each_channel_cpl);
10140 }
10141 
10142 static void
10143 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10144 {
10145 	struct spdk_bdev_io *parent_io = cb_arg;
10146 
10147 	spdk_bdev_free_io(bdev_io);
10148 
10149 	/* Check return status of write */
10150 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
10151 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
10152 }
10153 
10154 static void
10155 bdev_copy_do_write(void *_bdev_io)
10156 {
10157 	struct spdk_bdev_io *bdev_io = _bdev_io;
10158 	int rc;
10159 
10160 	/* Write blocks */
10161 	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
10162 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
10163 					    bdev_io->u.bdev.iovs[0].iov_base,
10164 					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
10165 					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);
10166 
10167 	if (rc == -ENOMEM) {
10168 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
10169 	} else if (rc != 0) {
10170 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10171 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10172 	}
10173 }
10174 
10175 static void
10176 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10177 {
10178 	struct spdk_bdev_io *parent_io = cb_arg;
10179 
10180 	spdk_bdev_free_io(bdev_io);
10181 
10182 	/* Check return status of read */
10183 	if (!success) {
10184 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10185 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
10186 		return;
10187 	}
10188 
10189 	/* Do write */
10190 	bdev_copy_do_write(parent_io);
10191 }
10192 
10193 static void
10194 bdev_copy_do_read(void *_bdev_io)
10195 {
10196 	struct spdk_bdev_io *bdev_io = _bdev_io;
10197 	int rc;
10198 
10199 	/* Read blocks */
10200 	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
10201 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
10202 					   bdev_io->u.bdev.iovs[0].iov_base,
10203 					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
10204 					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);
10205 
10206 	if (rc == -ENOMEM) {
10207 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
10208 	} else if (rc != 0) {
10209 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10210 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10211 	}
10212 }
10213 
10214 static void
10215 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
10216 {
10217 	if (!success) {
10218 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10219 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10220 		return;
10221 	}
10222 
10223 	bdev_copy_do_read(bdev_io);
10224 }
10225 
10226 int
10227 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
10228 		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
10229 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
10230 {
10231 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10232 	struct spdk_bdev_io *bdev_io;
10233 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
10234 
10235 	if (!desc->write) {
10236 		return -EBADF;
10237 	}
10238 
10239 	if (num_blocks == 0) {
10240 		SPDK_ERRLOG("Can't copy 0 blocks\n");
10241 		return -EINVAL;
10242 	}
10243 
10244 	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
10245 	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
10246 		SPDK_DEBUGLOG(bdev,
10247 			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
10248 			      dst_offset_blocks, src_offset_blocks, num_blocks);
10249 		return -EINVAL;
10250 	}
10251 
10252 	bdev_io = bdev_channel_get_io(channel);
10253 	if (!bdev_io) {
10254 		return -ENOMEM;
10255 	}
10256 
10257 	bdev_io->internal.ch = channel;
10258 	bdev_io->internal.desc = desc;
10259 	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
10260 
10261 	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
10262 	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
10263 	bdev_io->u.bdev.num_blocks = num_blocks;
10264 	bdev_io->u.bdev.memory_domain = NULL;
10265 	bdev_io->u.bdev.memory_domain_ctx = NULL;
10266 	bdev_io->u.bdev.iovs = NULL;
10267 	bdev_io->u.bdev.iovcnt = 0;
10268 	bdev_io->u.bdev.md_buf = NULL;
10269 	bdev_io->u.bdev.accel_sequence = NULL;
10270 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
10271 
10272 	if (dst_offset_blocks == src_offset_blocks) {
10273 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
10274 		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
10275 
10276 		return 0;
10277 	}
10278 
10279 
10280 	/* If the copy size is large and should be split, use the generic split logic
10281 	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
10282 	 *
10283 	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
10284 	 * emulate it using regular read and write requests otherwise.
10285 	 */
10286 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
10287 	    bdev_io->internal.split) {
10288 		bdev_io_submit(bdev_io);
10289 		return 0;
10290 	}
10291 
10292 	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));
10293 
10294 	return 0;
10295 }
10296 
10297 SPDK_LOG_REGISTER_COMPONENT(bdev)
10298 
10299 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
10300 {
10301 	struct spdk_trace_tpoint_opts opts[] = {
10302 		{
10303 			"BDEV_IO_START", TRACE_BDEV_IO_START,
10304 			OWNER_BDEV, OBJECT_BDEV_IO, 1,
10305 			{
10306 				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
10307 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
10308 				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
10309 				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
10310 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
10311 			}
10312 		},
10313 		{
10314 			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
10315 			OWNER_BDEV, OBJECT_BDEV_IO, 0,
10316 			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
10317 		},
10318 		{
10319 			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
10320 			OWNER_BDEV, OBJECT_NONE, 1,
10321 			{
10322 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
10323 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
10324 			}
10325 		},
10326 		{
10327 			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
10328 			OWNER_BDEV, OBJECT_NONE, 0,
10329 			{
10330 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
10331 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
10332 			}
10333 		},
10334 	};
10335 
10336 
10337 	spdk_trace_register_owner(OWNER_BDEV, 'b');
10338 	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
10339 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
10340 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
10341 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
10342 }
10343