xref: /spdk/lib/bdev/bdev.c (revision 63e0c25dad5f2793fdb9ff9b1e6ce516673dc6aa)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 
11 #include "spdk/accel.h"
12 #include "spdk/config.h"
13 #include "spdk/env.h"
14 #include "spdk/thread.h"
15 #include "spdk/likely.h"
16 #include "spdk/queue.h"
17 #include "spdk/nvme_spec.h"
18 #include "spdk/scsi_spec.h"
19 #include "spdk/notify.h"
20 #include "spdk/util.h"
21 #include "spdk/trace.h"
22 #include "spdk/dma.h"
23 
24 #include "spdk/bdev_module.h"
25 #include "spdk/log.h"
26 #include "spdk/string.h"
27 
28 #include "bdev_internal.h"
29 #include "spdk_internal/trace_defs.h"
30 #include "spdk_internal/assert.h"
31 
32 #ifdef SPDK_CONFIG_VTUNE
33 #include "ittnotify.h"
34 #include "ittnotify_types.h"
35 int __itt_init_ittlib(const char *, __itt_group_id);
36 #endif
37 
38 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
39 #define SPDK_BDEV_IO_CACHE_SIZE			256
40 #define SPDK_BDEV_AUTO_EXAMINE			true
41 #define BUF_SMALL_CACHE_SIZE			128
42 #define BUF_LARGE_CACHE_SIZE			16
43 #define NOMEM_THRESHOLD_COUNT			8
44 
45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
53 
54 /* The maximum number of child requests that are submitted at a time when
55  * splitting an UNMAP or WRITE ZEROES command.
56  */
57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
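/* How often (in microseconds) a reset re-checks whether the I/O still outstanding
 * on a channel have drained before the reset completes.
 */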
58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
59 
60 /* The maximum number of child requests that are submitted at a time when
61  * splitting a COPY command.
62  */
63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
64 
65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
66 	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
67 #ifdef DEBUG
68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
69 	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
70 #else
71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while (0)
72 #endif
73 
74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
75 				const char *detail, struct spdk_bdev *bdev);
76 
77 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
78 				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
79 				    };
80 
81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
82 
83 RB_HEAD(bdev_name_tree, spdk_bdev_name);
84 
85 static int
86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
87 {
88 	return strcmp(name1->name, name2->name);
89 }
90 
91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
92 
93 struct spdk_bdev_mgr {
94 	struct spdk_mempool *bdev_io_pool;
95 
96 	void *zero_buffer;
97 
98 	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
99 
100 	struct spdk_bdev_list bdevs;
101 	struct bdev_name_tree bdev_names;
102 
103 	bool init_complete;
104 	bool module_init_complete;
105 
106 	struct spdk_spinlock spinlock;
107 
108 	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;
109 
110 #ifdef SPDK_CONFIG_VTUNE
111 	__itt_domain	*domain;
112 #endif
113 };
114 
115 static struct spdk_bdev_mgr g_bdev_mgr = {
116 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
117 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
118 	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
119 	.init_complete = false,
120 	.module_init_complete = false,
121 	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
122 };
123 
124 static void
125 __attribute__((constructor))
126 _bdev_init(void)
127 {
128 	spdk_spin_init(&g_bdev_mgr.spinlock);
129 }
130 
131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);
132 
133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
134 
135 struct lba_range {
136 	struct spdk_bdev		*bdev;
137 	uint64_t			offset;
138 	uint64_t			length;
139 	bool				quiesce;
140 	void				*locked_ctx;
141 	struct spdk_thread		*owner_thread;
142 	struct spdk_bdev_channel	*owner_ch;
143 	TAILQ_ENTRY(lba_range)		tailq;
144 	TAILQ_ENTRY(lba_range)		tailq_module;
145 };
146 
147 static struct spdk_bdev_opts	g_bdev_opts = {
148 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
149 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
150 	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
151 	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
152 	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
153 };
154 
155 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
156 static void			*g_init_cb_arg = NULL;
157 
158 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
159 static void			*g_fini_cb_arg = NULL;
160 static struct spdk_thread	*g_fini_thread = NULL;
161 
162 struct spdk_bdev_qos_limit {
163 	/** IOs or bytes allowed per second (i.e., 1s). */
164 	uint64_t limit;
165 
166 	/** Remaining IOs or bytes allowed in the current timeslice (e.g., 1ms).
167 	 *  For byte limits, this may go negative: if an I/O is submitted while some
168 	 *  bytes remain but the I/O is larger than that amount, it is still allowed
169 	 *  and the excess is deducted from the next timeslice.
170 	 */
171 	int64_t remaining_this_timeslice;
172 
173 	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
174 	uint32_t min_per_timeslice;
175 
176 	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
177 	uint32_t max_per_timeslice;
178 
179 	/** Function to check whether to queue the IO.
180 	 * If the IO is allowed to pass, the quota is reduced accordingly.
181 	 */
182 	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
183 
184 	/** Function to rewind the quota when the IO was allowed to pass by this
185 	 * limit but was then queued due to one of the subsequent limits.
186 	 */
187 	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
188 };
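/* Worked example of the timeslice accounting above (derived from the constants
 * defined earlier in this file): with a limit of SPDK_BDEV_QOS_MIN_IOS_PER_SEC
 * (1000 IO/s) and a SPDK_BDEV_QOS_TIMESLICE_IN_USEC of 1000 us, each 1 ms
 * timeslice is granted 1000 * 1000 / 1000000 = 1 I/O. A byte-limited I/O larger
 * than the remaining budget may drive remaining_this_timeslice negative; the
 * deficit is then repaid out of the following timeslice(s).
 */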
189 
190 struct spdk_bdev_qos {
191 	/** Rate limits, one entry per rate limit type. */
192 	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
193 
194 	/** The channel that all I/O are funneled through. */
195 	struct spdk_bdev_channel *ch;
196 
197 	/** The thread on which the poller is running. */
198 	struct spdk_thread *thread;
199 
200 	/** Size of a timeslice in tsc ticks. */
201 	uint64_t timeslice_size;
202 
203 	/** Timestamp of start of last timeslice. */
204 	uint64_t last_timeslice;
205 
206 	/** Poller that processes queued I/O commands each time slice. */
207 	struct spdk_poller *poller;
208 };
209 
210 struct spdk_bdev_mgmt_channel {
211 	/*
212 	 * Each thread keeps a cache of bdev_io - this allows
213 	 *  bdev threads which are *not* DPDK threads to still
214 	 *  benefit from a per-thread bdev_io cache.  Without
215 	 *  this, non-DPDK threads fetching from the mempool
216 	 *  incur a cmpxchg on get and put.
217 	 */
218 	bdev_io_stailq_t per_thread_cache;
219 	uint32_t	per_thread_cache_count;
220 	uint32_t	bdev_io_cache_size;
221 
222 	struct spdk_iobuf_channel iobuf;
223 
224 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
225 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
226 };
227 
228 /*
229  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
230  * queue the IO awaiting retry here, which makes it possible to retry sending
231  * IO to one bdev after IO from another bdev completes.
232  */
233 struct spdk_bdev_shared_resource {
234 	/* The bdev management channel */
235 	struct spdk_bdev_mgmt_channel *mgmt_ch;
236 
237 	/*
238 	 * Count of I/O submitted to bdev module and waiting for completion.
239 	 * Incremented before submit_request() is called on an spdk_bdev_io.
240 	 */
241 	uint64_t		io_outstanding;
242 
243 	/*
244 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
245 	 *  on this channel.
246 	 */
247 	bdev_io_tailq_t		nomem_io;
248 
249 	/*
250 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
251 	 */
252 	uint64_t		nomem_threshold;
253 
254 	/* I/O channel allocated by a bdev module */
255 	struct spdk_io_channel	*shared_ch;
256 
257 	struct spdk_poller	*nomem_poller;
258 
259 	/* Refcount of bdev channels using this resource */
260 	uint32_t		ref;
261 
262 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
263 };
264 
265 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
266 #define BDEV_CH_QOS_ENABLED		(1 << 1)
267 
268 struct spdk_bdev_channel {
269 	struct spdk_bdev	*bdev;
270 
271 	/* The channel for the underlying device */
272 	struct spdk_io_channel	*channel;
273 
274 	/* Accel channel */
275 	struct spdk_io_channel	*accel_channel;
276 
277 	/* Per io_device per thread data */
278 	struct spdk_bdev_shared_resource *shared_resource;
279 
280 	struct spdk_bdev_io_stat *stat;
281 
282 	/*
283 	 * Count of I/O submitted to the underlying dev module through this channel
284 	 * and waiting for completion.
285 	 */
286 	uint64_t		io_outstanding;
287 
288 	/*
289 	 * List of all submitted I/Os including I/O that are generated via splitting.
290 	 */
291 	bdev_io_tailq_t		io_submitted;
292 
293 	/*
294 	 * List of spdk_bdev_io that are currently queued because they write to a locked
295 	 * LBA range.
296 	 */
297 	bdev_io_tailq_t		io_locked;
298 
299 	/* List of I/Os with accel sequence being currently executed */
300 	bdev_io_tailq_t		io_accel_exec;
301 
302 	/* List of I/Os doing memory domain pull/push */
303 	bdev_io_tailq_t		io_memory_domain;
304 
305 	uint32_t		flags;
306 
307 	/* Counts number of bdev_io in the io_submitted TAILQ */
308 	uint16_t		queue_depth;
309 
310 	uint16_t		trace_id;
311 
312 	struct spdk_histogram_data *histogram;
313 
314 #ifdef SPDK_CONFIG_VTUNE
315 	uint64_t		start_tsc;
316 	uint64_t		interval_tsc;
317 	__itt_string_handle	*handle;
318 	struct spdk_bdev_io_stat *prev_stat;
319 #endif
320 
321 	bdev_io_tailq_t		queued_resets;
322 
323 	lba_range_tailq_t	locked_ranges;
324 
325 	/** List of I/Os queued by QoS. */
326 	bdev_io_tailq_t		qos_queued_io;
327 };
328 
329 struct media_event_entry {
330 	struct spdk_bdev_media_event	event;
331 	TAILQ_ENTRY(media_event_entry)	tailq;
332 };
333 
334 #define MEDIA_EVENT_POOL_SIZE 64
335 
336 struct spdk_bdev_desc {
337 	struct spdk_bdev		*bdev;
338 	struct spdk_thread		*thread;
339 	struct {
340 		spdk_bdev_event_cb_t event_fn;
341 		void *ctx;
342 	}				callback;
343 	bool				closed;
344 	bool				write;
345 	bool				memory_domains_supported;
346 	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
347 	struct spdk_spinlock		spinlock;
348 	uint32_t			refs;
349 	TAILQ_HEAD(, media_event_entry)	pending_media_events;
350 	TAILQ_HEAD(, media_event_entry)	free_media_events;
351 	struct media_event_entry	*media_events_buffer;
352 	TAILQ_ENTRY(spdk_bdev_desc)	link;
353 
354 	uint64_t		timeout_in_sec;
355 	spdk_bdev_io_timeout_cb	cb_fn;
356 	void			*cb_arg;
357 	struct spdk_poller	*io_timeout_poller;
358 	struct spdk_bdev_module_claim	*claim;
359 };
360 
361 struct spdk_bdev_iostat_ctx {
362 	struct spdk_bdev_io_stat *stat;
363 	enum spdk_bdev_reset_stat_mode reset_mode;
364 	spdk_bdev_get_device_stat_cb cb;
365 	void *cb_arg;
366 };
367 
368 struct set_qos_limit_ctx {
369 	void (*cb_fn)(void *cb_arg, int status);
370 	void *cb_arg;
371 	struct spdk_bdev *bdev;
372 };
373 
374 struct spdk_bdev_channel_iter {
375 	spdk_bdev_for_each_channel_msg fn;
376 	spdk_bdev_for_each_channel_done cpl;
377 	struct spdk_io_channel_iter *i;
378 	void *ctx;
379 };
380 
381 struct spdk_bdev_io_error_stat {
382 	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
383 };
384 
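/* When an I/O is parked on a shared_resource's nomem_io queue, retry_state
 * records which step of the submission path to resume from on retry: pulling
 * data or metadata into a bounce buffer, submitting to the bdev module, or
 * pushing data or metadata back out during completion.
 */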
385 enum bdev_io_retry_state {
386 	BDEV_IO_RETRY_STATE_INVALID,
387 	BDEV_IO_RETRY_STATE_PULL,
388 	BDEV_IO_RETRY_STATE_PULL_MD,
389 	BDEV_IO_RETRY_STATE_SUBMIT,
390 	BDEV_IO_RETRY_STATE_PUSH,
391 	BDEV_IO_RETRY_STATE_PUSH_MD,
392 };
393 
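/* The module that created a bdev may register the bdev pointer itself as an
 * io_device, so the bdev layer uses (bdev + 1) as its own io_device handle to
 * guarantee a distinct address for the per-channel state managed here.
 */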
394 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
395 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
396 #define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
397 #define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
398 
399 static inline void bdev_io_complete(void *ctx);
400 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
401 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
402 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
403 
404 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
405 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
406 
407 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
408 				struct spdk_io_channel *ch, void *_ctx);
409 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
410 
411 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
412 				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
413 				     uint64_t num_blocks,
414 				     struct spdk_memory_domain *domain, void *domain_ctx,
415 				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
416 				     spdk_bdev_io_completion_cb cb, void *cb_arg);
417 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
418 				      struct iovec *iov, int iovcnt, void *md_buf,
419 				      uint64_t offset_blocks, uint64_t num_blocks,
420 				      struct spdk_memory_domain *domain, void *domain_ctx,
421 				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
422 				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
423 				      spdk_bdev_io_completion_cb cb, void *cb_arg);
424 
425 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
426 			       uint64_t offset, uint64_t length,
427 			       lock_range_cb cb_fn, void *cb_arg);
428 
429 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
430 				 uint64_t offset, uint64_t length,
431 				 lock_range_cb cb_fn, void *cb_arg);
432 
433 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
434 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
435 
436 static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
437 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
438 static void claim_reset(struct spdk_bdev *bdev);
439 
440 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);
441 
442 static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);
443 
444 #define bdev_get_ext_io_opt(opts, field, defval) \
445 	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))
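/* Reads a field from caller-provided ext I/O options, falling back to defval when
 * no options were passed or when (per SPDK_GET_FIELD) the caller's structure is too
 * small to contain that field.
 */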
446 
447 static inline void
448 bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
449 {
450 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
451 	bdev_io->internal.ch->queue_depth++;
452 }
453 
454 static inline void
455 bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
456 {
457 	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
458 	bdev_io->internal.ch->queue_depth--;
459 }
460 
461 void
462 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
463 {
464 	if (!opts) {
465 		SPDK_ERRLOG("opts should not be NULL\n");
466 		return;
467 	}
468 
469 	if (!opts_size) {
470 		SPDK_ERRLOG("opts_size should not be zero\n");
471 		return;
472 	}
473 
474 	opts->opts_size = opts_size;
475 
476 #define SET_FIELD(field) \
477 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
478 		opts->field = g_bdev_opts.field; \
479 	} \
480 
481 	SET_FIELD(bdev_io_pool_size);
482 	SET_FIELD(bdev_io_cache_size);
483 	SET_FIELD(bdev_auto_examine);
484 	SET_FIELD(iobuf_small_cache_size);
485 	SET_FIELD(iobuf_large_cache_size);
486 
487 	/* Do not remove this statement. Always update it when adding a new field,
488 	 * and do not forget to add a SET_FIELD statement for the new field. */
489 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
490 
491 #undef SET_FIELD
492 }
493 
494 int
495 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
496 {
497 	uint32_t min_pool_size;
498 
499 	if (!opts) {
500 		SPDK_ERRLOG("opts cannot be NULL\n");
501 		return -1;
502 	}
503 
504 	if (!opts->opts_size) {
505 		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
506 		return -1;
507 	}
508 
509 	/*
510 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
511 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
512 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
513 	 */
514 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
515 	if (opts->bdev_io_pool_size < min_pool_size) {
516 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
517 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
518 			    spdk_thread_get_count());
519 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
520 		return -1;
521 	}
522 
523 #define SET_FIELD(field) \
524 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
525 		g_bdev_opts.field = opts->field; \
526 	} \
527 
528 	SET_FIELD(bdev_io_pool_size);
529 	SET_FIELD(bdev_io_cache_size);
530 	SET_FIELD(bdev_auto_examine);
531 	SET_FIELD(iobuf_small_cache_size);
532 	SET_FIELD(iobuf_large_cache_size);
533 
534 	g_bdev_opts.opts_size = opts->opts_size;
535 
536 #undef SET_FIELD
537 
538 	return 0;
539 }
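/* Typical usage sketch for the two functions above: fetch the current defaults,
 * override selected fields, then apply them (the values below are illustrative only).
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 2 * SPDK_BDEV_IO_POOL_SIZE;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("invalid bdev opts\n");
 *	}
 *
 * Passing sizeof(opts) lets callers built against a different version of
 * struct spdk_bdev_opts exchange only the fields both sides know about.
 */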
540 
541 static struct spdk_bdev *
542 bdev_get_by_name(const char *bdev_name)
543 {
544 	struct spdk_bdev_name find;
545 	struct spdk_bdev_name *res;
546 
547 	find.name = (char *)bdev_name;
548 	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
549 	if (res != NULL) {
550 		return res->bdev;
551 	}
552 
553 	return NULL;
554 }
555 
556 struct spdk_bdev *
557 spdk_bdev_get_by_name(const char *bdev_name)
558 {
559 	struct spdk_bdev *bdev;
560 
561 	spdk_spin_lock(&g_bdev_mgr.spinlock);
562 	bdev = bdev_get_by_name(bdev_name);
563 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
564 
565 	return bdev;
566 }
567 
568 struct bdev_io_status_string {
569 	enum spdk_bdev_io_status status;
570 	const char *str;
571 };
572 
573 static const struct bdev_io_status_string bdev_io_status_strings[] = {
574 	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
575 	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
576 	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
577 	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
578 	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
579 	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
580 	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
581 	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
582 	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
583 	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
584 };
585 
586 static const char *
587 bdev_io_status_get_string(enum spdk_bdev_io_status status)
588 {
589 	uint32_t i;
590 
591 	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
592 		if (bdev_io_status_strings[i].status == status) {
593 			return bdev_io_status_strings[i].str;
594 		}
595 	}
596 
597 	return "reserved";
598 }
599 
600 struct spdk_bdev_wait_for_examine_ctx {
601 	struct spdk_poller              *poller;
602 	spdk_bdev_wait_for_examine_cb	cb_fn;
603 	void				*cb_arg;
604 };
605 
606 static bool bdev_module_all_actions_completed(void);
607 
608 static int
609 bdev_wait_for_examine_cb(void *arg)
610 {
611 	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
612 
613 	if (!bdev_module_all_actions_completed()) {
614 		return SPDK_POLLER_IDLE;
615 	}
616 
617 	spdk_poller_unregister(&ctx->poller);
618 	ctx->cb_fn(ctx->cb_arg);
619 	free(ctx);
620 
621 	return SPDK_POLLER_BUSY;
622 }
623 
624 int
625 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
626 {
627 	struct spdk_bdev_wait_for_examine_ctx *ctx;
628 
629 	ctx = calloc(1, sizeof(*ctx));
630 	if (ctx == NULL) {
631 		return -ENOMEM;
632 	}
633 	ctx->cb_fn = cb_fn;
634 	ctx->cb_arg = cb_arg;
635 	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
636 
637 	return 0;
638 }
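/* Usage sketch: run a callback once every registered bdev module has finished
 * its pending examine actions (the callback name below is illustrative).
 *
 *	static void
 *	examine_done(void *ctx)
 *	{
 *		... no examine_config()/examine_disk() actions are pending anymore ...
 *	}
 *
 *	spdk_bdev_wait_for_examine(examine_done, NULL);
 */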
639 
640 struct spdk_bdev_examine_item {
641 	char *name;
642 	TAILQ_ENTRY(spdk_bdev_examine_item) link;
643 };
644 
645 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
646 
647 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
648 			g_bdev_examine_allowlist);
649 
650 static inline bool
651 bdev_examine_allowlist_check(const char *name)
652 {
653 	struct spdk_bdev_examine_item *item;
654 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
655 		if (strcmp(name, item->name) == 0) {
656 			return true;
657 		}
658 	}
659 	return false;
660 }
661 
662 static inline void
663 bdev_examine_allowlist_free(void)
664 {
665 	struct spdk_bdev_examine_item *item;
666 	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
667 		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
668 		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
669 		free(item->name);
670 		free(item);
671 	}
672 }
673 
674 static inline bool
675 bdev_in_examine_allowlist(struct spdk_bdev *bdev)
676 {
677 	struct spdk_bdev_alias *tmp;
678 	if (bdev_examine_allowlist_check(bdev->name)) {
679 		return true;
680 	}
681 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
682 		if (bdev_examine_allowlist_check(tmp->alias.name)) {
683 			return true;
684 		}
685 	}
686 	return false;
687 }
688 
689 static inline bool
690 bdev_ok_to_examine(struct spdk_bdev *bdev)
691 {
692 	/* Some bdevs may not support the READ command.
693 	 * Do not try to examine them.
694 	 */
695 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
696 		return false;
697 	}
698 
699 	if (g_bdev_opts.bdev_auto_examine) {
700 		return true;
701 	} else {
702 		return bdev_in_examine_allowlist(bdev);
703 	}
704 }
705 
706 static void
707 bdev_examine(struct spdk_bdev *bdev)
708 {
709 	struct spdk_bdev_module *module;
710 	struct spdk_bdev_module_claim *claim, *tmpclaim;
711 	uint32_t action;
712 
713 	if (!bdev_ok_to_examine(bdev)) {
714 		return;
715 	}
716 
717 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
718 		if (module->examine_config) {
719 			spdk_spin_lock(&module->internal.spinlock);
720 			action = module->internal.action_in_progress;
721 			module->internal.action_in_progress++;
722 			spdk_spin_unlock(&module->internal.spinlock);
723 			module->examine_config(bdev);
724 			if (action != module->internal.action_in_progress) {
725 				SPDK_ERRLOG("examine_config for module %s did not call "
726 					    "spdk_bdev_module_examine_done()\n", module->name);
727 			}
728 		}
729 	}
730 
731 	spdk_spin_lock(&bdev->internal.spinlock);
732 
733 	switch (bdev->internal.claim_type) {
734 	case SPDK_BDEV_CLAIM_NONE:
735 		/* Examine by all bdev modules */
736 		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
737 			if (module->examine_disk) {
738 				spdk_spin_lock(&module->internal.spinlock);
739 				module->internal.action_in_progress++;
740 				spdk_spin_unlock(&module->internal.spinlock);
741 				spdk_spin_unlock(&bdev->internal.spinlock);
742 				module->examine_disk(bdev);
743 				spdk_spin_lock(&bdev->internal.spinlock);
744 			}
745 		}
746 		break;
747 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
748 		/* Examine by the one bdev module with a v1 claim */
749 		module = bdev->internal.claim.v1.module;
750 		if (module->examine_disk) {
751 			spdk_spin_lock(&module->internal.spinlock);
752 			module->internal.action_in_progress++;
753 			spdk_spin_unlock(&module->internal.spinlock);
754 			spdk_spin_unlock(&bdev->internal.spinlock);
755 			module->examine_disk(bdev);
756 			return;
757 		}
758 		break;
759 	default:
760 		/* Examine by all bdev modules with a v2 claim */
761 		assert(claim_type_is_v2(bdev->internal.claim_type));
762 		/*
763 		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
764 		 * list, perhaps accessing freed memory. Without protection, this could happen
765 		 * while the lock is dropped during the examine callback.
766 		 */
767 		bdev->internal.examine_in_progress++;
768 
769 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
770 			module = claim->module;
771 
772 			if (module == NULL) {
773 				/* This is a vestigial claim, held by examine_count */
774 				continue;
775 			}
776 
777 			if (module->examine_disk == NULL) {
778 				continue;
779 			}
780 
781 			spdk_spin_lock(&module->internal.spinlock);
782 			module->internal.action_in_progress++;
783 			spdk_spin_unlock(&module->internal.spinlock);
784 
785 			/* Call examine_disk without holding internal.spinlock. */
786 			spdk_spin_unlock(&bdev->internal.spinlock);
787 			module->examine_disk(bdev);
788 			spdk_spin_lock(&bdev->internal.spinlock);
789 		}
790 
791 		assert(bdev->internal.examine_in_progress > 0);
792 		bdev->internal.examine_in_progress--;
793 		if (bdev->internal.examine_in_progress == 0) {
794 			/* Remove any claims that were released during examine_disk */
795 			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
796 				if (claim->desc != NULL) {
797 					continue;
798 				}
799 
800 				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
801 				free(claim);
802 			}
803 			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
804 				claim_reset(bdev);
805 			}
806 		}
807 	}
808 
809 	spdk_spin_unlock(&bdev->internal.spinlock);
810 }
811 
812 int
813 spdk_bdev_examine(const char *name)
814 {
815 	struct spdk_bdev *bdev;
816 	struct spdk_bdev_examine_item *item;
817 	struct spdk_thread *thread = spdk_get_thread();
818 
819 	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
820 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
821 			    thread ? spdk_thread_get_name(thread) : "null");
822 		return -EINVAL;
823 	}
824 
825 	if (g_bdev_opts.bdev_auto_examine) {
826 		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
827 		return -EINVAL;
828 	}
829 
830 	if (bdev_examine_allowlist_check(name)) {
831 		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
832 		return -EEXIST;
833 	}
834 
835 	item = calloc(1, sizeof(*item));
836 	if (!item) {
837 		return -ENOMEM;
838 	}
839 	item->name = strdup(name);
840 	if (!item->name) {
841 		free(item);
842 		return -ENOMEM;
843 	}
844 	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
845 
846 	bdev = spdk_bdev_get_by_name(name);
847 	if (bdev) {
848 		bdev_examine(bdev);
849 	}
850 	return 0;
851 }
852 
853 static inline void
854 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
855 {
856 	struct spdk_bdev_examine_item *item;
857 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
858 		spdk_json_write_object_begin(w);
859 		spdk_json_write_named_string(w, "method", "bdev_examine");
860 		spdk_json_write_named_object_begin(w, "params");
861 		spdk_json_write_named_string(w, "name", item->name);
862 		spdk_json_write_object_end(w);
863 		spdk_json_write_object_end(w);
864 	}
865 }
866 
867 struct spdk_bdev *
868 spdk_bdev_first(void)
869 {
870 	struct spdk_bdev *bdev;
871 
872 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
873 	if (bdev) {
874 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
875 	}
876 
877 	return bdev;
878 }
879 
880 struct spdk_bdev *
881 spdk_bdev_next(struct spdk_bdev *prev)
882 {
883 	struct spdk_bdev *bdev;
884 
885 	bdev = TAILQ_NEXT(prev, internal.link);
886 	if (bdev) {
887 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
888 	}
889 
890 	return bdev;
891 }
892 
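/* A "leaf" bdev here is one that no module has claimed, i.e. typically a bdev
 * with nothing stacked on top of it.
 */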
893 static struct spdk_bdev *
894 _bdev_next_leaf(struct spdk_bdev *bdev)
895 {
896 	while (bdev != NULL) {
897 		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
898 			return bdev;
899 		} else {
900 			bdev = TAILQ_NEXT(bdev, internal.link);
901 		}
902 	}
903 
904 	return bdev;
905 }
906 
907 struct spdk_bdev *
908 spdk_bdev_first_leaf(void)
909 {
910 	struct spdk_bdev *bdev;
911 
912 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
913 
914 	if (bdev) {
915 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
916 	}
917 
918 	return bdev;
919 }
920 
921 struct spdk_bdev *
922 spdk_bdev_next_leaf(struct spdk_bdev *prev)
923 {
924 	struct spdk_bdev *bdev;
925 
926 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
927 
928 	if (bdev) {
929 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
930 	}
931 
932 	return bdev;
933 }
934 
935 static inline bool
936 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
937 {
938 	return bdev_io->internal.f.has_memory_domain;
939 }
940 
941 static inline bool
942 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
943 {
944 	return bdev_io->internal.f.has_accel_sequence;
945 }
946 
947 static inline void
948 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
949 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
950 {
951 	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
952 	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
953 	 * channels we will instead wait for half to complete.
954 	 */
955 	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
956 					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
957 
958 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
959 	bdev_io->internal.retry_state = state;
960 	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
961 }
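/* Examples of the threshold computed above: with 100 outstanding I/O, retries
 * resume once io_outstanding drops to max(50, 100 - 8) = 92; with only 10
 * outstanding, once it drops to max(5, 10 - 8) = 5.
 */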
962 
963 static inline void
964 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
965 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
966 {
967 	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
968 	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
969 	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));
970 
971 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
972 	bdev_io->internal.retry_state = state;
973 	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
974 }
975 
976 void
977 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
978 {
979 	struct iovec *iovs;
980 
981 	if (bdev_io->u.bdev.iovs == NULL) {
982 		bdev_io->u.bdev.iovs = &bdev_io->iov;
983 		bdev_io->u.bdev.iovcnt = 1;
984 	}
985 
986 	iovs = bdev_io->u.bdev.iovs;
987 
988 	assert(iovs != NULL);
989 	assert(bdev_io->u.bdev.iovcnt >= 1);
990 
991 	iovs[0].iov_base = buf;
992 	iovs[0].iov_len = len;
993 }
994 
995 void
996 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
997 {
998 	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
999 	bdev_io->u.bdev.md_buf = md_buf;
1000 }
1001 
1002 static bool
1003 _is_buf_allocated(const struct iovec *iovs)
1004 {
1005 	if (iovs == NULL) {
1006 		return false;
1007 	}
1008 
1009 	return iovs[0].iov_base != NULL;
1010 }
1011 
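/* The alignment passed in is expected to be a power of two (as provided by
 * spdk_bdev_get_buf_align()); a value of 1 means no alignment requirement,
 * which lets the check below use a simple mask.
 */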
1012 static bool
1013 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
1014 {
1015 	int i;
1016 	uintptr_t iov_base;
1017 
1018 	if (spdk_likely(alignment == 1)) {
1019 		return true;
1020 	}
1021 
1022 	for (i = 0; i < iovcnt; i++) {
1023 		iov_base = (uintptr_t)iovs[i].iov_base;
1024 		if ((iov_base & (alignment - 1)) != 0) {
1025 			return false;
1026 		}
1027 	}
1028 
1029 	return true;
1030 }
1031 
1032 static inline bool
1033 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1034 {
1035 	if (!bdev_io_use_accel_sequence(bdev_io)) {
1036 		return false;
1037 	}
1038 
1039 	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
1040 	 * the bdev module didn't support accel sequences */
1041 	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
1042 }
1043 
1044 static inline void
1045 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
1046 			      struct spdk_bdev_shared_resource *shared_resource)
1047 {
1048 	bdev_ch->io_outstanding++;
1049 	shared_resource->io_outstanding++;
1050 }
1051 
1052 static inline void
1053 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1054 			      struct spdk_bdev_shared_resource *shared_resource)
1055 {
1056 	assert(bdev_ch->io_outstanding > 0);
1057 	assert(shared_resource->io_outstanding > 0);
1058 	bdev_ch->io_outstanding--;
1059 	shared_resource->io_outstanding--;
1060 }
1061 
1062 static void
1063 bdev_io_submit_sequence_cb(void *ctx, int status)
1064 {
1065 	struct spdk_bdev_io *bdev_io = ctx;
1066 
1067 	assert(bdev_io_use_accel_sequence(bdev_io));
1068 
1069 	bdev_io->u.bdev.accel_sequence = NULL;
1070 	bdev_io->internal.f.has_accel_sequence = false;
1071 
1072 	if (spdk_unlikely(status != 0)) {
1073 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
1074 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1075 		bdev_io_complete_unsubmitted(bdev_io);
1076 		return;
1077 	}
1078 
1079 	bdev_io_submit(bdev_io);
1080 }
1081 
1082 static void
1083 bdev_io_exec_sequence_cb(void *ctx, int status)
1084 {
1085 	struct spdk_bdev_io *bdev_io = ctx;
1086 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1087 
1088 	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1089 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1090 
1091 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1092 		bdev_ch_retry_io(ch);
1093 	}
1094 
1095 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1096 }
1097 
1098 static void
1099 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
1100 {
1101 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1102 
1103 	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1104 	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1105 	assert(bdev_io_use_accel_sequence(bdev_io));
1106 
1107 	/* Since the operations are appended during submission, they're in the opposite order from
1108 	 * the one we want to execute them in for reads (i.e. we need to execute the most recently
1109 	 * added operation first), so reverse the sequence before executing it.
1110 	 */
1111 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1112 		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
1113 	}
1114 
1115 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1116 	bdev_io_increment_outstanding(ch, ch->shared_resource);
1117 	bdev_io->internal.data_transfer_cpl = cb_fn;
1118 
1119 	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
1120 				   bdev_io_exec_sequence_cb, bdev_io);
1121 }
1122 
1123 static void
1124 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
1125 {
1126 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
1127 	void *buf;
1128 
1129 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1130 		buf = bdev_io->internal.buf.ptr;
1131 		bdev_io->internal.buf.ptr = NULL;
1132 		bdev_io->internal.f.has_buf = false;
1133 		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
1134 		bdev_io->internal.get_aux_buf_cb = NULL;
1135 	} else {
1136 		assert(bdev_io->internal.get_buf_cb != NULL);
1137 		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
1138 		bdev_io->internal.get_buf_cb = NULL;
1139 	}
1140 }
1141 
1142 static void
1143 _bdev_io_pull_buffer_cpl(void *ctx, int rc)
1144 {
1145 	struct spdk_bdev_io *bdev_io = ctx;
1146 
1147 	if (rc) {
1148 		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
1149 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1150 	}
1151 	bdev_io_get_buf_complete(bdev_io, !rc);
1152 }
1153 
1154 static void
1155 bdev_io_pull_md_buf_done(void *ctx, int status)
1156 {
1157 	struct spdk_bdev_io *bdev_io = ctx;
1158 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1159 
1160 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1161 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1162 
1163 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1164 		bdev_ch_retry_io(ch);
1165 	}
1166 
1167 	assert(bdev_io->internal.data_transfer_cpl);
1168 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1169 }
1170 
1171 static void
1172 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
1173 {
1174 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1175 	int rc = 0;
1176 
1177 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1178 		assert(bdev_io->internal.f.has_bounce_buf);
1179 		if (bdev_io_use_memory_domain(bdev_io)) {
1180 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1181 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1182 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1183 							  bdev_io->internal.memory_domain_ctx,
1184 							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
1185 							  &bdev_io->internal.bounce_buf.md_iov, 1,
1186 							  bdev_io_pull_md_buf_done, bdev_io);
1187 			if (rc == 0) {
1188 				/* Continue to submit IO in completion callback */
1189 				return;
1190 			}
1191 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1192 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1193 			if (rc != -ENOMEM) {
1194 				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
1195 					    spdk_memory_domain_get_dma_device_id(
1196 						    bdev_io->internal.memory_domain), rc);
1197 			}
1198 		} else {
1199 			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
1200 			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
1201 			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1202 		}
1203 	}
1204 
1205 	if (spdk_unlikely(rc == -ENOMEM)) {
1206 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
1207 	} else {
1208 		assert(bdev_io->internal.data_transfer_cpl);
1209 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1210 	}
1211 }
1212 
1213 static void
1214 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1215 {
1216 	assert(bdev_io->internal.f.has_bounce_buf);
1217 
1218 	/* save original md_buf */
1219 	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
1220 	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
1221 	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
1222 	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
1223 	/* set bounce md_buf */
1224 	bdev_io->u.bdev.md_buf = md_buf;
1225 
1226 	bdev_io_pull_md_buf(bdev_io);
1227 }
1228 
1229 static void
1230 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
1231 {
1232 	struct spdk_bdev *bdev = bdev_io->bdev;
1233 	uint64_t md_len;
1234 	void *buf;
1235 
1236 	if (spdk_bdev_is_md_separate(bdev)) {
1237 		assert(!bdev_io_use_accel_sequence(bdev_io));
1238 
1239 		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
1240 		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
1241 
1242 		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
1243 
1244 		if (bdev_io->u.bdev.md_buf != NULL) {
1245 			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
1246 			return;
1247 		} else {
1248 			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
1249 		}
1250 	}
1251 
1252 	bdev_io_get_buf_complete(bdev_io, true);
1253 }
1254 
1255 static inline void
1256 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
1257 {
1258 	if (rc) {
1259 		SPDK_ERRLOG("Failed to get data buffer\n");
1260 		assert(bdev_io->internal.data_transfer_cpl);
1261 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1262 		return;
1263 	}
1264 
1265 	_bdev_io_set_md_buf(bdev_io);
1266 }
1267 
1268 static void
1269 bdev_io_pull_data_done_and_track(void *ctx, int status)
1270 {
1271 	struct spdk_bdev_io *bdev_io = ctx;
1272 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1273 
1274 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1275 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1276 
1277 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1278 		bdev_ch_retry_io(ch);
1279 	}
1280 
1281 	bdev_io_pull_data_done(bdev_io, status);
1282 }
1283 
1284 static void
1285 bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
1286 {
1287 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1288 	int rc = 0;
1289 
1290 	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
1291 	 * sequence, append a copy operation so that accel changes the src/dst buffers of the
1292 	 * previous operation */
1293 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
1294 	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
1295 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1296 			assert(bdev_io_use_accel_sequence(bdev_io));
1297 			assert(bdev_io->internal.f.has_bounce_buf);
1298 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1299 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1300 						    NULL, NULL,
1301 						    bdev_io->internal.bounce_buf.orig_iovs,
1302 						    bdev_io->internal.bounce_buf.orig_iovcnt,
1303 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1304 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1305 						    NULL, NULL);
1306 		} else {
1307 			/* We need to reverse the src/dst for reads */
1308 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1309 			assert(bdev_io_use_accel_sequence(bdev_io));
1310 			assert(bdev_io->internal.f.has_bounce_buf);
1311 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1312 						    bdev_io->internal.bounce_buf.orig_iovs,
1313 						    bdev_io->internal.bounce_buf.orig_iovcnt,
1314 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1315 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1316 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1317 						    NULL, NULL, NULL, NULL);
1318 		}
1319 
1320 		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
1321 			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
1322 				    bdev_io->internal.accel_sequence);
1323 		}
1324 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1325 		/* if this is the write path, copy data from the original buffer to the bounce buffer */
1326 		if (bdev_io_use_memory_domain(bdev_io)) {
1327 			assert(bdev_io->internal.f.has_bounce_buf);
1328 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1329 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1330 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1331 							  bdev_io->internal.memory_domain_ctx,
1332 							  bdev_io->internal.bounce_buf.orig_iovs,
1333 							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1334 							  bdev_io->u.bdev.iovs, 1,
1335 							  bdev_io_pull_data_done_and_track,
1336 							  bdev_io);
1337 			if (rc == 0) {
1338 				/* Continue to submit IO in completion callback */
1339 				return;
1340 			}
1341 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1342 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1343 			if (rc != -ENOMEM) {
1344 				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
1345 					    spdk_memory_domain_get_dma_device_id(
1346 						    bdev_io->internal.memory_domain));
1347 			}
1348 		} else {
1349 			assert(bdev_io->u.bdev.iovcnt == 1);
1350 			assert(bdev_io->internal.f.has_bounce_buf);
1351 			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
1352 					      bdev_io->u.bdev.iovs[0].iov_len,
1353 					      bdev_io->internal.bounce_buf.orig_iovs,
1354 					      bdev_io->internal.bounce_buf.orig_iovcnt);
1355 		}
1356 	}
1357 
1358 	if (spdk_unlikely(rc == -ENOMEM)) {
1359 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1360 	} else {
1361 		bdev_io_pull_data_done(bdev_io, rc);
1362 	}
1363 }
1364 
1365 static void
1366 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
1367 			      bdev_copy_bounce_buffer_cpl cpl_cb)
1368 {
1369 	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;
1370 
1371 	assert(bdev_io->internal.f.has_bounce_buf == false);
1372 
1373 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1374 	bdev_io->internal.f.has_bounce_buf = true;
1375 	/* save original iovec */
1376 	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
1377 	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
1378 	/* zero the other data members */
1379 	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
1380 	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
1381 	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
1382 	/* set bounce iov */
1383 	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
1384 	bdev_io->u.bdev.iovcnt = 1;
1385 	/* set bounce buffer for this operation */
1386 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1387 	bdev_io->u.bdev.iovs[0].iov_len = len;
1388 	/* Now we use 1 iov, the split condition could have been changed */
1389 	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
1390 
1391 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1392 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1393 	} else {
1394 		bdev_io_pull_data(bdev_io);
1395 	}
1396 }
1397 
1398 static void
1399 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
1400 {
1401 	struct spdk_bdev *bdev = bdev_io->bdev;
1402 	bool buf_allocated;
1403 	uint64_t alignment;
1404 	void *aligned_buf;
1405 
1406 	bdev_io->internal.buf.ptr = buf;
1407 	bdev_io->internal.f.has_buf = true;
1408 
1409 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1410 		bdev_io_get_buf_complete(bdev_io, true);
1411 		return;
1412 	}
1413 
1414 	alignment = spdk_bdev_get_buf_align(bdev);
1415 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
1416 	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
1417 
1418 	if (buf_allocated) {
1419 		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
1420 		/* Continue in completion callback */
1421 		return;
1422 	} else {
1423 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
1424 	}
1425 
1426 	_bdev_io_set_md_buf(bdev_io);
1427 }
1428 
1429 static inline uint64_t
1430 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
1431 {
1432 	struct spdk_bdev *bdev = bdev_io->bdev;
1433 	uint64_t md_len, alignment;
1434 
1435 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1436 
1437 	/* 1-byte alignment needs 0 bytes of extra space, 64-byte alignment needs 63 bytes of extra space, etc. */
1438 	alignment = spdk_bdev_get_buf_align(bdev) - 1;
1439 
1440 	return len + alignment + md_len;
1441 }
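/* For example, an 8-block read of 512-byte blocks from a bdev with separate
 * 8-byte per-block metadata and a 4 KiB buffer alignment requires
 * 4096 (data) + 4095 (worst-case alignment padding) + 64 (metadata) = 8255 bytes
 * from the iobuf pool.
 */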
1442 
1443 static void
1444 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1445 {
1446 	struct spdk_bdev_mgmt_channel *ch;
1447 
1448 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1449 	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1450 }
1451 
1452 static void
1453 bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1454 {
1455 	assert(bdev_io->internal.f.has_buf);
1456 	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len);
1457 	bdev_io->internal.buf.ptr = NULL;
1458 	bdev_io->internal.f.has_buf = false;
1459 }
1460 
1461 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
1462 			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);
1463 
1464 void
1465 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1466 {
1467 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1468 
1469 	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);
1470 
1471 	assert(buf != NULL);
1472 	_bdev_io_put_buf(bdev_io, buf, len);
1473 }
1474 
1475 static inline void
1476 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
1477 		    struct spdk_bdev_io *bdev_io)
1478 {
1479 	/* After a request is submitted to a bdev module, the ownership of an accel sequence
1480 	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
1481 	 * sequence pointer to make sure we won't touch it anymore. */
1482 	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
1483 	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
1484 		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1485 		bdev_io->internal.f.has_accel_sequence = false;
1486 	}
1487 
1488 	bdev->fn_table->submit_request(ioch, bdev_io);
1489 }
1490 
1491 static inline void
1492 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
1493 {
1494 	struct spdk_bdev *bdev = bdev_io->bdev;
1495 
1496 	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
1497 	bdev_io->internal.error.nvme.cdw0 = 0;
1498 	bdev_io->num_retries++;
1499 	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1500 }
1501 
1502 static void
1503 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
1504 {
1505 	struct spdk_bdev_io *bdev_io;
1506 
1507 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1508 		/*
1509 		 * Allow some more I/O to complete before retrying the nomem_io queue.
1510 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
1511 		 *  the context of a completion, because the resources for the I/O are
1512 		 *  not released until control returns to the bdev poller.  Also, we
1513 		 *  may require several small I/O to complete before a larger I/O
1514 		 *  (that requires splitting) can be submitted.
1515 		 */
1516 		return;
1517 	}
1518 
1519 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1520 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1521 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1522 
1523 		switch (bdev_io->internal.retry_state) {
1524 		case BDEV_IO_RETRY_STATE_SUBMIT:
1525 			bdev_ch_resubmit_io(shared_resource, bdev_io);
1526 			break;
1527 		case BDEV_IO_RETRY_STATE_PULL:
1528 			bdev_io_pull_data(bdev_io);
1529 			break;
1530 		case BDEV_IO_RETRY_STATE_PULL_MD:
1531 			bdev_io_pull_md_buf(bdev_io);
1532 			break;
1533 		case BDEV_IO_RETRY_STATE_PUSH:
1534 			bdev_io_push_bounce_data(bdev_io);
1535 			break;
1536 		case BDEV_IO_RETRY_STATE_PUSH_MD:
1537 			bdev_io_push_bounce_md_buf(bdev_io);
1538 			break;
1539 		default:
1540 			assert(0 && "invalid retry state");
1541 			break;
1542 		}
1543 
1544 		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
1545 			/* This IO completed again with NOMEM status, so break the loop and
1546 			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
1547 			 * always gets requeued at the front of the list, to maintain
1548 			 * ordering.
1549 			 */
1550 			break;
1551 		}
1552 	}
1553 }
1554 
1555 static void
1556 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1557 {
1558 	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
1559 }
1560 
1561 static int
1562 bdev_no_mem_poller(void *ctx)
1563 {
1564 	struct spdk_bdev_shared_resource *shared_resource = ctx;
1565 
1566 	spdk_poller_unregister(&shared_resource->nomem_poller);
1567 
1568 	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1569 		bdev_shared_ch_retry_io(shared_resource);
1570 	}
1571 	/* The retry callback may re-register the poller, so double check. */
1572 	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
1573 	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
1574 		/* No IOs were submitted, try again */
1575 		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1576 						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1577 	}
1578 
1579 	return SPDK_POLLER_BUSY;
1580 }
1581 
1582 static inline bool
1583 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1584 {
1585 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1586 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1587 
1588 	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1589 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1590 		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);
1591 
1592 		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
1593 			/* Special case: there are nomem IOs but no outstanding IOs whose completions
1594 			 * could trigger a retry of the queued IOs.
1595 			 * Normally, completion of any submitted IO may trigger a retry of queued IOs. This
1596 			 * poller handles the case where no new IOs are submitted, e.g. qd == 1 */
1597 			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1598 							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1599 		}
1600 		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
1601 		 * ownership of that sequence is transferred back to the bdev layer, so we need to
1602 		 * restore internal.accel_sequence to make sure that the sequence is handled
1603 		 * correctly in case the I/O is later aborted. */
1604 		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
1605 		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
1606 			assert(!bdev_io_use_accel_sequence(bdev_io));
1607 			bdev_io->internal.f.has_accel_sequence = true;
1608 			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
1609 		}
1610 
1611 		return true;
1612 	}
1613 
1614 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1615 		bdev_ch_retry_io(bdev_ch);
1616 	}
1617 
1618 	return false;
1619 }
1620 
1621 static void
1622 _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1623 {
1624 	struct spdk_bdev_io *bdev_io = ctx;
1625 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1626 
1627 	if (rc) {
1628 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1629 	}
1630 	/* We want to free the bounce buffer here since we know we're done with it (as opposed
1631 	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
1632 	 */
1633 	bdev_io_put_buf(bdev_io);
1634 
1635 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1636 		bdev_ch_retry_io(ch);
1637 	}
1638 
1639 	/* Continue with IO completion flow */
1640 	bdev_io_complete(bdev_io);
1641 }
1642 
1643 static void
1644 bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
1645 {
1646 	struct spdk_bdev_io *bdev_io = ctx;
1647 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1648 
1649 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1650 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1651 	bdev_io->internal.f.has_bounce_buf = false;
1652 
1653 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1654 		bdev_ch_retry_io(ch);
1655 	}
1656 
1657 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1658 }
1659 
1660 static inline void
1661 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
1662 {
1663 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1664 	int rc = 0;
1665 
1666 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1667 	assert(bdev_io->internal.f.has_bounce_buf);
1668 
1669 	/* do the same for metadata buffer */
1670 	if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) {
1671 		assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1672 
1673 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1674 			if (bdev_io_use_memory_domain(bdev_io)) {
1675 				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1676 				bdev_io_increment_outstanding(ch, ch->shared_resource);
1677 				/* If memory domain is used then we need to call async push function */
1678 				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1679 								  bdev_io->internal.memory_domain_ctx,
1680 								  &bdev_io->internal.bounce_buf.orig_md_iov,
1681 								  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1682 								  &bdev_io->internal.bounce_buf.md_iov, 1,
1683 								  bdev_io_push_bounce_md_buf_done,
1684 								  bdev_io);
1685 				if (rc == 0) {
1686 					/* Continue IO completion in async callback */
1687 					return;
1688 				}
1689 				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1690 				bdev_io_decrement_outstanding(ch, ch->shared_resource);
1691 				if (rc != -ENOMEM) {
1692 					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1693 						    spdk_memory_domain_get_dma_device_id(
1694 							    bdev_io->internal.memory_domain));
1695 				}
1696 			} else {
1697 				memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1698 				       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1699 			}
1700 		}
1701 	}
1702 
1703 	if (spdk_unlikely(rc == -ENOMEM)) {
1704 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
1705 	} else {
1706 		assert(bdev_io->internal.data_transfer_cpl);
1707 		bdev_io->internal.f.has_bounce_buf = false;
1708 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1709 	}
1710 }
1711 
1712 static inline void
1713 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
1714 {
1715 	assert(bdev_io->internal.data_transfer_cpl);
1716 	if (rc) {
1717 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1718 		return;
1719 	}
1720 
1721 	/* set original buffer for this io */
1722 	bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt;
1723 	bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs;
1724 
1725 	/* We don't set bdev_io->internal.f.has_bounce_buf to false here because
1726 	 * we still need to clear the md buf */
1727 
1728 	bdev_io_push_bounce_md_buf(bdev_io);
1729 }
1730 
1731 static void
1732 bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
1733 {
1734 	struct spdk_bdev_io *bdev_io = ctx;
1735 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1736 
1737 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1738 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1739 
1740 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1741 		bdev_ch_retry_io(ch);
1742 	}
1743 
1744 	bdev_io_push_bounce_data_done(bdev_io, status);
1745 }
1746 
1747 static inline void
1748 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
1749 {
1750 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1751 	int rc = 0;
1752 
1753 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1754 	assert(!bdev_io_use_accel_sequence(bdev_io));
1755 	assert(bdev_io->internal.f.has_bounce_buf);
1756 
1757 	/* if this is read path, copy data from bounce buffer to original buffer */
1758 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1759 		if (bdev_io_use_memory_domain(bdev_io)) {
1760 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1761 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1762 			/* If memory domain is used then we need to call async push function */
1763 			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1764 							  bdev_io->internal.memory_domain_ctx,
1765 							  bdev_io->internal.bounce_buf.orig_iovs,
1766 							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1767 							  &bdev_io->internal.bounce_buf.iov, 1,
1768 							  bdev_io_push_bounce_data_done_and_track,
1769 							  bdev_io);
1770 			if (rc == 0) {
1771 				/* Continue IO completion in async callback */
1772 				return;
1773 			}
1774 
1775 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1776 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1777 			if (rc != -ENOMEM) {
1778 				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1779 					    spdk_memory_domain_get_dma_device_id(
1780 						    bdev_io->internal.memory_domain));
1781 			}
1782 		} else {
1783 			spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs,
1784 					      bdev_io->internal.bounce_buf.orig_iovcnt,
1785 					      bdev_io->internal.bounce_buf.iov.iov_base,
1786 					      bdev_io->internal.bounce_buf.iov.iov_len);
1787 		}
1788 	}
1789 
1790 	if (spdk_unlikely(rc == -ENOMEM)) {
1791 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
1792 	} else {
1793 		bdev_io_push_bounce_data_done(bdev_io, rc);
1794 	}
1795 }
1796 
1797 static inline void
1798 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1799 {
1800 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1801 	bdev_io_push_bounce_data(bdev_io);
1802 }
1803 
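/* Summary comment (added for clarity): on the completion path of an I/O that used a
 * bounce buffer, the data iovs are pushed back to the caller's buffers first
 * (bdev_io_push_bounce_data), then the separate metadata buffer, if any, is pushed
 * (bdev_io_push_bounce_md_buf), and only then is internal.data_transfer_cpl invoked to
 * continue the normal completion flow. Memory-domain transfers make each step
 * asynchronous; the plain memcpy()/spdk_copy_buf_to_iovs() path stays synchronous.
 */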
1804 static void
1805 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1806 {
1807 	struct spdk_bdev_io *bdev_io;
1808 
1809 	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1810 	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
1811 }
1812 
1813 static void
1814 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1815 {
1816 	struct spdk_bdev_mgmt_channel *mgmt_ch;
1817 	uint64_t max_len;
1818 	void *buf;
1819 
1820 	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1821 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1822 	max_len = bdev_io_get_max_buf_len(bdev_io, len);
1823 
1824 	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
1825 		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1826 		bdev_io_get_buf_complete(bdev_io, false);
1827 		return;
1828 	}
1829 
1830 	bdev_io->internal.buf.len = len;
1831 	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1832 			     bdev_io_get_iobuf_cb);
1833 	if (buf != NULL) {
1834 		_bdev_io_set_buf(bdev_io, buf, len);
1835 	}
1836 }
1837 
1838 void
1839 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1840 {
1841 	struct spdk_bdev *bdev = bdev_io->bdev;
1842 	uint64_t alignment;
1843 
1844 	assert(cb != NULL);
1845 	bdev_io->internal.get_buf_cb = cb;
1846 
1847 	alignment = spdk_bdev_get_buf_align(bdev);
1848 
1849 	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1850 	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1851 		/* Buffer already present and aligned */
1852 		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1853 		return;
1854 	}
1855 
1856 	bdev_io_get_buf(bdev_io, len);
1857 }
1858 
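/* Usage sketch (hypothetical module code, not part of this file): a bdev module that
 * needs a data buffer for a read typically calls spdk_bdev_io_get_buf() from its
 * submit_request path, e.g.:
 *
 *     static void
 *     bdev_xxx_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 *                         bool success)
 *     {
 *             if (!success) {
 *                     spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *                     return;
 *             }
 *             // bdev_io->u.bdev.iovs now points to an aligned buffer
 *     }
 *
 *     spdk_bdev_io_get_buf(bdev_io, bdev_xxx_get_buf_cb,
 *                          bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *
 * If the caller already provided aligned buffers, the callback is invoked immediately.
 */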
1859 static void
1860 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1861 			      bool success)
1862 {
1863 	if (!success) {
1864 		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
1865 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1866 		bdev_io_complete_unsubmitted(bdev_io);
1867 		return;
1868 	}
1869 
1870 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
1871 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1872 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
1873 			return;
1874 		}
1875 		/* For reads we'll execute the sequence after the data is read, so, for now, only
1876 		 * clear out the accel_sequence pointer and submit the IO */
1877 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1878 		bdev_io->u.bdev.accel_sequence = NULL;
1879 	}
1880 
1881 	bdev_io_submit(bdev_io);
1882 }
1883 
1884 static void
1885 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1886 			       uint64_t len)
1887 {
1888 	assert(cb != NULL);
1889 	bdev_io->internal.get_buf_cb = cb;
1890 
1891 	bdev_io_get_buf(bdev_io, len);
1892 }
1893 
1894 
1895 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf,
1896 			      "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0);
1897 
1898 void
1899 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
1900 {
1901 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1902 
1903 	SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf);
1904 
1905 	assert(cb != NULL);
1906 	assert(bdev_io->internal.get_aux_buf_cb == NULL);
1907 	bdev_io->internal.get_aux_buf_cb = cb;
1908 	bdev_io_get_buf(bdev_io, len);
1909 }
1910 
1911 static int
1912 bdev_module_get_max_ctx_size(void)
1913 {
1914 	struct spdk_bdev_module *bdev_module;
1915 	int max_bdev_module_size = 0;
1916 
1917 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1918 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
1919 			max_bdev_module_size = bdev_module->get_ctx_size();
1920 		}
1921 	}
1922 
1923 	return max_bdev_module_size;
1924 }
1925 
1926 static void
1927 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1928 {
1929 	if (!bdev->internal.histogram_enabled) {
1930 		return;
1931 	}
1932 
1933 	spdk_json_write_object_begin(w);
1934 	spdk_json_write_named_string(w, "method", "bdev_enable_histogram");
1935 
1936 	spdk_json_write_named_object_begin(w, "params");
1937 	spdk_json_write_named_string(w, "name", bdev->name);
1938 
1939 	spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled);
1940 
1941 	if (bdev->internal.histogram_io_type) {
1942 		spdk_json_write_named_string(w, "opc",
1943 					     spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type));
1944 	}
1945 
1946 	spdk_json_write_object_end(w);
1947 
1948 	spdk_json_write_object_end(w);
1949 }
1950 
1951 static void
1952 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1953 {
1954 	int i;
1955 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1956 	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
1957 
1958 	if (!qos) {
1959 		return;
1960 	}
1961 
1962 	spdk_bdev_get_qos_rate_limits(bdev, limits);
1963 
1964 	spdk_json_write_object_begin(w);
1965 	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
1966 
1967 	spdk_json_write_named_object_begin(w, "params");
1968 	spdk_json_write_named_string(w, "name", bdev->name);
1969 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
1970 		if (limits[i] > 0) {
1971 			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
1972 		}
1973 	}
1974 	spdk_json_write_object_end(w);
1975 
1976 	spdk_json_write_object_end(w);
1977 }
1978 
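/* For reference (approximate output, with "Malloc0" as a hypothetical bdev name), the
 * object emitted above looks like:
 *
 *     {
 *       "method": "bdev_set_qos_limit",
 *       "params": { "name": "Malloc0", "rw_ios_per_sec": 20000, "rw_mbytes_per_sec": 100 }
 *     }
 *
 * Only limits that are currently set (non-zero) are written, using the keys from
 * qos_rpc_type[].
 */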
1979 void
1980 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
1981 {
1982 	struct spdk_bdev_module *bdev_module;
1983 	struct spdk_bdev *bdev;
1984 
1985 	assert(w != NULL);
1986 
1987 	spdk_json_write_array_begin(w);
1988 
1989 	spdk_json_write_object_begin(w);
1990 	spdk_json_write_named_string(w, "method", "bdev_set_options");
1991 	spdk_json_write_named_object_begin(w, "params");
1992 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
1993 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
1994 	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
1995 	spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size);
1996 	spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size);
1997 	spdk_json_write_object_end(w);
1998 	spdk_json_write_object_end(w);
1999 
2000 	bdev_examine_allowlist_config_json(w);
2001 
2002 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2003 		if (bdev_module->config_json) {
2004 			bdev_module->config_json(w);
2005 		}
2006 	}
2007 
2008 	spdk_spin_lock(&g_bdev_mgr.spinlock);
2009 
2010 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
2011 		if (bdev->fn_table->write_config_json) {
2012 			bdev->fn_table->write_config_json(bdev, w);
2013 		}
2014 
2015 		bdev_qos_config_json(bdev, w);
2016 		bdev_enable_histogram_config_json(bdev, w);
2017 	}
2018 
2019 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
2020 
2021 	/* This has to be the last RPC in the array to make sure all bdevs have finished the examine process */
2022 	spdk_json_write_object_begin(w);
2023 	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
2024 	spdk_json_write_object_end(w);
2025 
2026 	spdk_json_write_array_end(w);
2027 }
2028 
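/* Note on ordering (added for clarity): the array written above is replayed on load in
 * the same order it is emitted here - global bdev_set_options first, then the examine
 * allowlist, per-module config, per-bdev config (including QoS and histogram settings),
 * and finally bdev_wait_for_examine so that later configuration only runs after all
 * bdevs have been examined.
 */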
2029 static void
2030 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
2031 {
2032 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2033 	struct spdk_bdev_io *bdev_io;
2034 
2035 	spdk_iobuf_channel_fini(&ch->iobuf);
2036 
2037 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
2038 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2039 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2040 		ch->per_thread_cache_count--;
2041 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2042 	}
2043 
2044 	assert(ch->per_thread_cache_count == 0);
2045 }
2046 
2047 static int
2048 bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
2049 {
2050 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2051 	struct spdk_bdev_io *bdev_io;
2052 	uint32_t i;
2053 	int rc;
2054 
2055 	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev",
2056 				     g_bdev_opts.iobuf_small_cache_size,
2057 				     g_bdev_opts.iobuf_large_cache_size);
2058 	if (rc != 0) {
2059 		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
2060 		return -1;
2061 	}
2062 
2063 	STAILQ_INIT(&ch->per_thread_cache);
2064 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
2065 
2066 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
2067 	ch->per_thread_cache_count = 0;
2068 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
2069 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2070 		if (bdev_io == NULL) {
2071 			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
2072 			assert(false);
2073 			bdev_mgmt_channel_destroy(io_device, ctx_buf);
2074 			return -1;
2075 		}
2076 		ch->per_thread_cache_count++;
2077 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2078 	}
2079 
2080 	TAILQ_INIT(&ch->shared_resources);
2081 	TAILQ_INIT(&ch->io_wait_queue);
2082 
2083 	return 0;
2084 }
2085 
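/* Sizing note (rough arithmetic, using the default option values): each thread that
 * creates a management channel reserves bdev_io_cache_size entries from the global
 * bdev_io_pool up front. With a cache of 256 entries per thread, 16 threads pin
 * 16 * 256 = 4096 bdev_ios; if bdev_io_pool_size is too small for the number of
 * threads, channel creation fails with the error logged above and bdev_set_options
 * must be used to enlarge the pool.
 */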
2086 static void
2087 bdev_init_complete(int rc)
2088 {
2089 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
2090 	void *cb_arg = g_init_cb_arg;
2091 	struct spdk_bdev_module *m;
2092 
2093 	g_bdev_mgr.init_complete = true;
2094 	g_init_cb_fn = NULL;
2095 	g_init_cb_arg = NULL;
2096 
2097 	/*
2098 	 * For modules that need to know when subsystem init is complete,
2099 	 * inform them now.
2100 	 */
2101 	if (rc == 0) {
2102 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2103 			if (m->init_complete) {
2104 				m->init_complete();
2105 			}
2106 		}
2107 	}
2108 
2109 	cb_fn(cb_arg, rc);
2110 }
2111 
2112 static bool
2113 bdev_module_all_actions_completed(void)
2114 {
2115 	struct spdk_bdev_module *m;
2116 
2117 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2118 		if (m->internal.action_in_progress > 0) {
2119 			return false;
2120 		}
2121 	}
2122 	return true;
2123 }
2124 
2125 static void
2126 bdev_module_action_complete(void)
2127 {
2128 	/*
2129 	 * Don't finish bdev subsystem initialization if
2130 	 * module pre-initialization is still in progress, or
2131 	 * the subsystem has already been initialized.
2132 	 */
2133 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
2134 		return;
2135 	}
2136 
2137 	/*
2138 	 * Check all bdev modules for inits/examinations in progress. If any
2139 	 * exist, return immediately since we cannot finish bdev subsystem
2140 	 * initialization until all are completed.
2141 	 */
2142 	if (!bdev_module_all_actions_completed()) {
2143 		return;
2144 	}
2145 
2146 	/*
2147 	 * Modules already finished initialization - now that all
2148 	 * the bdev modules have finished their asynchronous I/O
2149 	 * processing, the entire bdev layer can be marked as complete.
2150 	 */
2151 	bdev_init_complete(0);
2152 }
2153 
2154 static void
2155 bdev_module_action_done(struct spdk_bdev_module *module)
2156 {
2157 	spdk_spin_lock(&module->internal.spinlock);
2158 	assert(module->internal.action_in_progress > 0);
2159 	module->internal.action_in_progress--;
2160 	spdk_spin_unlock(&module->internal.spinlock);
2161 	bdev_module_action_complete();
2162 }
2163 
2164 void
2165 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
2166 {
2167 	assert(module->async_init);
2168 	bdev_module_action_done(module);
2169 }
2170 
2171 void
2172 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
2173 {
2174 	bdev_module_action_done(module);
2175 }
2176 
2177 /** The last initialized bdev module */
2178 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
2179 
2180 static void
2181 bdev_init_failed(void *cb_arg)
2182 {
2183 	struct spdk_bdev_module *module = cb_arg;
2184 
2185 	spdk_spin_lock(&module->internal.spinlock);
2186 	assert(module->internal.action_in_progress > 0);
2187 	module->internal.action_in_progress--;
2188 	spdk_spin_unlock(&module->internal.spinlock);
2189 	bdev_init_complete(-1);
2190 }
2191 
2192 static int
2193 bdev_modules_init(void)
2194 {
2195 	struct spdk_bdev_module *module;
2196 	int rc = 0;
2197 
2198 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2199 		g_resume_bdev_module = module;
2200 		if (module->async_init) {
2201 			spdk_spin_lock(&module->internal.spinlock);
2202 			module->internal.action_in_progress = 1;
2203 			spdk_spin_unlock(&module->internal.spinlock);
2204 		}
2205 		rc = module->module_init();
2206 		if (rc != 0) {
2207 			/* Bump action_in_progress to prevent other modules from completing modules_init.
2208 			 * Send a message to defer application shutdown until resources are cleaned up */
2209 			spdk_spin_lock(&module->internal.spinlock);
2210 			module->internal.action_in_progress = 1;
2211 			spdk_spin_unlock(&module->internal.spinlock);
2212 			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
2213 			return rc;
2214 		}
2215 	}
2216 
2217 	g_resume_bdev_module = NULL;
2218 	return 0;
2219 }
2220 
2221 void
2222 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
2223 {
2224 	int rc = 0;
2225 	char mempool_name[32];
2226 
2227 	assert(cb_fn != NULL);
2228 
2229 	g_init_cb_fn = cb_fn;
2230 	g_init_cb_arg = cb_arg;
2231 
2232 	spdk_notify_type_register("bdev_register");
2233 	spdk_notify_type_register("bdev_unregister");
2234 
2235 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
2236 
2237 	rc = spdk_iobuf_register_module("bdev");
2238 	if (rc != 0) {
2239 		SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
2240 		bdev_init_complete(-1);
2241 		return;
2242 	}
2243 
2244 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
2245 				  g_bdev_opts.bdev_io_pool_size,
2246 				  sizeof(struct spdk_bdev_io) +
2247 				  bdev_module_get_max_ctx_size(),
2248 				  0,
2249 				  SPDK_ENV_NUMA_ID_ANY);
2250 
2251 	if (g_bdev_mgr.bdev_io_pool == NULL) {
2252 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
2253 		bdev_init_complete(-1);
2254 		return;
2255 	}
2256 
2257 	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
2258 					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
2259 	if (!g_bdev_mgr.zero_buffer) {
2260 		SPDK_ERRLOG("create bdev zero buffer failed\n");
2261 		bdev_init_complete(-1);
2262 		return;
2263 	}
2264 
2265 #ifdef SPDK_CONFIG_VTUNE
2266 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
2267 #endif
2268 
2269 	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
2270 				bdev_mgmt_channel_destroy,
2271 				sizeof(struct spdk_bdev_mgmt_channel),
2272 				"bdev_mgr");
2273 
2274 	rc = bdev_modules_init();
2275 	g_bdev_mgr.module_init_complete = true;
2276 	if (rc != 0) {
2277 		SPDK_ERRLOG("bdev modules init failed\n");
2278 		return;
2279 	}
2280 
2281 	bdev_module_action_complete();
2282 }
2283 
2284 static void
2285 bdev_mgr_unregister_cb(void *io_device)
2286 {
2287 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
2288 
2289 	if (g_bdev_mgr.bdev_io_pool) {
2290 		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
2291 			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
2292 				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
2293 				    g_bdev_opts.bdev_io_pool_size);
2294 		}
2295 
2296 		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
2297 	}
2298 
2299 	spdk_free(g_bdev_mgr.zero_buffer);
2300 
2301 	bdev_examine_allowlist_free();
2302 
2303 	cb_fn(g_fini_cb_arg);
2304 	g_fini_cb_fn = NULL;
2305 	g_fini_cb_arg = NULL;
2306 	g_bdev_mgr.init_complete = false;
2307 	g_bdev_mgr.module_init_complete = false;
2308 }
2309 
2310 static void
2311 bdev_module_fini_iter(void *arg)
2312 {
2313 	struct spdk_bdev_module *bdev_module;
2314 
2315 	/* FIXME: Handling initialization failures is currently broken,
2316 	 * so we won't even try cleaning up after successfully
2317 	 * initialized modules. If module_init_complete is false,
2318 	 * just call bdev_mgr_unregister_cb.
2319 	 */
2320 	if (!g_bdev_mgr.module_init_complete) {
2321 		bdev_mgr_unregister_cb(NULL);
2322 		return;
2323 	}
2324 
2325 	/* Start iterating from the last touched module */
2326 	if (!g_resume_bdev_module) {
2327 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2328 	} else {
2329 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
2330 					 internal.tailq);
2331 	}
2332 
2333 	while (bdev_module) {
2334 		if (bdev_module->async_fini) {
2335 			/* Save our place so we can resume later. We must
2336 			 * save the variable here, before calling module_fini()
2337 			 * below, because in some cases the module may immediately
2338 			 * call spdk_bdev_module_fini_done() and re-enter
2339 			 * this function to continue iterating. */
2340 			g_resume_bdev_module = bdev_module;
2341 		}
2342 
2343 		if (bdev_module->module_fini) {
2344 			bdev_module->module_fini();
2345 		}
2346 
2347 		if (bdev_module->async_fini) {
2348 			return;
2349 		}
2350 
2351 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
2352 					 internal.tailq);
2353 	}
2354 
2355 	g_resume_bdev_module = NULL;
2356 	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
2357 }
2358 
2359 void
2360 spdk_bdev_module_fini_done(void)
2361 {
2362 	if (spdk_get_thread() != g_fini_thread) {
2363 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
2364 	} else {
2365 		bdev_module_fini_iter(NULL);
2366 	}
2367 }
2368 
2369 static void
2370 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
2371 {
2372 	struct spdk_bdev *bdev = cb_arg;
2373 
2374 	if (bdeverrno && bdev) {
2375 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
2376 			     bdev->name);
2377 
2378 		/*
2379 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
2380 		 *  bdev; try to continue by manually removing this bdev from the list and moving
2381 		 *  on to the next bdev in the list.
2382 		 */
2383 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2384 	}
2385 
2386 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2387 		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2388 		/*
2389 		 * Bdev module finish needs to be deferred as we might be in the middle of some context
2390 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2391 		 * after returning.
2392 		 */
2393 		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2394 		return;
2395 	}
2396 
2397 	/*
2398 	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
2399 	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2400 	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
2401 	 * base bdevs.
2402 	 *
2403 	 * Also, walk the list in the reverse order.
2404 	 */
2405 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2406 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2407 		spdk_spin_lock(&bdev->internal.spinlock);
2408 		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2409 			LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2410 			spdk_spin_unlock(&bdev->internal.spinlock);
2411 			continue;
2412 		}
2413 		spdk_spin_unlock(&bdev->internal.spinlock);
2414 
2415 		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2416 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2417 		return;
2418 	}
2419 
2420 	/*
2421 	 * If any bdev fails to unclaim its underlying bdev properly, we may face the
2422 	 * case of a bdev list consisting of claimed bdevs only (if claims are managed
2423 	 * correctly, this would mean there's a loop in the claims graph, which is
2424 	 * clearly impossible). In that case, warn and unregister the last bdev on the list.
2425 	 */
2426 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2427 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2428 		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2429 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2430 		return;
2431 	}
2432 }
2433 
2434 static void
2435 bdev_module_fini_start_iter(void *arg)
2436 {
2437 	struct spdk_bdev_module *bdev_module;
2438 
2439 	if (!g_resume_bdev_module) {
2440 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2441 	} else {
2442 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2443 	}
2444 
2445 	while (bdev_module) {
2446 		if (bdev_module->async_fini_start) {
2447 			/* Save our place so we can resume later. We must
2448 			 * save the variable here, before calling fini_start()
2449 			 * below, because in some cases the module may immediately
2450 			 * call spdk_bdev_module_fini_start_done() and re-enter
2451 			 * this function to continue iterating. */
2452 			g_resume_bdev_module = bdev_module;
2453 		}
2454 
2455 		if (bdev_module->fini_start) {
2456 			bdev_module->fini_start();
2457 		}
2458 
2459 		if (bdev_module->async_fini_start) {
2460 			return;
2461 		}
2462 
2463 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
2464 	}
2465 
2466 	g_resume_bdev_module = NULL;
2467 
2468 	bdev_finish_unregister_bdevs_iter(NULL, 0);
2469 }
2470 
2471 void
2472 spdk_bdev_module_fini_start_done(void)
2473 {
2474 	if (spdk_get_thread() != g_fini_thread) {
2475 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
2476 	} else {
2477 		bdev_module_fini_start_iter(NULL);
2478 	}
2479 }
2480 
2481 static void
2482 bdev_finish_wait_for_examine_done(void *cb_arg)
2483 {
2484 	bdev_module_fini_start_iter(NULL);
2485 }
2486 
2487 static void bdev_open_async_fini(void);
2488 
2489 void
2490 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
2491 {
2492 	int rc;
2493 
2494 	assert(cb_fn != NULL);
2495 
2496 	g_fini_thread = spdk_get_thread();
2497 
2498 	g_fini_cb_fn = cb_fn;
2499 	g_fini_cb_arg = cb_arg;
2500 
2501 	bdev_open_async_fini();
2502 
2503 	rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
2504 	if (rc != 0) {
2505 		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
2506 		bdev_finish_wait_for_examine_done(NULL);
2507 	}
2508 }
2509 
2510 struct spdk_bdev_io *
2511 bdev_channel_get_io(struct spdk_bdev_channel *channel)
2512 {
2513 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
2514 	struct spdk_bdev_io *bdev_io;
2515 
2516 	if (ch->per_thread_cache_count > 0) {
2517 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2518 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2519 		ch->per_thread_cache_count--;
2520 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
2521 		/*
2522 		 * Don't try to look for bdev_ios in the global pool if there are
2523 		 * waiters on bdev_ios - we don't want this caller to jump the line.
2524 		 */
2525 		bdev_io = NULL;
2526 	} else {
2527 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2528 	}
2529 
2530 	return bdev_io;
2531 }
2532 
2533 void
2534 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2535 {
2536 	struct spdk_bdev_mgmt_channel *ch;
2537 
2538 	assert(bdev_io != NULL);
2539 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
2540 
2541 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
2542 
2543 	if (bdev_io->internal.f.has_buf) {
2544 		bdev_io_put_buf(bdev_io);
2545 	}
2546 
2547 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
2548 		ch->per_thread_cache_count++;
2549 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2550 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
2551 			struct spdk_bdev_io_wait_entry *entry;
2552 
2553 			entry = TAILQ_FIRST(&ch->io_wait_queue);
2554 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
2555 			entry->cb_fn(entry->cb_arg);
2556 		}
2557 	} else {
2558 		/* We should never have a full cache with entries on the io wait queue. */
2559 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
2560 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2561 	}
2562 }
2563 
2564 static bool
2565 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2566 {
2567 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2568 
2569 	switch (limit) {
2570 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2571 		return true;
2572 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2573 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2574 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2575 		return false;
2576 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2577 	default:
2578 		return false;
2579 	}
2580 }
2581 
2582 static bool
2583 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2584 {
2585 	switch (bdev_io->type) {
2586 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2587 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2588 	case SPDK_BDEV_IO_TYPE_READ:
2589 	case SPDK_BDEV_IO_TYPE_WRITE:
2590 		return true;
2591 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2592 		if (bdev_io->u.bdev.zcopy.start) {
2593 			return true;
2594 		} else {
2595 			return false;
2596 		}
2597 	default:
2598 		return false;
2599 	}
2600 }
2601 
2602 static bool
2603 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2604 {
2605 	switch (bdev_io->type) {
2606 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2607 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2608 		/* Bit 1 (0x2) set for read operation */
2609 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2610 			return true;
2611 		} else {
2612 			return false;
2613 		}
2614 	case SPDK_BDEV_IO_TYPE_READ:
2615 		return true;
2616 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2617 		/* Populate to read from disk */
2618 		if (bdev_io->u.bdev.zcopy.populate) {
2619 			return true;
2620 		} else {
2621 			return false;
2622 		}
2623 	default:
2624 		return false;
2625 	}
2626 }
2627 
2628 static uint64_t
2629 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2630 {
2631 	struct spdk_bdev	*bdev = bdev_io->bdev;
2632 
2633 	switch (bdev_io->type) {
2634 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2635 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2636 		return bdev_io->u.nvme_passthru.nbytes;
2637 	case SPDK_BDEV_IO_TYPE_READ:
2638 	case SPDK_BDEV_IO_TYPE_WRITE:
2639 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2640 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2641 		/* Track the data in the start phase only */
2642 		if (bdev_io->u.bdev.zcopy.start) {
2643 			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2644 		} else {
2645 			return 0;
2646 		}
2647 	default:
2648 		return 0;
2649 	}
2650 }
2651 
2652 static inline bool
2653 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2654 {
2655 	int64_t remaining_this_timeslice;
2656 
2657 	if (!limit->max_per_timeslice) {
2658 		/* The QoS is disabled */
2659 		return false;
2660 	}
2661 
2662 	remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2663 				   __ATOMIC_RELAXED);
2664 	if (remaining_this_timeslice + (int64_t)delta > 0) {
2665 		/* There was still a quota for this delta -> the IO shouldn't be queued
2666 		 *
2667 		 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
2668 		 * quota can be allowed once in a while. Such an overrun is then taken into account in
2669 		 * the QoS poller, where the next timeslice quota is calculated.
2670 		 */
2671 		return false;
2672 	}
2673 
2674 	/* There was no quota for this delta -> the IO should be queued
2675 	 * The remaining_this_timeslice must be rewound so it reflects the real
2676 	 * amount of IOs or bytes allowed.
2677 	 */
2678 	__atomic_add_fetch(
2679 		&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2680 	return true;
2681 }
2682 
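/* Worked example (illustrative numbers): with a byte limit and 4096 bytes left in the
 * timeslice, an 8192-byte I/O makes remaining_this_timeslice go from 4096 to -4096.
 * Since remaining + delta == 4096 > 0, the I/O is still admitted (slight overrun), and
 * the -4096 deficit carries over, shrinking the quota refilled for the next timeslice.
 * A second 8192-byte I/O in the same timeslice would see remaining + delta == -4096
 * and be queued after its charge is rewound.
 */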
2683 static inline void
2684 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2685 {
2686 	__atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2687 }
2688 
2689 static bool
2690 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2691 {
2692 	return bdev_qos_rw_queue_io(limit, io, 1);
2693 }
2694 
2695 static void
2696 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2697 {
2698 	bdev_qos_rw_rewind_io(limit, io, 1);
2699 }
2700 
2701 static bool
2702 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2703 {
2704 	return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io));
2705 }
2706 
2707 static void
2708 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2709 {
2710 	bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2711 }
2712 
2713 static bool
2714 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2715 {
2716 	if (bdev_is_read_io(io) == false) {
2717 		return false;
2718 	}
2719 
2720 	return bdev_qos_rw_bps_queue(limit, io);
2721 }
2722 
2723 static void
2724 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2725 {
2726 	if (bdev_is_read_io(io) != false) {
2727 		bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2728 	}
2729 }
2730 
2731 static bool
2732 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2733 {
2734 	if (bdev_is_read_io(io) == true) {
2735 		return false;
2736 	}
2737 
2738 	return bdev_qos_rw_bps_queue(limit, io);
2739 }
2740 
2741 static void
2742 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2743 {
2744 	if (bdev_is_read_io(io) != true) {
2745 		bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2746 	}
2747 }
2748 
2749 static void
2750 bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2751 {
2752 	int i;
2753 
2754 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2755 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2756 			qos->rate_limits[i].queue_io = NULL;
2757 			continue;
2758 		}
2759 
2760 		switch (i) {
2761 		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2762 			qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue;
2763 			qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota;
2764 			break;
2765 		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2766 			qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue;
2767 			qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota;
2768 			break;
2769 		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2770 			qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue;
2771 			qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota;
2772 			break;
2773 		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2774 			qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue;
2775 			qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota;
2776 			break;
2777 		default:
2778 			break;
2779 		}
2780 	}
2781 }
2782 
2783 static void
2784 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2785 			    struct spdk_bdev_io *bdev_io,
2786 			    enum spdk_bdev_io_status status)
2787 {
2788 	bdev_io->internal.f.in_submit_request = true;
2789 	bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource);
2790 	spdk_bdev_io_complete(bdev_io, status);
2791 	bdev_io->internal.f.in_submit_request = false;
2792 }
2793 
2794 static inline void
2795 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2796 {
2797 	struct spdk_bdev *bdev = bdev_io->bdev;
2798 	struct spdk_io_channel *ch = bdev_ch->channel;
2799 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2800 
2801 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2802 		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2803 		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2804 
2805 		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2806 		    bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2807 			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
2808 						    SPDK_BDEV_IO_STATUS_SUCCESS);
2809 			return;
2810 		}
2811 	}
2812 
2813 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2814 			  bdev_io->bdev->split_on_write_unit &&
2815 			  bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2816 		SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n",
2817 			    bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2818 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2819 		return;
2820 	}
2821 
2822 	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2823 		bdev_io_increment_outstanding(bdev_ch, shared_resource);
2824 		bdev_io->internal.f.in_submit_request = true;
2825 		bdev_submit_request(bdev, ch, bdev_io);
2826 		bdev_io->internal.f.in_submit_request = false;
2827 	} else {
2828 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
2829 		if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) {
2830 			/* Special case: there are queued NOMEM I/Os but no outstanding I/Os whose
2831 			 * completions could trigger a retry of the queued I/Os */
2832 			bdev_shared_ch_retry_io(shared_resource);
2833 		}
2834 	}
2835 }
2836 
2837 static bool
2838 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2839 {
2840 	int i;
2841 
2842 	if (bdev_qos_io_to_limit(bdev_io) == true) {
2843 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2844 			if (!qos->rate_limits[i].queue_io) {
2845 				continue;
2846 			}
2847 
2848 			if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2849 							 bdev_io) == true) {
2850 				for (i -= 1; i >= 0 ; i--) {
2851 					if (!qos->rate_limits[i].queue_io) {
2852 						continue;
2853 					}
2854 
2855 					qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io);
2856 				}
2857 				return true;
2858 			}
2859 		}
2860 	}
2861 
2862 	return false;
2863 }
2864 
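/* Note (added for clarity): if a later limiter rejects the I/O, the quota that was
 * already charged against the earlier limiters is rewound in the loop above. For
 * example, if the IOPS limiter admits the I/O but the bandwidth limiter does not, the
 * one-IO charge is returned before the I/O is left on the QoS queue, so the counters
 * stay consistent for the next attempt.
 */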
2865 static int
2866 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
2867 {
2868 	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
2869 	int				submitted_ios = 0;
2870 
2871 	TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) {
2872 		if (!bdev_qos_queue_io(qos, bdev_io)) {
2873 			TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link);
2874 			bdev_io_do_submit(ch, bdev_io);
2875 
2876 			submitted_ios++;
2877 		}
2878 	}
2879 
2880 	return submitted_ios;
2881 }
2882 
2883 static void
2884 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
2885 {
2886 	int rc;
2887 
2888 	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
2889 	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
2890 	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
2891 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
2892 				     &bdev_io->internal.waitq_entry);
2893 	if (rc != 0) {
2894 		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
2895 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2896 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2897 	}
2898 }
2899 
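/* Usage sketch (application-side, with hypothetical callback names): the same wait
 * mechanism is available to callers through spdk_bdev_queue_io_wait() when a submission
 * API returns -ENOMEM, e.g.:
 *
 *     struct spdk_bdev_io_wait_entry entry;
 *
 *     entry.bdev = bdev;
 *     entry.cb_fn = retry_my_read;   // resubmits the original request
 *     entry.cb_arg = my_ctx;
 *     spdk_bdev_queue_io_wait(bdev, io_ch, &entry);
 *
 * The callback fires once a bdev_io is returned to this thread's cache, at which point
 * the request can be resubmitted.
 */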
2900 static bool
2901 bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
2902 {
2903 	uint32_t io_boundary;
2904 	struct spdk_bdev *bdev = bdev_io->bdev;
2905 	uint32_t max_segment_size = bdev->max_segment_size;
2906 	uint32_t max_size = bdev->max_rw_size;
2907 	int max_segs = bdev->max_num_segments;
2908 
2909 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
2910 		io_boundary = bdev->write_unit_size;
2911 	} else if (bdev->split_on_optimal_io_boundary) {
2912 		io_boundary = bdev->optimal_io_boundary;
2913 	} else {
2914 		io_boundary = 0;
2915 	}
2916 
2917 	if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) {
2918 		return false;
2919 	}
2920 
2921 	if (io_boundary) {
2922 		uint64_t start_stripe, end_stripe;
2923 
2924 		start_stripe = bdev_io->u.bdev.offset_blocks;
2925 		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
2926 		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
2927 		if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
2928 			start_stripe >>= spdk_u32log2(io_boundary);
2929 			end_stripe >>= spdk_u32log2(io_boundary);
2930 		} else {
2931 			start_stripe /= io_boundary;
2932 			end_stripe /= io_boundary;
2933 		}
2934 
2935 		if (start_stripe != end_stripe) {
2936 			return true;
2937 		}
2938 	}
2939 
2940 	if (max_segs) {
2941 		if (bdev_io->u.bdev.iovcnt > max_segs) {
2942 			return true;
2943 		}
2944 	}
2945 
2946 	if (max_segment_size) {
2947 		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
2948 			if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) {
2949 				return true;
2950 			}
2951 		}
2952 	}
2953 
2954 	if (max_size) {
2955 		if (bdev_io->u.bdev.num_blocks > max_size) {
2956 			return true;
2957 		}
2958 	}
2959 
2960 	return false;
2961 }
2962 
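/* Example (illustrative numbers): with optimal_io_boundary == 128 blocks, a request at
 * offset_blocks == 100 for num_blocks == 64 spans blocks [100, 163]. start_stripe is
 * 100 >> 7 == 0 and end_stripe is 163 >> 7 == 1, so the stripes differ and the I/O is
 * split at block 128. The same request at offset 0 would stay within stripe 0 and not
 * be split (assuming no other limits apply).
 */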
2963 static bool
2964 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
2965 {
2966 	uint32_t num_unmap_segments;
2967 
2968 	if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
2969 		return false;
2970 	}
2971 	num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
2972 	if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
2973 		return true;
2974 	}
2975 
2976 	return false;
2977 }
2978 
2979 static bool
2980 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
2981 {
2982 	if (!bdev_io->bdev->max_write_zeroes) {
2983 		return false;
2984 	}
2985 
2986 	if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
2987 		return true;
2988 	}
2989 
2990 	return false;
2991 }
2992 
2993 static bool
2994 bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
2995 {
2996 	if (bdev_io->bdev->max_copy != 0 &&
2997 	    bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
2998 		return true;
2999 	}
3000 
3001 	return false;
3002 }
3003 
3004 static bool
3005 bdev_io_should_split(struct spdk_bdev_io *bdev_io)
3006 {
3007 	switch (bdev_io->type) {
3008 	case SPDK_BDEV_IO_TYPE_READ:
3009 	case SPDK_BDEV_IO_TYPE_WRITE:
3010 		return bdev_rw_should_split(bdev_io);
3011 	case SPDK_BDEV_IO_TYPE_UNMAP:
3012 		return bdev_unmap_should_split(bdev_io);
3013 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3014 		return bdev_write_zeroes_should_split(bdev_io);
3015 	case SPDK_BDEV_IO_TYPE_COPY:
3016 		return bdev_copy_should_split(bdev_io);
3017 	default:
3018 		return false;
3019 	}
3020 }
3021 
3022 static uint32_t
3023 _to_next_boundary(uint64_t offset, uint32_t boundary)
3024 {
3025 	return (boundary - (offset % boundary));
3026 }
3027 
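/* Example: for offset == 100 and boundary == 128, this returns 128 - (100 % 128) == 28,
 * i.e. the number of blocks remaining before the next boundary is crossed.
 */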
3028 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
3029 
3030 static void _bdev_rw_split(void *_bdev_io);
3031 
3032 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
3033 
3034 static void
3035 _bdev_unmap_split(void *_bdev_io)
3036 {
3037 	return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
3038 }
3039 
3040 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
3041 
3042 static void
3043 _bdev_write_zeroes_split(void *_bdev_io)
3044 {
3045 	return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
3046 }
3047 
3048 static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
3049 
3050 static void
3051 _bdev_copy_split(void *_bdev_io)
3052 {
3053 	return bdev_copy_split((struct spdk_bdev_io *)_bdev_io);
3054 }
3055 
3056 static int
3057 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
3058 		     uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
3059 {
3060 	int rc;
3061 	uint64_t current_offset, current_remaining, current_src_offset;
3062 	spdk_bdev_io_wait_cb io_wait_fn;
3063 
3064 	current_offset = *offset;
3065 	current_remaining = *remaining;
3066 
3067 	assert(bdev_io->internal.f.split);
3068 
3069 	bdev_io->internal.split.outstanding++;
3070 
3071 	io_wait_fn = _bdev_rw_split;
3072 	switch (bdev_io->type) {
3073 	case SPDK_BDEV_IO_TYPE_READ:
3074 		assert(bdev_io->u.bdev.accel_sequence == NULL);
3075 		rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
3076 					       spdk_io_channel_from_ctx(bdev_io->internal.ch),
3077 					       iov, iovcnt, md_buf, current_offset,
3078 					       num_blocks,
3079 					       bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3080 					       bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3081 					       NULL,
3082 					       bdev_io->u.bdev.dif_check_flags,
3083 					       bdev_io_split_done, bdev_io);
3084 		break;
3085 	case SPDK_BDEV_IO_TYPE_WRITE:
3086 		assert(bdev_io->u.bdev.accel_sequence == NULL);
3087 		rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
3088 						spdk_io_channel_from_ctx(bdev_io->internal.ch),
3089 						iov, iovcnt, md_buf, current_offset,
3090 						num_blocks,
3091 						bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3092 						bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3093 						NULL,
3094 						bdev_io->u.bdev.dif_check_flags,
3095 						bdev_io->u.bdev.nvme_cdw12.raw,
3096 						bdev_io->u.bdev.nvme_cdw13.raw,
3097 						bdev_io_split_done, bdev_io);
3098 		break;
3099 	case SPDK_BDEV_IO_TYPE_UNMAP:
3100 		io_wait_fn = _bdev_unmap_split;
3101 		rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
3102 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
3103 					    current_offset, num_blocks,
3104 					    bdev_io_split_done, bdev_io);
3105 		break;
3106 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3107 		io_wait_fn = _bdev_write_zeroes_split;
3108 		rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
3109 						   spdk_io_channel_from_ctx(bdev_io->internal.ch),
3110 						   current_offset, num_blocks,
3111 						   bdev_io_split_done, bdev_io);
3112 		break;
3113 	case SPDK_BDEV_IO_TYPE_COPY:
3114 		io_wait_fn = _bdev_copy_split;
3115 		current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
3116 				     (current_offset - bdev_io->u.bdev.offset_blocks);
3117 		rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
3118 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
3119 					   current_offset, current_src_offset, num_blocks,
3120 					   bdev_io_split_done, bdev_io);
3121 		break;
3122 	default:
3123 		assert(false);
3124 		rc = -EINVAL;
3125 		break;
3126 	}
3127 
3128 	if (rc == 0) {
3129 		current_offset += num_blocks;
3130 		current_remaining -= num_blocks;
3131 		bdev_io->internal.split.current_offset_blocks = current_offset;
3132 		bdev_io->internal.split.remaining_num_blocks = current_remaining;
3133 		*offset = current_offset;
3134 		*remaining = current_remaining;
3135 	} else {
3136 		bdev_io->internal.split.outstanding--;
3137 		if (rc == -ENOMEM) {
3138 			if (bdev_io->internal.split.outstanding == 0) {
3139 				/* No I/O is outstanding. Hence we should wait here. */
3140 				bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
3141 			}
3142 		} else {
3143 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3144 			if (bdev_io->internal.split.outstanding == 0) {
3145 				bdev_ch_remove_from_io_submitted(bdev_io);
3146 				spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3147 						  0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3148 						  bdev_io->internal.ch->queue_depth);
3149 				bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3150 			}
3151 		}
3152 	}
3153 
3154 	return rc;
3155 }
3156 
3157 static void
3158 _bdev_rw_split(void *_bdev_io)
3159 {
3160 	struct iovec *parent_iov, *iov;
3161 	struct spdk_bdev_io *bdev_io = _bdev_io;
3162 	struct spdk_bdev *bdev = bdev_io->bdev;
3163 	uint64_t parent_offset, current_offset, remaining;
3164 	uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
3165 	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
3166 	uint32_t iovcnt, iov_len, child_iovsize;
3167 	uint32_t blocklen = bdev->blocklen;
3168 	uint32_t io_boundary;
3169 	uint32_t max_segment_size = bdev->max_segment_size;
3170 	uint32_t max_child_iovcnt = bdev->max_num_segments;
3171 	uint32_t max_size = bdev->max_rw_size;
3172 	void *md_buf = NULL;
3173 	int rc;
3174 
3175 	max_size = max_size ? max_size : UINT32_MAX;
3176 	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
3177 	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
3178 			   SPDK_BDEV_IO_NUM_CHILD_IOV;
3179 
3180 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3181 		io_boundary = bdev->write_unit_size;
3182 	} else if (bdev->split_on_optimal_io_boundary) {
3183 		io_boundary = bdev->optimal_io_boundary;
3184 	} else {
3185 		io_boundary = UINT32_MAX;
3186 	}
3187 
3188 	assert(bdev_io->internal.f.split);
3189 
3190 	remaining = bdev_io->internal.split.remaining_num_blocks;
3191 	current_offset = bdev_io->internal.split.current_offset_blocks;
3192 	parent_offset = bdev_io->u.bdev.offset_blocks;
3193 	parent_iov_offset = (current_offset - parent_offset) * blocklen;
3194 	parent_iovcnt = bdev_io->u.bdev.iovcnt;
3195 
3196 	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
3197 		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3198 		if (parent_iov_offset < parent_iov->iov_len) {
3199 			break;
3200 		}
3201 		parent_iov_offset -= parent_iov->iov_len;
3202 	}
3203 
3204 	child_iovcnt = 0;
3205 	while (remaining > 0 && parent_iovpos < parent_iovcnt &&
3206 	       child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
3207 		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
3208 		to_next_boundary = spdk_min(remaining, to_next_boundary);
3209 		to_next_boundary = spdk_min(max_size, to_next_boundary);
3210 		to_next_boundary_bytes = to_next_boundary * blocklen;
3211 
3212 		iov = &bdev_io->child_iov[child_iovcnt];
3213 		iovcnt = 0;
3214 
3215 		if (bdev_io->u.bdev.md_buf) {
3216 			md_buf = (char *)bdev_io->u.bdev.md_buf +
3217 				 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
3218 		}
3219 
3220 		child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
3221 		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
3222 		       iovcnt < child_iovsize) {
3223 			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3224 			iov_len = parent_iov->iov_len - parent_iov_offset;
3225 
3226 			iov_len = spdk_min(iov_len, max_segment_size);
3227 			iov_len = spdk_min(iov_len, to_next_boundary_bytes);
3228 			to_next_boundary_bytes -= iov_len;
3229 
3230 			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3231 			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3232 
3233 			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3234 				parent_iov_offset += iov_len;
3235 			} else {
3236 				parent_iovpos++;
3237 				parent_iov_offset = 0;
3238 			}
3239 			child_iovcnt++;
3240 			iovcnt++;
3241 		}
3242 
3243 		if (to_next_boundary_bytes > 0) {
3244 			/* We had to stop this child I/O early because we ran out of
3245 			 * child_iov space or were limited by max_num_segments.
3246 			 * Ensure the iovs are aligned to the block size and
3247 			 * then adjust to_next_boundary before starting the
3248 			 * child I/O.
3249 			 */
3250 			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3251 			       iovcnt == child_iovsize);
3252 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
3253 			if (to_last_block_bytes != 0) {
3254 				uint32_t child_iovpos = child_iovcnt - 1;
3255 				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV
3256 				 * so the loop will naturally end
3257 				 */
3258 
3259 				to_last_block_bytes = blocklen - to_last_block_bytes;
3260 				to_next_boundary_bytes += to_last_block_bytes;
3261 				while (to_last_block_bytes > 0 && iovcnt > 0) {
3262 					iov_len = spdk_min(to_last_block_bytes,
3263 							   bdev_io->child_iov[child_iovpos].iov_len);
3264 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3265 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3266 						child_iovpos--;
3267 						if (--iovcnt == 0) {
3268 							/* If the child IO is less than a block size, just return.
3269 							 * If the first child IO of any split round is less than
3270 							 * a block size, exit with an error.
3271 							 */
3272 							if (bdev_io->internal.split.outstanding == 0) {
3273 								SPDK_ERRLOG("The first child io was less than a block size\n");
3274 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3275 								bdev_ch_remove_from_io_submitted(bdev_io);
3276 								spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3277 										  0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3278 										  bdev_io->internal.ch->queue_depth);
3279 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3280 							}
3281 
3282 							return;
3283 						}
3284 					}
3285 
3286 					to_last_block_bytes -= iov_len;
3287 
3288 					if (parent_iov_offset == 0) {
3289 						parent_iovpos--;
3290 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3291 					}
3292 					parent_iov_offset -= iov_len;
3293 				}
3294 
3295 				assert(to_last_block_bytes == 0);
3296 			}
3297 			to_next_boundary -= to_next_boundary_bytes / blocklen;
3298 		}
3299 
3300 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3301 					  &current_offset, &remaining);
3302 		if (spdk_unlikely(rc)) {
3303 			return;
3304 		}
3305 	}
3306 }
3307 
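/* Example (illustrative): a 192-block write at offset 0 on a bdev with
 * optimal_io_boundary == 128 and split_on_optimal_io_boundary set is issued as two
 * children, [0, 128) and [128, 192), each built from slices of the parent iovs
 * (further limited by max_segment_size, max_num_segments and max_rw_size, when set).
 */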
3308 static void
3309 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3310 {
3311 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3312 	uint32_t num_children_reqs = 0;
3313 	int rc;
3314 
3315 	assert(bdev_io->internal.f.split);
3316 
3317 	offset = bdev_io->internal.split.current_offset_blocks;
3318 	remaining = bdev_io->internal.split.remaining_num_blocks;
3319 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3320 
3321 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3322 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3323 
3324 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3325 					  &offset, &remaining);
3326 		if (spdk_likely(rc == 0)) {
3327 			num_children_reqs++;
3328 		} else {
3329 			return;
3330 		}
3331 	}
3332 }
3333 
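/* Note (added for clarity): at most SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS
 * children are kept in flight per round, each covering up to
 * max_unmap * max_unmap_segments blocks; the splitting continues from
 * bdev_io_split_done() as children complete, until remaining_num_blocks reaches 0.
 */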
3334 static void
3335 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3336 {
3337 	uint64_t offset, write_zeroes_blocks, remaining;
3338 	uint32_t num_children_reqs = 0;
3339 	int rc;
3340 
3341 	assert(bdev_io->internal.f.split);
3342 
3343 	offset = bdev_io->internal.split.current_offset_blocks;
3344 	remaining = bdev_io->internal.split.remaining_num_blocks;
3345 
3346 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3347 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3348 
3349 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3350 					  &offset, &remaining);
3351 		if (spdk_likely(rc == 0)) {
3352 			num_children_reqs++;
3353 		} else {
3354 			return;
3355 		}
3356 	}
3357 }
3358 
3359 static void
3360 bdev_copy_split(struct spdk_bdev_io *bdev_io)
3361 {
3362 	uint64_t offset, copy_blocks, remaining;
3363 	uint32_t num_children_reqs = 0;
3364 	int rc;
3365 
3366 	assert(bdev_io->internal.f.split);
3367 
3368 	offset = bdev_io->internal.split.current_offset_blocks;
3369 	remaining = bdev_io->internal.split.remaining_num_blocks;
3370 
3371 	assert(bdev_io->bdev->max_copy != 0);
3372 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
3373 		copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
3374 
3375 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
3376 					  &offset, &remaining);
3377 		if (spdk_likely(rc == 0)) {
3378 			num_children_reqs++;
3379 		} else {
3380 			return;
3381 		}
3382 	}
3383 }
3384 
3385 static void
3386 parent_bdev_io_complete(void *ctx, int rc)
3387 {
3388 	struct spdk_bdev_io *parent_io = ctx;
3389 
3390 	if (rc) {
3391 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3392 	}
3393 
3394 	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
3395 			       parent_io->internal.caller_ctx);
3396 }
3397 
3398 static void
3399 bdev_io_complete_parent_sequence_cb(void *ctx, int status)
3400 {
3401 	struct spdk_bdev_io *bdev_io = ctx;
3402 
3403 	/* u.bdev.accel_sequence should have already been cleared at this point */
3404 	assert(bdev_io->u.bdev.accel_sequence == NULL);
3405 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
3406 	bdev_io->internal.f.has_accel_sequence = false;
3407 
3408 	if (spdk_unlikely(status != 0)) {
3409 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
3410 	}
3411 
3412 	parent_bdev_io_complete(bdev_io, status);
3413 }
3414 
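/* Completion callback for each child I/O produced by splitting.  A child failure marks the
 * parent as failed and stops further splitting by zeroing the remaining block count.  Once
 * all outstanding children have completed, either the parent I/O is completed or the next
 * round of child I/O is submitted.
 */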
3415 static void
3416 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3417 {
3418 	struct spdk_bdev_io *parent_io = cb_arg;
3419 
3420 	spdk_bdev_free_io(bdev_io);
3421 
3422 	assert(parent_io->internal.f.split);
3423 
3424 	if (!success) {
3425 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3426 		/* If any child I/O failed, stop the splitting process. */
3427 		parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks;
3428 		parent_io->internal.split.remaining_num_blocks = 0;
3429 	}
3430 	parent_io->internal.split.outstanding--;
3431 	if (parent_io->internal.split.outstanding != 0) {
3432 		return;
3433 	}
3434 
3435 	/*
3436 	 * Parent I/O finishes when all blocks are consumed.
3437 	 */
3438 	if (parent_io->internal.split.remaining_num_blocks == 0) {
3439 		assert(parent_io->internal.cb != bdev_io_split_done);
3440 		bdev_ch_remove_from_io_submitted(parent_io);
3441 		spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id,
3442 				  0, (uintptr_t)parent_io, parent_io->internal.caller_ctx,
3443 				  parent_io->internal.ch->queue_depth);
3444 
3445 		if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
3446 			if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) {
3447 				bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb);
3448 				return;
3449 			} else if (parent_io->internal.f.has_bounce_buf &&
3450 				   !bdev_io_use_accel_sequence(parent_io)) {
3451 				/* bdev IO will be completed in the callback */
3452 				_bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
3453 				return;
3454 			}
3455 		}
3456 
3457 		parent_bdev_io_complete(parent_io, 0);
3458 		return;
3459 	}
3460 
3461 	/*
3462 	 * Continue with the splitting process.  This function will complete the parent I/O if the
3463 	 * splitting is done.
3464 	 */
3465 	switch (parent_io->type) {
3466 	case SPDK_BDEV_IO_TYPE_READ:
3467 	case SPDK_BDEV_IO_TYPE_WRITE:
3468 		_bdev_rw_split(parent_io);
3469 		break;
3470 	case SPDK_BDEV_IO_TYPE_UNMAP:
3471 		bdev_unmap_split(parent_io);
3472 		break;
3473 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3474 		bdev_write_zeroes_split(parent_io);
3475 		break;
3476 	case SPDK_BDEV_IO_TYPE_COPY:
3477 		bdev_copy_split(parent_io);
3478 		break;
3479 	default:
3480 		assert(false);
3481 		break;
3482 	}
3483 }
3484 
3485 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3486 				     bool success);
3487 
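/* Begin splitting a parent I/O: initialize the split bookkeeping and dispatch to the
 * type-specific split routine.  Reads without a data buffer first obtain one via
 * spdk_bdev_io_get_buf() and continue splitting from the get-buf callback.
 */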
3488 static void
3489 bdev_io_split(struct spdk_bdev_io *bdev_io)
3490 {
3491 	assert(bdev_io_should_split(bdev_io));
3492 	assert(bdev_io->internal.f.split);
3493 
3494 	bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks;
3495 	bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks;
3496 	bdev_io->internal.split.outstanding = 0;
3497 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
3498 
3499 	switch (bdev_io->type) {
3500 	case SPDK_BDEV_IO_TYPE_READ:
3501 	case SPDK_BDEV_IO_TYPE_WRITE:
3502 		if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
3503 			_bdev_rw_split(bdev_io);
3504 		} else {
3505 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3506 			spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
3507 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3508 		}
3509 		break;
3510 	case SPDK_BDEV_IO_TYPE_UNMAP:
3511 		bdev_unmap_split(bdev_io);
3512 		break;
3513 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3514 		bdev_write_zeroes_split(bdev_io);
3515 		break;
3516 	case SPDK_BDEV_IO_TYPE_COPY:
3517 		bdev_copy_split(bdev_io);
3518 		break;
3519 	default:
3520 		assert(false);
3521 		break;
3522 	}
3523 }
3524 
3525 static void
3526 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
3527 {
3528 	if (!success) {
3529 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3530 		return;
3531 	}
3532 
3533 	_bdev_rw_split(bdev_io);
3534 }
3535 
3536 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't
3537  *  be inlined, at least on some compilers.
3538  */
3539 static inline void
3540 _bdev_io_submit(void *ctx)
3541 {
3542 	struct spdk_bdev_io *bdev_io = ctx;
3543 	struct spdk_bdev *bdev = bdev_io->bdev;
3544 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3545 
3546 	if (spdk_likely(bdev_ch->flags == 0)) {
3547 		bdev_io_do_submit(bdev_ch, bdev_io);
3548 		return;
3549 	}
3550 
3551 	if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
3552 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3553 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
3554 		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
3555 		    bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) {
3556 			_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3557 		} else {
3558 			TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link);
3559 			bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3560 		}
3561 	} else {
3562 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
3563 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3564 	}
3565 }
3566 
3567 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
3568 
3569 bool
3570 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
3571 {
3572 	if (range1->length == 0 || range2->length == 0) {
3573 		return false;
3574 	}
3575 
3576 	if (range1->offset + range1->length <= range2->offset) {
3577 		return false;
3578 	}
3579 
3580 	if (range2->offset + range2->length <= range1->offset) {
3581 		return false;
3582 	}
3583 
3584 	return true;
3585 }
3586 
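/* Return true if this I/O must wait because it may overlap a locked LBA range that it does
 * not own (i.e. it was not submitted on the owning channel with the locking context).
 */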
3587 static bool
3588 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
3589 {
3590 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3591 	struct lba_range r;
3592 
3593 	switch (bdev_io->type) {
3594 	case SPDK_BDEV_IO_TYPE_NVME_IO:
3595 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3596 		/* Don't try to decode the NVMe command - just assume worst-case and that
3597 		 * it overlaps a locked range.
3598 		 */
3599 		return true;
3600 	case SPDK_BDEV_IO_TYPE_READ:
3601 		if (!range->quiesce) {
3602 			return false;
3603 		}
3604 	/* fallthrough */
3605 	case SPDK_BDEV_IO_TYPE_WRITE:
3606 	case SPDK_BDEV_IO_TYPE_UNMAP:
3607 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3608 	case SPDK_BDEV_IO_TYPE_ZCOPY:
3609 	case SPDK_BDEV_IO_TYPE_COPY:
3610 		r.offset = bdev_io->u.bdev.offset_blocks;
3611 		r.length = bdev_io->u.bdev.num_blocks;
3612 		if (!bdev_lba_range_overlapped(range, &r)) {
3613 			/* This I/O doesn't overlap the specified LBA range. */
3614 			return false;
3615 		} else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
3616 			/* This I/O overlaps, but the I/O is on the same channel that locked this
3617 			 * range, and the caller_ctx is the same as the locked_ctx.  This means
3618 			 * that this I/O is associated with the lock, and is allowed to execute.
3619 			 */
3620 			return false;
3621 		} else {
3622 			return true;
3623 		}
3624 	default:
3625 		return false;
3626 	}
3627 }
3628 
3629 void
3630 bdev_io_submit(struct spdk_bdev_io *bdev_io)
3631 {
3632 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3633 
3634 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3635 
3636 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3637 		struct lba_range *range;
3638 
3639 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3640 			if (bdev_io_range_is_locked(bdev_io, range)) {
3641 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3642 				return;
3643 			}
3644 		}
3645 	}
3646 
3647 	bdev_ch_add_to_io_submitted(bdev_io);
3648 
3649 	bdev_io->internal.submit_tsc = spdk_get_ticks();
3650 	spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START,
3651 			      ch->trace_id, bdev_io->u.bdev.num_blocks,
3652 			      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3653 			      bdev_io->u.bdev.offset_blocks, ch->queue_depth);
3654 
3655 	if (bdev_io->internal.f.split) {
3656 		bdev_io_split(bdev_io);
3657 		return;
3658 	}
3659 
3660 	_bdev_io_submit(bdev_io);
3661 }
3662 
3663 static inline void
3664 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3665 {
3666 	/* The bdev doesn't support memory domains, so the buffers in this IO request can't be
3667 	 * accessed directly; we need to allocate bounce buffers before issuing the IO operation.
3668 	 * For a write operation we need to pull the buffers from the memory domain before
3669 	 * submitting the IO.  Once a read operation completes, we need to use the memory domain
3670 	 * push functionality to update the data in the original memory domain IO buffers.
3671 	 * This IO request will go through the regular IO flow, so clear the memory domain pointers. */
3672 	assert(bdev_io->internal.f.has_memory_domain);
3673 	bdev_io->u.bdev.memory_domain = NULL;
3674 	bdev_io->u.bdev.memory_domain_ctx = NULL;
3675 	_bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3676 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3677 }
3678 
3679 static inline void
3680 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3681 {
3682 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3683 	bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io);
3684 
3685 	if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3686 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3687 		bdev_io_complete_unsubmitted(bdev_io);
3688 		return;
3689 	}
3690 
3691 	/* We need to allocate a bounce buffer if the bdev doesn't support memory domains, or if it
3692 	 * does support them but we need to execute an accel sequence and the data buffer comes from
3693 	 * the accel memory domain (to avoid doing a push/pull from that domain).
3694 	 */
3695 	if (bdev_io_use_memory_domain(bdev_io)) {
3696 		if (!desc->memory_domains_supported ||
3697 		    (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) {
3698 			_bdev_io_ext_use_bounce_buffer(bdev_io);
3699 			return;
3700 		}
3701 	}
3702 
3703 	if (needs_exec) {
3704 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3705 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3706 			return;
3707 		}
3708 		/* For reads we'll execute the sequence after the data is read, so, for now, only
3709 		 * clear out the accel_sequence pointer and submit the IO. */
3710 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3711 		bdev_io->u.bdev.accel_sequence = NULL;
3712 	}
3713 
3714 	bdev_io_submit(bdev_io);
3715 }
3716 
3717 static void
3718 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
3719 {
3720 	struct spdk_bdev *bdev = bdev_io->bdev;
3721 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3722 	struct spdk_io_channel *ch = bdev_ch->channel;
3723 
3724 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3725 
3726 	bdev_io->internal.f.in_submit_request = true;
3727 	bdev_submit_request(bdev, ch, bdev_io);
3728 	bdev_io->internal.f.in_submit_request = false;
3729 }
3730 
3731 void
3732 bdev_io_init(struct spdk_bdev_io *bdev_io,
3733 	     struct spdk_bdev *bdev, void *cb_arg,
3734 	     spdk_bdev_io_completion_cb cb)
3735 {
3736 	bdev_io->bdev = bdev;
3737 	bdev_io->internal.f.raw = 0;
3738 	bdev_io->internal.caller_ctx = cb_arg;
3739 	bdev_io->internal.cb = cb;
3740 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3741 	bdev_io->internal.f.in_submit_request = false;
3742 	bdev_io->internal.error.nvme.cdw0 = 0;
3743 	bdev_io->num_retries = 0;
3744 	bdev_io->internal.get_buf_cb = NULL;
3745 	bdev_io->internal.get_aux_buf_cb = NULL;
3746 	bdev_io->internal.data_transfer_cpl = NULL;
3747 	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
3748 }
3749 
3750 static bool
3751 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3752 {
3753 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
3754 }
3755 
3756 bool
3757 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3758 {
3759 	bool supported;
3760 
3761 	supported = bdev_io_type_supported(bdev, io_type);
3762 
3763 	if (!supported) {
3764 		switch (io_type) {
3765 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3766 			/* The bdev layer will emulate write zeroes as long as write is supported. */
3767 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3768 			break;
3769 		default:
3770 			break;
3771 		}
3772 	}
3773 
3774 	return supported;
3775 }
3776 
3777 static const char *g_io_type_strings[] = {
3778 	[SPDK_BDEV_IO_TYPE_READ] = "read",
3779 	[SPDK_BDEV_IO_TYPE_WRITE] = "write",
3780 	[SPDK_BDEV_IO_TYPE_UNMAP] = "unmap",
3781 	[SPDK_BDEV_IO_TYPE_FLUSH] = "flush",
3782 	[SPDK_BDEV_IO_TYPE_RESET] = "reset",
3783 	[SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin",
3784 	[SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io",
3785 	[SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md",
3786 	[SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes",
3787 	[SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy",
3788 	[SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info",
3789 	[SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management",
3790 	[SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append",
3791 	[SPDK_BDEV_IO_TYPE_COMPARE] = "compare",
3792 	[SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write",
3793 	[SPDK_BDEV_IO_TYPE_ABORT] = "abort",
3794 	[SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole",
3795 	[SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data",
3796 	[SPDK_BDEV_IO_TYPE_COPY] = "copy",
3797 	[SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md",
3798 };
3799 
3800 const char *
3801 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type)
3802 {
3803 	if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) {
3804 		return NULL;
3805 	}
3806 
3807 	return g_io_type_strings[io_type];
3808 }
3809 
3810 int
3811 spdk_bdev_get_io_type(const char *io_type_string)
3812 {
3813 	int i;
3814 
3815 	for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
3816 		if (!strcmp(io_type_string, g_io_type_strings[i])) {
3817 			return i;
3818 		}
3819 	}
3820 
3821 	return -1;
3822 }
3823 
3824 uint64_t
3825 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3826 {
3827 	return bdev_io->internal.submit_tsc;
3828 }
3829 
3830 int
3831 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3832 {
3833 	if (bdev->fn_table->dump_info_json) {
3834 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3835 	}
3836 
3837 	return 0;
3838 }
3839 
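/* Recompute the per-timeslice quota for each rate limit from its per-second limit, clamping
 * to the configured minimum per timeslice.  A limit of SPDK_BDEV_QOS_LIMIT_NOT_DEFINED
 * disables that limiter.
 */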
3840 static void
3841 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3842 {
3843 	uint32_t max_per_timeslice = 0;
3844 	int i;
3845 
3846 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3847 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3848 			qos->rate_limits[i].max_per_timeslice = 0;
3849 			continue;
3850 		}
3851 
3852 		max_per_timeslice = qos->rate_limits[i].limit *
3853 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3854 
3855 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3856 							qos->rate_limits[i].min_per_timeslice);
3857 
3858 		__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
3859 				 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
3860 	}
3861 
3862 	bdev_qos_set_ops(qos);
3863 }
3864 
3865 static void
3866 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3867 			   struct spdk_io_channel *io_ch, void *ctx)
3868 {
3869 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3870 	int status;
3871 
3872 	bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3873 
3874 	/* If all IOs were sent then continue the iteration, otherwise stop it. */
3875 	/* TODO: round-robin across channels */
3876 	status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
3877 
3878 	spdk_bdev_for_each_channel_continue(i, status);
3879 }
3880 
3881 
3882 static void
3883 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
3884 {
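	/* Nothing to do here; this callback only marks the end of the per-channel QoS submission pass. */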
3885 
3886 }
3887 
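/* QoS poller, running on the thread that owns the QoS channel.  For every timeslice that has
 * elapsed it replenishes each limiter's quota (carrying over any overrun from the previous
 * timeslice) and then resubmits queued I/O on each channel of the bdev.
 */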
3888 static int
3889 bdev_channel_poll_qos(void *arg)
3890 {
3891 	struct spdk_bdev *bdev = arg;
3892 	struct spdk_bdev_qos *qos = bdev->internal.qos;
3893 	uint64_t now = spdk_get_ticks();
3894 	int i;
3895 	int64_t remaining_last_timeslice;
3896 
3897 	if (spdk_unlikely(qos->thread == NULL)) {
3898 		/* The old QoS was unbound for removal and the new QoS is not enabled yet. */
3899 		return SPDK_POLLER_IDLE;
3900 	}
3901 
3902 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3903 		/* We received our callback earlier than expected - return
3904 		 *  immediately and wait to do accounting until at least one
3905 		 *  timeslice has actually expired.  This should never happen
3906 		 *  with a well-behaved timer implementation.
3907 		 */
3908 		return SPDK_POLLER_IDLE;
3909 	}
3910 
3911 	/* Reset for next round of rate limiting */
3912 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3913 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3914 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3915 		 * here, we'll account for the overrun so that the next timeslice will
3916 		 * be appropriately reduced.
3917 		 */
3918 		remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
3919 					   0, __ATOMIC_RELAXED);
3920 		if (remaining_last_timeslice < 0) {
3921 			/* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
3922 			 * potentially use 2 atomic ops each, so they can intertwine.
3923 			 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
3924 			 */
3925 			__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
3926 					 remaining_last_timeslice, __ATOMIC_RELAXED);
3927 		}
3928 	}
3929 
3930 	while (now >= (qos->last_timeslice + qos->timeslice_size)) {
3931 		qos->last_timeslice += qos->timeslice_size;
3932 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3933 			__atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice,
3934 					   qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED);
3935 		}
3936 	}
3937 
3938 	spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos,
3939 				   bdev_channel_submit_qos_io_done);
3940 
3941 	return SPDK_POLLER_BUSY;
3942 }
3943 
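/* Release the per-channel resources and drop the reference on the shared resource, freeing
 * it when the last channel using the same underlying module channel goes away.
 */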
3944 static void
3945 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
3946 {
3947 	struct spdk_bdev_shared_resource *shared_resource;
3948 	struct lba_range *range;
3949 
3950 	bdev_free_io_stat(ch->stat);
3951 #ifdef SPDK_CONFIG_VTUNE
3952 	bdev_free_io_stat(ch->prev_stat);
3953 #endif
3954 
3955 	while (!TAILQ_EMPTY(&ch->locked_ranges)) {
3956 		range = TAILQ_FIRST(&ch->locked_ranges);
3957 		TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
3958 		free(range);
3959 	}
3960 
3961 	spdk_put_io_channel(ch->channel);
3962 	spdk_put_io_channel(ch->accel_channel);
3963 
3964 	shared_resource = ch->shared_resource;
3965 
3966 	assert(TAILQ_EMPTY(&ch->io_locked));
3967 	assert(TAILQ_EMPTY(&ch->io_submitted));
3968 	assert(TAILQ_EMPTY(&ch->io_accel_exec));
3969 	assert(TAILQ_EMPTY(&ch->io_memory_domain));
3970 	assert(ch->io_outstanding == 0);
3971 	assert(shared_resource->ref > 0);
3972 	shared_resource->ref--;
3973 	if (shared_resource->ref == 0) {
3974 		assert(shared_resource->io_outstanding == 0);
3975 		TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
3976 		spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
3977 		spdk_poller_unregister(&shared_resource->nomem_poller);
3978 		free(shared_resource);
3979 	}
3980 }
3981 
3982 static void
3983 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
3984 {
3985 	struct spdk_bdev_qos	*qos = bdev->internal.qos;
3986 	int			i;
3987 
3988 	assert(spdk_spin_held(&bdev->internal.spinlock));
3989 
3990 	/* Rate limiting is enabled on this bdev */
3991 	if (qos) {
3992 		if (qos->ch == NULL) {
3993 			struct spdk_io_channel *io_ch;
3994 
3995 			SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
3996 				      bdev->name, spdk_get_thread());
3997 
3998 			/* No qos channel has been selected, so set one up */
3999 
4000 			/* Take another reference to ch */
4001 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
4002 			assert(io_ch != NULL);
4003 			qos->ch = ch;
4004 
4005 			qos->thread = spdk_io_channel_get_thread(io_ch);
4006 
4007 			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4008 				if (bdev_qos_is_iops_rate_limit(i) == true) {
4009 					qos->rate_limits[i].min_per_timeslice =
4010 						SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
4011 				} else {
4012 					qos->rate_limits[i].min_per_timeslice =
4013 						SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
4014 				}
4015 
4016 				if (qos->rate_limits[i].limit == 0) {
4017 					qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
4018 				}
4019 			}
4020 			bdev_qos_update_max_quota_per_timeslice(qos);
4021 			qos->timeslice_size =
4022 				SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
4023 			qos->last_timeslice = spdk_get_ticks();
4024 			qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
4025 							   bdev,
4026 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
4027 		}
4028 
4029 		ch->flags |= BDEV_CH_QOS_ENABLED;
4030 	}
4031 }
4032 
4033 struct poll_timeout_ctx {
4034 	struct spdk_bdev_desc	*desc;
4035 	uint64_t		timeout_in_sec;
4036 	spdk_bdev_io_timeout_cb	cb_fn;
4037 	void			*cb_arg;
4038 };
4039 
4040 static void
4041 bdev_desc_free(struct spdk_bdev_desc *desc)
4042 {
4043 	spdk_spin_destroy(&desc->spinlock);
4044 	free(desc->media_events_buffer);
4045 	free(desc);
4046 }
4047 
4048 static void
4049 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
4050 {
4051 	struct poll_timeout_ctx *ctx  = _ctx;
4052 	struct spdk_bdev_desc *desc = ctx->desc;
4053 
4054 	free(ctx);
4055 
4056 	spdk_spin_lock(&desc->spinlock);
4057 	desc->refs--;
4058 	if (desc->closed == true && desc->refs == 0) {
4059 		spdk_spin_unlock(&desc->spinlock);
4060 		bdev_desc_free(desc);
4061 		return;
4062 	}
4063 	spdk_spin_unlock(&desc->spinlock);
4064 }
4065 
4066 static void
4067 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4068 			     struct spdk_io_channel *io_ch, void *_ctx)
4069 {
4070 	struct poll_timeout_ctx *ctx  = _ctx;
4071 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4072 	struct spdk_bdev_desc *desc = ctx->desc;
4073 	struct spdk_bdev_io *bdev_io;
4074 	uint64_t now;
4075 
4076 	spdk_spin_lock(&desc->spinlock);
4077 	if (desc->closed == true) {
4078 		spdk_spin_unlock(&desc->spinlock);
4079 		spdk_bdev_for_each_channel_continue(i, -1);
4080 		return;
4081 	}
4082 	spdk_spin_unlock(&desc->spinlock);
4083 
4084 	now = spdk_get_ticks();
4085 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
4086 		/* Exclude any I/O that are generated via splitting. */
4087 		if (bdev_io->internal.cb == bdev_io_split_done) {
4088 			continue;
4089 		}
4090 
4091 		/* Once we find an I/O that has not timed out, we can immediately
4092 		 * exit the loop.
4093 		 */
4094 		if (now < (bdev_io->internal.submit_tsc +
4095 			   ctx->timeout_in_sec * spdk_get_ticks_hz())) {
4096 			goto end;
4097 		}
4098 
4099 		if (bdev_io->internal.desc == desc) {
4100 			ctx->cb_fn(ctx->cb_arg, bdev_io);
4101 		}
4102 	}
4103 
4104 end:
4105 	spdk_bdev_for_each_channel_continue(i, 0);
4106 }
4107 
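/* Poller registered by spdk_bdev_set_timeout().  It takes a reference on the descriptor and
 * walks every channel of the bdev, invoking the timeout callback for each I/O on this
 * descriptor that has been outstanding longer than the configured timeout.
 */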
4108 static int
4109 bdev_poll_timeout_io(void *arg)
4110 {
4111 	struct spdk_bdev_desc *desc = arg;
4112 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4113 	struct poll_timeout_ctx *ctx;
4114 
4115 	ctx = calloc(1, sizeof(struct poll_timeout_ctx));
4116 	if (!ctx) {
4117 		SPDK_ERRLOG("failed to allocate memory\n");
4118 		return SPDK_POLLER_BUSY;
4119 	}
4120 	ctx->desc = desc;
4121 	ctx->cb_arg = desc->cb_arg;
4122 	ctx->cb_fn = desc->cb_fn;
4123 	ctx->timeout_in_sec = desc->timeout_in_sec;
4124 
4125 	/* Take a ref on the descriptor in case it gets closed while we are checking
4126 	 * all of the channels.
4127 	 */
4128 	spdk_spin_lock(&desc->spinlock);
4129 	desc->refs++;
4130 	spdk_spin_unlock(&desc->spinlock);
4131 
4132 	spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
4133 				   bdev_channel_poll_timeout_io_done);
4134 
4135 	return SPDK_POLLER_BUSY;
4136 }
4137 
4138 int
4139 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
4140 		      spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
4141 {
4142 	assert(desc->thread == spdk_get_thread());
4143 
4144 	spdk_poller_unregister(&desc->io_timeout_poller);
4145 
4146 	if (timeout_in_sec) {
4147 		assert(cb_fn != NULL);
4148 		desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
4149 					  desc,
4150 					  SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
4151 					  1000);
4152 		if (desc->io_timeout_poller == NULL) {
4153 			SPDK_ERRLOG("can not register the desc timeout IO poller\n");
4154 			return -1;
4155 		}
4156 	}
4157 
4158 	desc->cb_fn = cb_fn;
4159 	desc->cb_arg = cb_arg;
4160 	desc->timeout_in_sec = timeout_in_sec;
4161 
4162 	return 0;
4163 }
4164 
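/* I/O channel constructor.  It acquires the module's I/O channel and an accel channel,
 * attaches to (or creates) the shared resource keyed by the underlying module channel,
 * optionally sets up QoS and histogram state, and copies the bdev's currently locked LBA
 * ranges into the new channel.
 */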
4165 static int
4166 bdev_channel_create(void *io_device, void *ctx_buf)
4167 {
4168 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
4169 	struct spdk_bdev_channel	*ch = ctx_buf;
4170 	struct spdk_io_channel		*mgmt_io_ch;
4171 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
4172 	struct spdk_bdev_shared_resource *shared_resource;
4173 	struct lba_range		*range;
4174 
4175 	ch->bdev = bdev;
4176 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
4177 	if (!ch->channel) {
4178 		return -1;
4179 	}
4180 
4181 	ch->accel_channel = spdk_accel_get_io_channel();
4182 	if (!ch->accel_channel) {
4183 		spdk_put_io_channel(ch->channel);
4184 		return -1;
4185 	}
4186 
4187 	spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0,
4188 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4189 
4190 	assert(ch->histogram == NULL);
4191 	if (bdev->internal.histogram_enabled) {
4192 		ch->histogram = spdk_histogram_data_alloc();
4193 		if (ch->histogram == NULL) {
4194 			SPDK_ERRLOG("Could not allocate histogram\n");
4195 		}
4196 	}
4197 
4198 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
4199 	if (!mgmt_io_ch) {
4200 		spdk_put_io_channel(ch->channel);
4201 		spdk_put_io_channel(ch->accel_channel);
4202 		return -1;
4203 	}
4204 
4205 	mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
4206 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
4207 		if (shared_resource->shared_ch == ch->channel) {
4208 			spdk_put_io_channel(mgmt_io_ch);
4209 			shared_resource->ref++;
4210 			break;
4211 		}
4212 	}
4213 
4214 	if (shared_resource == NULL) {
4215 		shared_resource = calloc(1, sizeof(*shared_resource));
4216 		if (shared_resource == NULL) {
4217 			spdk_put_io_channel(ch->channel);
4218 			spdk_put_io_channel(ch->accel_channel);
4219 			spdk_put_io_channel(mgmt_io_ch);
4220 			return -1;
4221 		}
4222 
4223 		shared_resource->mgmt_ch = mgmt_ch;
4224 		shared_resource->io_outstanding = 0;
4225 		TAILQ_INIT(&shared_resource->nomem_io);
4226 		shared_resource->nomem_threshold = 0;
4227 		shared_resource->shared_ch = ch->channel;
4228 		shared_resource->ref = 1;
4229 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
4230 	}
4231 
4232 	ch->io_outstanding = 0;
4233 	TAILQ_INIT(&ch->queued_resets);
4234 	TAILQ_INIT(&ch->locked_ranges);
4235 	TAILQ_INIT(&ch->qos_queued_io);
4236 	ch->flags = 0;
4237 	ch->trace_id = bdev->internal.trace_id;
4238 	ch->shared_resource = shared_resource;
4239 
4240 	TAILQ_INIT(&ch->io_submitted);
4241 	TAILQ_INIT(&ch->io_locked);
4242 	TAILQ_INIT(&ch->io_accel_exec);
4243 	TAILQ_INIT(&ch->io_memory_domain);
4244 
4245 	ch->stat = bdev_alloc_io_stat(false);
4246 	if (ch->stat == NULL) {
4247 		bdev_channel_destroy_resource(ch);
4248 		return -1;
4249 	}
4250 
4251 	ch->stat->ticks_rate = spdk_get_ticks_hz();
4252 
4253 #ifdef SPDK_CONFIG_VTUNE
4254 	{
4255 		char *name;
4256 		__itt_init_ittlib(NULL, 0);
4257 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
4258 		if (!name) {
4259 			bdev_channel_destroy_resource(ch);
4260 			return -1;
4261 		}
4262 		ch->handle = __itt_string_handle_create(name);
4263 		free(name);
4264 		ch->start_tsc = spdk_get_ticks();
4265 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
4266 		ch->prev_stat = bdev_alloc_io_stat(false);
4267 		if (ch->prev_stat == NULL) {
4268 			bdev_channel_destroy_resource(ch);
4269 			return -1;
4270 		}
4271 	}
4272 #endif
4273 
4274 	spdk_spin_lock(&bdev->internal.spinlock);
4275 	bdev_enable_qos(bdev, ch);
4276 
4277 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
4278 		struct lba_range *new_range;
4279 
4280 		new_range = calloc(1, sizeof(*new_range));
4281 		if (new_range == NULL) {
4282 			spdk_spin_unlock(&bdev->internal.spinlock);
4283 			bdev_channel_destroy_resource(ch);
4284 			return -1;
4285 		}
4286 		new_range->length = range->length;
4287 		new_range->offset = range->offset;
4288 		new_range->locked_ctx = range->locked_ctx;
4289 		TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
4290 	}
4291 
4292 	spdk_spin_unlock(&bdev->internal.spinlock);
4293 
4294 	return 0;
4295 }
4296 
4297 static int
4298 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
4299 			 void *cb_ctx)
4300 {
4301 	struct spdk_bdev_channel *bdev_ch = cb_ctx;
4302 	struct spdk_bdev_io *bdev_io;
4303 	uint64_t buf_len;
4304 
4305 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4306 	if (bdev_io->internal.ch == bdev_ch) {
4307 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4308 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4309 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4310 	}
4311 
4312 	return 0;
4313 }
4314 
4315 /*
4316  * Abort I/O that are waiting on a data buffer.
4317  */
4318 static void
4319 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
4320 {
4321 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4322 				  bdev_abort_all_buf_io_cb, ch);
4323 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4324 				  bdev_abort_all_buf_io_cb, ch);
4325 }
4326 
4327 /*
4328  * Abort I/O that are queued waiting for submission.  These types of I/O are
4329  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
4330  */
4331 static void
4332 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
4333 {
4334 	struct spdk_bdev_io *bdev_io, *tmp;
4335 
4336 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
4337 		if (bdev_io->internal.ch == ch) {
4338 			TAILQ_REMOVE(queue, bdev_io, internal.link);
4339 			/*
4340 			 * spdk_bdev_io_complete() assumes that the completed I/O had
4341 			 *  been submitted to the bdev module.  Since in this case it
4342 			 *  hadn't, bump io_outstanding to account for the decrement
4343 			 *  that spdk_bdev_io_complete() will do.
4344 			 */
4345 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
4346 				bdev_io_increment_outstanding(ch, ch->shared_resource);
4347 			}
4348 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4349 		}
4350 	}
4351 }
4352 
4353 static bool
4354 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
4355 {
4356 	struct spdk_bdev_io *bdev_io;
4357 
4358 	TAILQ_FOREACH(bdev_io, queue, internal.link) {
4359 		if (bdev_io == bio_to_abort) {
4360 			TAILQ_REMOVE(queue, bio_to_abort, internal.link);
4361 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4362 			return true;
4363 		}
4364 	}
4365 
4366 	return false;
4367 }
4368 
4369 static int
4370 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
4371 {
4372 	struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
4373 	uint64_t buf_len;
4374 
4375 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4376 	if (bdev_io == bio_to_abort) {
4377 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4378 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4379 		spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4380 		return 1;
4381 	}
4382 
4383 	return 0;
4384 }
4385 
4386 static bool
4387 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
4388 {
4389 	int rc;
4390 
4391 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4392 				       bdev_abort_buf_io_cb, bio_to_abort);
4393 	if (rc == 1) {
4394 		return true;
4395 	}
4396 
4397 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4398 				       bdev_abort_buf_io_cb, bio_to_abort);
4399 	return rc == 1;
4400 }
4401 
4402 static void
4403 bdev_qos_channel_destroy(void *cb_arg)
4404 {
4405 	struct spdk_bdev_qos *qos = cb_arg;
4406 
4407 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
4408 	spdk_poller_unregister(&qos->poller);
4409 
4410 	SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
4411 
4412 	free(qos);
4413 }
4414 
4415 static int
4416 bdev_qos_destroy(struct spdk_bdev *bdev)
4417 {
4418 	int i;
4419 
4420 	/*
4421 	 * Cleanly shutting down the QoS poller is tricky, because
4422 	 * during the asynchronous operation the user could open
4423 	 * a new descriptor and create a new channel, spawning
4424 	 * a new QoS poller.
4425 	 *
4426 	 * The strategy is to create a new QoS structure here and swap it
4427 	 * in. The shutdown path then continues to refer to the old one
4428 	 * until it completes and then releases it.
4429 	 */
4430 	struct spdk_bdev_qos *new_qos, *old_qos;
4431 
4432 	old_qos = bdev->internal.qos;
4433 
4434 	new_qos = calloc(1, sizeof(*new_qos));
4435 	if (!new_qos) {
4436 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
4437 		return -ENOMEM;
4438 	}
4439 
4440 	/* Copy the old QoS data into the newly allocated structure */
4441 	memcpy(new_qos, old_qos, sizeof(*new_qos));
4442 
4443 	/* Zero out the key parts of the QoS structure */
4444 	new_qos->ch = NULL;
4445 	new_qos->thread = NULL;
4446 	new_qos->poller = NULL;
4447 	/*
4448 	 * The limit member of spdk_bdev_qos_limit structure is not zeroed.
4449 	 * It will be used later for the new QoS structure.
4450 	 */
4451 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4452 		new_qos->rate_limits[i].remaining_this_timeslice = 0;
4453 		new_qos->rate_limits[i].min_per_timeslice = 0;
4454 		new_qos->rate_limits[i].max_per_timeslice = 0;
4455 	}
4456 
4457 	bdev->internal.qos = new_qos;
4458 
4459 	if (old_qos->thread == NULL) {
4460 		free(old_qos);
4461 	} else {
4462 		spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
4463 	}
4464 
4465 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
4466 	 * been destroyed yet. The destruction path will end up waiting for the final
4467 	 * channel to be put before it releases resources. */
4468 
4469 	return 0;
4470 }
4471 
4472 void
4473 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
4474 {
4475 	total->bytes_read += add->bytes_read;
4476 	total->num_read_ops += add->num_read_ops;
4477 	total->bytes_written += add->bytes_written;
4478 	total->num_write_ops += add->num_write_ops;
4479 	total->bytes_unmapped += add->bytes_unmapped;
4480 	total->num_unmap_ops += add->num_unmap_ops;
4481 	total->bytes_copied += add->bytes_copied;
4482 	total->num_copy_ops += add->num_copy_ops;
4483 	total->read_latency_ticks += add->read_latency_ticks;
4484 	total->write_latency_ticks += add->write_latency_ticks;
4485 	total->unmap_latency_ticks += add->unmap_latency_ticks;
4486 	total->copy_latency_ticks += add->copy_latency_ticks;
4487 	if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
4488 		total->max_read_latency_ticks = add->max_read_latency_ticks;
4489 	}
4490 	if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
4491 		total->min_read_latency_ticks = add->min_read_latency_ticks;
4492 	}
4493 	if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
4494 		total->max_write_latency_ticks = add->max_write_latency_ticks;
4495 	}
4496 	if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
4497 		total->min_write_latency_ticks = add->min_write_latency_ticks;
4498 	}
4499 	if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
4500 		total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
4501 	}
4502 	if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
4503 		total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
4504 	}
4505 	if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
4506 		total->max_copy_latency_ticks = add->max_copy_latency_ticks;
4507 	}
4508 	if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
4509 		total->min_copy_latency_ticks = add->min_copy_latency_ticks;
4510 	}
4511 }
4512 
4513 static void
4514 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
4515 {
4516 	memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error));
4517 
4518 	if (to_stat->io_error != NULL && from_stat->io_error != NULL) {
4519 		memcpy(to_stat->io_error, from_stat->io_error,
4520 		       sizeof(struct spdk_bdev_io_error_stat));
4521 	}
4522 }
4523 
4524 void
4525 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode)
4526 {
4527 	if (mode == SPDK_BDEV_RESET_STAT_NONE) {
4528 		return;
4529 	}
4530 
4531 	stat->max_read_latency_ticks = 0;
4532 	stat->min_read_latency_ticks = UINT64_MAX;
4533 	stat->max_write_latency_ticks = 0;
4534 	stat->min_write_latency_ticks = UINT64_MAX;
4535 	stat->max_unmap_latency_ticks = 0;
4536 	stat->min_unmap_latency_ticks = UINT64_MAX;
4537 	stat->max_copy_latency_ticks = 0;
4538 	stat->min_copy_latency_ticks = UINT64_MAX;
4539 
4540 	if (mode != SPDK_BDEV_RESET_STAT_ALL) {
4541 		return;
4542 	}
4543 
4544 	stat->bytes_read = 0;
4545 	stat->num_read_ops = 0;
4546 	stat->bytes_written = 0;
4547 	stat->num_write_ops = 0;
4548 	stat->bytes_unmapped = 0;
4549 	stat->num_unmap_ops = 0;
4550 	stat->bytes_copied = 0;
4551 	stat->num_copy_ops = 0;
4552 	stat->read_latency_ticks = 0;
4553 	stat->write_latency_ticks = 0;
4554 	stat->unmap_latency_ticks = 0;
4555 	stat->copy_latency_ticks = 0;
4556 
4557 	if (stat->io_error != NULL) {
4558 		memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat));
4559 	}
4560 }
4561 
4562 struct spdk_bdev_io_stat *
4563 bdev_alloc_io_stat(bool io_error_stat)
4564 {
4565 	struct spdk_bdev_io_stat *stat;
4566 
4567 	stat = malloc(sizeof(struct spdk_bdev_io_stat));
4568 	if (stat == NULL) {
4569 		return NULL;
4570 	}
4571 
4572 	if (io_error_stat) {
4573 		stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat));
4574 		if (stat->io_error == NULL) {
4575 			free(stat);
4576 			return NULL;
4577 		}
4578 	} else {
4579 		stat->io_error = NULL;
4580 	}
4581 
4582 	spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL);
4583 
4584 	return stat;
4585 }
4586 
4587 void
4588 bdev_free_io_stat(struct spdk_bdev_io_stat *stat)
4589 {
4590 	if (stat != NULL) {
4591 		free(stat->io_error);
4592 		free(stat);
4593 	}
4594 }
4595 
4596 void
4597 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
4598 {
4599 	int i;
4600 
4601 	spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
4602 	spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
4603 	spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
4604 	spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
4605 	spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
4606 	spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
4607 	spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
4608 	spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
4609 	spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
4610 	spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
4611 	spdk_json_write_named_uint64(w, "min_read_latency_ticks",
4612 				     stat->min_read_latency_ticks != UINT64_MAX ?
4613 				     stat->min_read_latency_ticks : 0);
4614 	spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
4615 	spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
4616 	spdk_json_write_named_uint64(w, "min_write_latency_ticks",
4617 				     stat->min_write_latency_ticks != UINT64_MAX ?
4618 				     stat->min_write_latency_ticks : 0);
4619 	spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
4620 	spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
4621 	spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
4622 				     stat->min_unmap_latency_ticks != UINT64_MAX ?
4623 				     stat->min_unmap_latency_ticks : 0);
4624 	spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
4625 	spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
4626 	spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
4627 				     stat->min_copy_latency_ticks != UINT64_MAX ?
4628 				     stat->min_copy_latency_ticks : 0);
4629 
4630 	if (stat->io_error != NULL) {
4631 		spdk_json_write_named_object_begin(w, "io_error");
4632 		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
4633 			if (stat->io_error->error_status[i] != 0) {
4634 				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
4635 							     stat->io_error->error_status[i]);
4636 			}
4637 		}
4638 		spdk_json_write_object_end(w);
4639 	}
4640 }
4641 
4642 static void
4643 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
4644 {
4645 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4646 	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
4647 
4648 	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
4649 	bdev_abort_all_buf_io(mgmt_ch, ch);
4650 }
4651 
4652 static void
4653 bdev_channel_destroy(void *io_device, void *ctx_buf)
4654 {
4655 	struct spdk_bdev_channel *ch = ctx_buf;
4656 
4657 	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
4658 		      spdk_get_thread());
4659 
4660 	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0,
4661 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4662 
4663 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
4664 	spdk_spin_lock(&ch->bdev->internal.spinlock);
4665 	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
4666 	spdk_spin_unlock(&ch->bdev->internal.spinlock);
4667 
4668 	bdev_abort_all_queued_io(&ch->queued_resets, ch);
4669 
4670 	bdev_channel_abort_queued_ios(ch);
4671 
4672 	if (ch->histogram) {
4673 		spdk_histogram_data_free(ch->histogram);
4674 	}
4675 
4676 	bdev_channel_destroy_resource(ch);
4677 }
4678 
4679 /*
4680  * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
4681  * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4682  */
4683 static int
4684 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
4685 {
4686 	struct spdk_bdev_name *tmp;
4687 
4688 	bdev_name->name = strdup(name);
4689 	if (bdev_name->name == NULL) {
4690 		SPDK_ERRLOG("Unable to allocate bdev name\n");
4691 		return -ENOMEM;
4692 	}
4693 
4694 	bdev_name->bdev = bdev;
4695 
4696 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4697 	tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4698 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4699 
4700 	if (tmp != NULL) {
4701 		SPDK_ERRLOG("Bdev name %s already exists\n", name);
4702 		free(bdev_name->name);
4703 		return -EEXIST;
4704 	}
4705 
4706 	return 0;
4707 }
4708 
4709 static void
4710 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
4711 {
4712 	RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4713 	free(bdev_name->name);
4714 }
4715 
4716 static void
4717 bdev_name_del(struct spdk_bdev_name *bdev_name)
4718 {
4719 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4720 	bdev_name_del_unsafe(bdev_name);
4721 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4722 }
4723 
4724 int
4725 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
4726 {
4727 	struct spdk_bdev_alias *tmp;
4728 	int ret;
4729 
4730 	if (alias == NULL) {
4731 		SPDK_ERRLOG("Empty alias passed\n");
4732 		return -EINVAL;
4733 	}
4734 
4735 	tmp = calloc(1, sizeof(*tmp));
4736 	if (tmp == NULL) {
4737 		SPDK_ERRLOG("Unable to allocate alias\n");
4738 		return -ENOMEM;
4739 	}
4740 
4741 	ret = bdev_name_add(&tmp->alias, bdev, alias);
4742 	if (ret != 0) {
4743 		free(tmp);
4744 		return ret;
4745 	}
4746 
4747 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
4748 
4749 	return 0;
4750 }
4751 
4752 static int
4753 bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
4754 	       void (*alias_del_fn)(struct spdk_bdev_name *n))
4755 {
4756 	struct spdk_bdev_alias *tmp;
4757 
4758 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
4759 		if (strcmp(alias, tmp->alias.name) == 0) {
4760 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
4761 			alias_del_fn(&tmp->alias);
4762 			free(tmp);
4763 			return 0;
4764 		}
4765 	}
4766 
4767 	return -ENOENT;
4768 }
4769 
4770 int
4771 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
4772 {
4773 	int rc;
4774 
4775 	rc = bdev_alias_del(bdev, alias, bdev_name_del);
4776 	if (rc == -ENOENT) {
4777 		SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
4778 	}
4779 
4780 	return rc;
4781 }
4782 
4783 void
4784 spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
4785 {
4786 	struct spdk_bdev_alias *p, *tmp;
4787 
4788 	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
4789 		TAILQ_REMOVE(&bdev->aliases, p, tailq);
4790 		bdev_name_del(&p->alias);
4791 		free(p);
4792 	}
4793 }
4794 
4795 struct spdk_io_channel *
4796 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
4797 {
4798 	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
4799 }
4800 
4801 void *
4802 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
4803 {
4804 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4805 	void *ctx = NULL;
4806 
4807 	if (bdev->fn_table->get_module_ctx) {
4808 		ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
4809 	}
4810 
4811 	return ctx;
4812 }
4813 
4814 const char *
4815 spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
4816 {
4817 	return bdev->module->name;
4818 }
4819 
4820 const char *
4821 spdk_bdev_get_name(const struct spdk_bdev *bdev)
4822 {
4823 	return bdev->name;
4824 }
4825 
4826 const char *
4827 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
4828 {
4829 	return bdev->product_name;
4830 }
4831 
4832 const struct spdk_bdev_aliases_list *
4833 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
4834 {
4835 	return &bdev->aliases;
4836 }
4837 
4838 uint32_t
4839 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
4840 {
4841 	return bdev->blocklen;
4842 }
4843 
4844 uint32_t
4845 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
4846 {
4847 	return bdev->write_unit_size;
4848 }
4849 
4850 uint64_t
4851 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
4852 {
4853 	return bdev->blockcnt;
4854 }
4855 
4856 const char *
4857 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
4858 {
4859 	return qos_rpc_type[type];
4860 }
4861 
4862 void
4863 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
4864 {
4865 	int i;
4866 
4867 	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
4868 
4869 	spdk_spin_lock(&bdev->internal.spinlock);
4870 	if (bdev->internal.qos) {
4871 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4872 			if (bdev->internal.qos->rate_limits[i].limit !=
4873 			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4874 				limits[i] = bdev->internal.qos->rate_limits[i].limit;
4875 				if (bdev_qos_is_iops_rate_limit(i) == false) {
4876 					/* Change from Byte to Megabyte which is user visible. */
4877 					/* Convert from bytes to megabytes, which is the user-visible unit. */
4878 				}
4879 			}
4880 		}
4881 	}
4882 	spdk_spin_unlock(&bdev->internal.spinlock);
4883 }
4884 
4885 size_t
4886 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
4887 {
4888 	return 1 << bdev->required_alignment;
4889 }
4890 
4891 uint32_t
4892 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
4893 {
4894 	return bdev->optimal_io_boundary;
4895 }
4896 
4897 bool
4898 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
4899 {
4900 	return bdev->write_cache;
4901 }
4902 
4903 const struct spdk_uuid *
4904 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
4905 {
4906 	return &bdev->uuid;
4907 }
4908 
4909 uint16_t
4910 spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
4911 {
4912 	return bdev->acwu;
4913 }
4914 
4915 uint32_t
4916 spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
4917 {
4918 	return bdev->md_len;
4919 }
4920 
4921 bool
4922 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
4923 {
4924 	return (bdev->md_len != 0) && bdev->md_interleave;
4925 }
4926 
4927 bool
4928 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
4929 {
4930 	return (bdev->md_len != 0) && !bdev->md_interleave;
4931 }
4932 
4933 bool
4934 spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
4935 {
4936 	return bdev->zoned;
4937 }
4938 
4939 uint32_t
4940 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
4941 {
4942 	if (spdk_bdev_is_md_interleaved(bdev)) {
4943 		return bdev->blocklen - bdev->md_len;
4944 	} else {
4945 		return bdev->blocklen;
4946 	}
4947 }
4948 
4949 uint32_t
4950 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
4951 {
4952 	return bdev->phys_blocklen;
4953 }
4954 
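/* Total per-block size including metadata: for separate metadata the metadata length is
 * added to blocklen, while for interleaved metadata blocklen already accounts for it.
 */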
4955 static uint32_t
4956 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
4957 {
4958 	if (!spdk_bdev_is_md_interleaved(bdev)) {
4959 		return bdev->blocklen + bdev->md_len;
4960 	} else {
4961 		return bdev->blocklen;
4962 	}
4963 }
4964 
4965 /* We have to use the typedef in the function declaration to appease astyle. */
4966 typedef enum spdk_dif_type spdk_dif_type_t;
4967 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t;
4968 
4969 spdk_dif_type_t
4970 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
4971 {
4972 	if (bdev->md_len != 0) {
4973 		return bdev->dif_type;
4974 	} else {
4975 		return SPDK_DIF_DISABLE;
4976 	}
4977 }
4978 
4979 spdk_dif_pi_format_t
4980 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev)
4981 {
4982 	return bdev->dif_pi_format;
4983 }
4984 
4985 bool
4986 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
4987 {
4988 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
4989 		return bdev->dif_is_head_of_md;
4990 	} else {
4991 		return false;
4992 	}
4993 }
4994 
4995 bool
4996 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
4997 			       enum spdk_dif_check_type check_type)
4998 {
4999 	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
5000 		return false;
5001 	}
5002 
5003 	switch (check_type) {
5004 	case SPDK_DIF_CHECK_TYPE_REFTAG:
5005 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
5006 	case SPDK_DIF_CHECK_TYPE_APPTAG:
5007 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
5008 	case SPDK_DIF_CHECK_TYPE_GUARD:
5009 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
5010 	default:
5011 		return false;
5012 	}
5013 }
5014 
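/* Given a buffer of num_bytes, compute how many blocks (including metadata) can be written
 * after reserving space for buffer alignment, rounded down to a multiple of the bdev's
 * write unit size.
 */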
5015 static uint32_t
5016 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes)
5017 {
5018 	uint64_t aligned_length, max_write_blocks;
5019 
5020 	aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1);
5021 	max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev);
5022 	max_write_blocks -= max_write_blocks % bdev->write_unit_size;
5023 
5024 	return max_write_blocks;
5025 }
5026 
5027 uint32_t
5028 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
5029 {
5030 	return bdev->max_copy;
5031 }
5032 
5033 uint64_t
5034 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
5035 {
5036 	return bdev->internal.measured_queue_depth;
5037 }
5038 
5039 uint64_t
5040 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
5041 {
5042 	return bdev->internal.period;
5043 }
5044 
5045 uint64_t
5046 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
5047 {
5048 	return bdev->internal.weighted_io_time;
5049 }
5050 
5051 uint64_t
5052 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
5053 {
5054 	return bdev->internal.io_time;
5055 }
5056 
5057 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev)
5058 {
5059 	return bdev->ctratt;
5060 }
5061 
5062 static void bdev_update_qd_sampling_period(void *ctx);
5063 
5064 static void
5065 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
5066 {
5067 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
5068 
5069 	if (bdev->internal.measured_queue_depth) {
5070 		bdev->internal.io_time += bdev->internal.period;
5071 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
5072 	}
5073 
5074 	bdev->internal.qd_poll_in_progress = false;
5075 
5076 	bdev_update_qd_sampling_period(bdev);
5077 }
5078 
5079 static void
5080 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5081 		       struct spdk_io_channel *io_ch, void *_ctx)
5082 {
5083 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
5084 
5085 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
5086 	spdk_bdev_for_each_channel_continue(i, 0);
5087 }
5088 
5089 static int
5090 bdev_calculate_measured_queue_depth(void *ctx)
5091 {
5092 	struct spdk_bdev *bdev = ctx;
5093 
5094 	bdev->internal.qd_poll_in_progress = true;
5095 	bdev->internal.temporary_queue_depth = 0;
5096 	spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
5097 	return SPDK_POLLER_BUSY;
5098 }
5099 
5100 static void
5101 bdev_update_qd_sampling_period(void *ctx)
5102 {
5103 	struct spdk_bdev *bdev = ctx;
5104 
5105 	if (bdev->internal.period == bdev->internal.new_period) {
5106 		return;
5107 	}
5108 
5109 	if (bdev->internal.qd_poll_in_progress) {
5110 		return;
5111 	}
5112 
5113 	bdev->internal.period = bdev->internal.new_period;
5114 
5115 	spdk_poller_unregister(&bdev->internal.qd_poller);
5116 	if (bdev->internal.period != 0) {
5117 		bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5118 					   bdev, bdev->internal.period);
5119 	} else {
5120 		spdk_bdev_close(bdev->internal.qd_desc);
5121 		bdev->internal.qd_desc = NULL;
5122 	}
5123 }
5124 
5125 static void
5126 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
5127 {
5128 	SPDK_NOTICELOG("Unexpected event type: %d\n", type);
5129 }
5130 
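/* Enable, change, or disable (period == 0) queue depth sampling.  The first
 * call opens an internal descriptor and registers the sampling poller; later
 * calls forward the new period to the thread that owns that descriptor, which
 * re-registers or tears down the poller in bdev_update_qd_sampling_period().
 */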
5131 void
5132 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
5133 {
5134 	int rc;
5135 
5136 	if (bdev->internal.new_period == period) {
5137 		return;
5138 	}
5139 
5140 	bdev->internal.new_period = period;
5141 
5142 	if (bdev->internal.qd_desc != NULL) {
5143 		assert(bdev->internal.period != 0);
5144 
5145 		spdk_thread_send_msg(bdev->internal.qd_desc->thread,
5146 				     bdev_update_qd_sampling_period, bdev);
5147 		return;
5148 	}
5149 
5150 	assert(bdev->internal.period == 0);
5151 
5152 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
5153 				NULL, &bdev->internal.qd_desc);
5154 	if (rc != 0) {
5155 		return;
5156 	}
5157 
5158 	bdev->internal.period = period;
5159 	bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5160 				   bdev, period);
5161 }
5162 
5163 struct bdev_get_current_qd_ctx {
5164 	uint64_t current_qd;
5165 	spdk_bdev_get_current_qd_cb cb_fn;
5166 	void *cb_arg;
5167 };
5168 
5169 static void
5170 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
5171 {
5172 	struct bdev_get_current_qd_ctx *ctx = _ctx;
5173 
5174 	ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
5175 
5176 	free(ctx);
5177 }
5178 
5179 static void
5180 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5181 		    struct spdk_io_channel *io_ch, void *_ctx)
5182 {
5183 	struct bdev_get_current_qd_ctx *ctx = _ctx;
5184 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
5185 
5186 	ctx->current_qd += bdev_ch->io_outstanding;
5187 
5188 	spdk_bdev_for_each_channel_continue(i, 0);
5189 }
5190 
5191 void
5192 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
5193 			 void *cb_arg)
5194 {
5195 	struct bdev_get_current_qd_ctx *ctx;
5196 
5197 	assert(cb_fn != NULL);
5198 
5199 	ctx = calloc(1, sizeof(*ctx));
5200 	if (ctx == NULL) {
5201 		cb_fn(bdev, 0, cb_arg, -ENOMEM);
5202 		return;
5203 	}
5204 
5205 	ctx->cb_fn = cb_fn;
5206 	ctx->cb_arg = cb_arg;
5207 
5208 	spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
5209 }
5210 
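/* Descriptor event delivery: event_notify() takes a reference on the
 * descriptor and sends a message to the descriptor's thread; _event_notify()
 * drops that reference and either invokes the registered event callback or,
 * if the descriptor was closed while the message was in flight and this was
 * the last reference, frees the descriptor.
 */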
5211 static void
5212 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type)
5213 {
5214 	assert(desc->thread == spdk_get_thread());
5215 
5216 	spdk_spin_lock(&desc->spinlock);
5217 	desc->refs--;
5218 	if (!desc->closed) {
5219 		spdk_spin_unlock(&desc->spinlock);
5220 		desc->callback.event_fn(type,
5221 					desc->bdev,
5222 					desc->callback.ctx);
5223 		return;
5224 	} else if (desc->refs == 0) {
5225 		/* This descriptor was closed after this event_notify message was sent.
5226 		 * spdk_bdev_close() could not free the descriptor since this message was
5227 		 * in flight, so we free it now using bdev_desc_free().
5228 		 */
5229 		spdk_spin_unlock(&desc->spinlock);
5230 		bdev_desc_free(desc);
5231 		return;
5232 	}
5233 	spdk_spin_unlock(&desc->spinlock);
5234 }
5235 
5236 static void
5237 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn)
5238 {
5239 	spdk_spin_lock(&desc->spinlock);
5240 	desc->refs++;
5241 	spdk_thread_send_msg(desc->thread, event_notify_fn, desc);
5242 	spdk_spin_unlock(&desc->spinlock);
5243 }
5244 
5245 static void
5246 _resize_notify(void *ctx)
5247 {
5248 	struct spdk_bdev_desc *desc = ctx;
5249 
5250 	_event_notify(desc, SPDK_BDEV_EVENT_RESIZE);
5251 }
5252 
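/* Called by a bdev module when the backing device changes size.  Growing is
 * always allowed; shrinking is rejected with -EBUSY while descriptors are
 * open.  On success, every open descriptor is notified with
 * SPDK_BDEV_EVENT_RESIZE on its own thread.
 */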
5253 int
5254 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
5255 {
5256 	struct spdk_bdev_desc *desc;
5257 	int ret;
5258 
5259 	if (size == bdev->blockcnt) {
5260 		return 0;
5261 	}
5262 
5263 	spdk_spin_lock(&bdev->internal.spinlock);
5264 
5265 	/* A bdev with open descriptors cannot be shrunk */
5266 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
5267 	    bdev->blockcnt > size) {
5268 		ret = -EBUSY;
5269 	} else {
5270 		bdev->blockcnt = size;
5271 		TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
5272 			event_notify(desc, _resize_notify);
5273 		}
5274 		ret = 0;
5275 	}
5276 
5277 	spdk_spin_unlock(&bdev->internal.spinlock);
5278 
5279 	return ret;
5280 }
5281 
5282 /*
5283  * Convert I/O offset and length from bytes to blocks.
5284  *
5285  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
5286  */
5287 static uint64_t
5288 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
5289 		     uint64_t num_bytes, uint64_t *num_blocks)
5290 {
5291 	uint32_t block_size = bdev->blocklen;
5292 	uint8_t shift_cnt;
5293 
5294 	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
5295 	if (spdk_likely(spdk_u32_is_pow2(block_size))) {
5296 		shift_cnt = spdk_u32log2(block_size);
5297 		*offset_blocks = offset_bytes >> shift_cnt;
5298 		*num_blocks = num_bytes >> shift_cnt;
5299 		return (offset_bytes - (*offset_blocks << shift_cnt)) |
5300 		       (num_bytes - (*num_blocks << shift_cnt));
5301 	} else {
5302 		*offset_blocks = offset_bytes / block_size;
5303 		*num_blocks = num_bytes / block_size;
5304 		return (offset_bytes % block_size) | (num_bytes % block_size);
5305 	}
5306 }
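/* Example for the conversion above: with a 4096-byte block size (a power of
 * two), a 1 MiB request at byte offset 8192 yields offset_blocks = 2 and
 * num_blocks = 256 with a return value of 0; any offset or length that is not
 * a multiple of 4096 makes the return value non-zero.
 */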
5307 
5308 static bool
5309 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
5310 {
5311 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; this indicates
5312 	 * an overflow, i.e. the offset has wrapped around */
5313 	if (offset_blocks + num_blocks < offset_blocks) {
5314 		return false;
5315 	}
5316 
5317 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
5318 	if (offset_blocks + num_blocks > bdev->blockcnt) {
5319 		return false;
5320 	}
5321 
5322 	return true;
5323 }
5324 
5325 static void
5326 bdev_seek_complete_cb(void *ctx)
5327 {
5328 	struct spdk_bdev_io *bdev_io = ctx;
5329 
5330 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5331 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
5332 }
5333 
5334 static int
5335 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5336 	  uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
5337 	  spdk_bdev_io_completion_cb cb, void *cb_arg)
5338 {
5339 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5340 	struct spdk_bdev_io *bdev_io;
5341 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5342 
5343 	assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
5344 
5345 	/* Check that offset_blocks is valid by validating a single block at that offset */
5346 	if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
5347 		return -EINVAL;
5348 	}
5349 
5350 	bdev_io = bdev_channel_get_io(channel);
5351 	if (!bdev_io) {
5352 		return -ENOMEM;
5353 	}
5354 
5355 	bdev_io->internal.ch = channel;
5356 	bdev_io->internal.desc = desc;
5357 	bdev_io->type = io_type;
5358 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5359 	bdev_io->u.bdev.memory_domain = NULL;
5360 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5361 	bdev_io->u.bdev.accel_sequence = NULL;
5362 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5363 
5364 	if (!spdk_bdev_io_type_supported(bdev, io_type)) {
5365 		/* If the bdev doesn't support seeking to the next data/hole offset,
5366 		 * assume that only data and no holes are present */
5367 		if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
5368 			bdev_io->u.bdev.seek.offset = offset_blocks;
5369 		} else {
5370 			bdev_io->u.bdev.seek.offset = UINT64_MAX;
5371 		}
5372 
5373 		spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
5374 		return 0;
5375 	}
5376 
5377 	bdev_io_submit(bdev_io);
5378 	return 0;
5379 }
5380 
5381 int
5382 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5383 		    uint64_t offset_blocks,
5384 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5385 {
5386 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
5387 }
5388 
5389 int
5390 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5391 		    uint64_t offset_blocks,
5392 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5393 {
5394 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
5395 }
5396 
5397 uint64_t
5398 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
5399 {
5400 	return bdev_io->u.bdev.seek.offset;
5401 }
5402 
5403 static int
5404 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
5405 			 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5406 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5407 {
5408 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5409 	struct spdk_bdev_io *bdev_io;
5410 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5411 
5412 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5413 		return -EINVAL;
5414 	}
5415 
5416 	bdev_io = bdev_channel_get_io(channel);
5417 	if (!bdev_io) {
5418 		return -ENOMEM;
5419 	}
5420 
5421 	bdev_io->internal.ch = channel;
5422 	bdev_io->internal.desc = desc;
5423 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5424 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5425 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5426 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5427 	bdev_io->u.bdev.iovcnt = 1;
5428 	bdev_io->u.bdev.md_buf = md_buf;
5429 	bdev_io->u.bdev.num_blocks = num_blocks;
5430 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5431 	bdev_io->u.bdev.memory_domain = NULL;
5432 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5433 	bdev_io->u.bdev.accel_sequence = NULL;
5434 	bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
5435 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5436 
5437 	bdev_io_submit(bdev_io);
5438 	return 0;
5439 }
5440 
5441 int
5442 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5443 	       void *buf, uint64_t offset, uint64_t nbytes,
5444 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
5445 {
5446 	uint64_t offset_blocks, num_blocks;
5447 
5448 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5449 				 nbytes, &num_blocks) != 0) {
5450 		return -EINVAL;
5451 	}
5452 
5453 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5454 }
5455 
5456 int
5457 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5458 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5459 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5460 {
5461 	return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
5462 }
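/* Usage sketch for the read path above (illustrative only; read_done, my_ctx
 * and the surrounding setup are hypothetical and not part of this file).  The
 * caller owns a descriptor and an I/O channel, supplies a DMA-able buffer and
 * releases the spdk_bdev_io from its completion callback:
 *
 *   static void
 *   read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           struct my_ctx *ctx = cb_arg;
 *
 *           spdk_bdev_free_io(bdev_io);
 *           ctx->done = true;
 *           ctx->success = success;
 *   }
 *
 *   buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev),
 *                          spdk_bdev_get_buf_align(bdev), NULL);
 *   rc = spdk_bdev_read_blocks(desc, ch, buf, 0, 1, read_done, &ctx);
 *   if (rc == -ENOMEM) {
 *           ... queue the request with spdk_bdev_queue_io_wait() and retry ...
 *   }
 */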
5463 
5464 int
5465 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5466 			      void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5467 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5468 {
5469 	struct iovec iov = {
5470 		.iov_base = buf,
5471 	};
5472 
5473 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5474 		return -EINVAL;
5475 	}
5476 
5477 	if (md_buf && !_is_buf_allocated(&iov)) {
5478 		return -EINVAL;
5479 	}
5480 
5481 	return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5482 					cb, cb_arg);
5483 }
5484 
5485 int
5486 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5487 		struct iovec *iov, int iovcnt,
5488 		uint64_t offset, uint64_t nbytes,
5489 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5490 {
5491 	uint64_t offset_blocks, num_blocks;
5492 
5493 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5494 				 nbytes, &num_blocks) != 0) {
5495 		return -EINVAL;
5496 	}
5497 
5498 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5499 }
5500 
5501 static int
5502 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5503 			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
5504 			  uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5505 			  struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
5506 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5507 {
5508 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5509 	struct spdk_bdev_io *bdev_io;
5510 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5511 
5512 	if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
5513 		return -EINVAL;
5514 	}
5515 
5516 	bdev_io = bdev_channel_get_io(channel);
5517 	if (spdk_unlikely(!bdev_io)) {
5518 		return -ENOMEM;
5519 	}
5520 
5521 	bdev_io->internal.ch = channel;
5522 	bdev_io->internal.desc = desc;
5523 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5524 	bdev_io->u.bdev.iovs = iov;
5525 	bdev_io->u.bdev.iovcnt = iovcnt;
5526 	bdev_io->u.bdev.md_buf = md_buf;
5527 	bdev_io->u.bdev.num_blocks = num_blocks;
5528 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5529 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5530 
5531 	if (seq != NULL) {
5532 		bdev_io->internal.f.has_accel_sequence = true;
5533 		bdev_io->internal.accel_sequence = seq;
5534 	}
5535 
5536 	if (domain != NULL) {
5537 		bdev_io->internal.f.has_memory_domain = true;
5538 		bdev_io->internal.memory_domain = domain;
5539 		bdev_io->internal.memory_domain_ctx = domain_ctx;
5540 	}
5541 
5542 	bdev_io->u.bdev.memory_domain = domain;
5543 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5544 	bdev_io->u.bdev.accel_sequence = seq;
5545 	bdev_io->u.bdev.dif_check_flags = dif_check_flags;
5546 
5547 	_bdev_io_submit_ext(desc, bdev_io);
5548 
5549 	return 0;
5550 }
5551 
5552 int
5553 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5554 		       struct iovec *iov, int iovcnt,
5555 		       uint64_t offset_blocks, uint64_t num_blocks,
5556 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5557 {
5558 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5559 
5560 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5561 					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5562 }
5563 
5564 int
5565 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5566 			       struct iovec *iov, int iovcnt, void *md_buf,
5567 			       uint64_t offset_blocks, uint64_t num_blocks,
5568 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5569 {
5570 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5571 
5572 	if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
5573 		return -EINVAL;
5574 	}
5575 
5576 	if (md_buf && !_is_buf_allocated(iov)) {
5577 		return -EINVAL;
5578 	}
5579 
5580 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5581 					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5582 }
5583 
5584 static inline bool
5585 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5586 {
5587 	/*
5588 	 * We check that opts->size is at least as large as spdk_bdev_ext_io_opts
5589 	 * was when it was first introduced (ac6f2bdd8d), since access to those
5590 	 * original members is not otherwise validated internally.
5591 	 */
5592 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5593 	       sizeof(opts->metadata) &&
5594 	       opts->size <= sizeof(*opts) &&
5595 	       /* When memory domain is used, the user must provide data buffers */
5596 	       (!opts->memory_domain || (iov && iov[0].iov_base));
5597 }
5598 
5599 int
5600 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5601 			   struct iovec *iov, int iovcnt,
5602 			   uint64_t offset_blocks, uint64_t num_blocks,
5603 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
5604 			   struct spdk_bdev_ext_io_opts *opts)
5605 {
5606 	struct spdk_memory_domain *domain = NULL;
5607 	struct spdk_accel_sequence *seq = NULL;
5608 	void *domain_ctx = NULL, *md = NULL;
5609 	uint32_t dif_check_flags = 0;
5610 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5611 
5612 	if (opts) {
5613 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5614 			return -EINVAL;
5615 		}
5616 
5617 		md = opts->metadata;
5618 		domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
5619 		domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
5620 		seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
5621 		if (md) {
5622 			if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
5623 				return -EINVAL;
5624 			}
5625 
5626 			if (spdk_unlikely(!_is_buf_allocated(iov))) {
5627 				return -EINVAL;
5628 			}
5629 
5630 			if (spdk_unlikely(seq != NULL)) {
5631 				return -EINVAL;
5632 			}
5633 		}
5634 	}
5635 
5636 	dif_check_flags = bdev->dif_check_flags &
5637 			  ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
5638 
5639 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5640 					 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg);
5641 }
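/* Sketch of calling the _ext variant above (caller code is hypothetical).
 * opts.size must be set (and unused fields are best left zeroed, as the
 * designated initializer below does); a bad size makes _bdev_io_check_opts()
 * reject the request with -EINVAL:
 *
 *   struct spdk_bdev_ext_io_opts opts = { .size = sizeof(opts) };
 *
 *   opts.memory_domain = domain;
 *   opts.memory_domain_ctx = domain_ctx;
 *   rc = spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
 *                                   num_blocks, read_done, &ctx, &opts);
 */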
5642 
5643 static int
5644 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5645 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5646 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5647 {
5648 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5649 	struct spdk_bdev_io *bdev_io;
5650 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5651 
5652 	if (!desc->write) {
5653 		return -EBADF;
5654 	}
5655 
5656 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5657 		return -EINVAL;
5658 	}
5659 
5660 	bdev_io = bdev_channel_get_io(channel);
5661 	if (!bdev_io) {
5662 		return -ENOMEM;
5663 	}
5664 
5665 	bdev_io->internal.ch = channel;
5666 	bdev_io->internal.desc = desc;
5667 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5668 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5669 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5670 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5671 	bdev_io->u.bdev.iovcnt = 1;
5672 	bdev_io->u.bdev.md_buf = md_buf;
5673 	bdev_io->u.bdev.num_blocks = num_blocks;
5674 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5675 	bdev_io->u.bdev.memory_domain = NULL;
5676 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5677 	bdev_io->u.bdev.accel_sequence = NULL;
5678 	bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
5679 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5680 
5681 	bdev_io_submit(bdev_io);
5682 	return 0;
5683 }
5684 
5685 int
5686 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5687 		void *buf, uint64_t offset, uint64_t nbytes,
5688 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5689 {
5690 	uint64_t offset_blocks, num_blocks;
5691 
5692 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5693 				 nbytes, &num_blocks) != 0) {
5694 		return -EINVAL;
5695 	}
5696 
5697 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5698 }
5699 
5700 int
5701 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5702 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5703 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5704 {
5705 	return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5706 					 cb, cb_arg);
5707 }
5708 
5709 int
5710 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5711 			       void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5712 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5713 {
5714 	struct iovec iov = {
5715 		.iov_base = buf,
5716 	};
5717 
5718 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5719 		return -EINVAL;
5720 	}
5721 
5722 	if (md_buf && !_is_buf_allocated(&iov)) {
5723 		return -EINVAL;
5724 	}
5725 
5726 	return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5727 					 cb, cb_arg);
5728 }
5729 
5730 static int
5731 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5732 			   struct iovec *iov, int iovcnt, void *md_buf,
5733 			   uint64_t offset_blocks, uint64_t num_blocks,
5734 			   struct spdk_memory_domain *domain, void *domain_ctx,
5735 			   struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
5736 			   uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
5737 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
5738 {
5739 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5740 	struct spdk_bdev_io *bdev_io;
5741 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5742 
5743 	if (spdk_unlikely(!desc->write)) {
5744 		return -EBADF;
5745 	}
5746 
5747 	if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
5748 		return -EINVAL;
5749 	}
5750 
5751 	bdev_io = bdev_channel_get_io(channel);
5752 	if (spdk_unlikely(!bdev_io)) {
5753 		return -ENOMEM;
5754 	}
5755 
5756 	bdev_io->internal.ch = channel;
5757 	bdev_io->internal.desc = desc;
5758 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5759 	bdev_io->u.bdev.iovs = iov;
5760 	bdev_io->u.bdev.iovcnt = iovcnt;
5761 	bdev_io->u.bdev.md_buf = md_buf;
5762 	bdev_io->u.bdev.num_blocks = num_blocks;
5763 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5764 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5765 	if (seq != NULL) {
5766 		bdev_io->internal.f.has_accel_sequence = true;
5767 		bdev_io->internal.accel_sequence = seq;
5768 	}
5769 
5770 	if (domain != NULL) {
5771 		bdev_io->internal.f.has_memory_domain = true;
5772 		bdev_io->internal.memory_domain = domain;
5773 		bdev_io->internal.memory_domain_ctx = domain_ctx;
5774 	}
5775 
5776 	bdev_io->u.bdev.memory_domain = domain;
5777 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5778 	bdev_io->u.bdev.accel_sequence = seq;
5779 	bdev_io->u.bdev.dif_check_flags = dif_check_flags;
5780 	bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw;
5781 	bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw;
5782 
5783 	_bdev_io_submit_ext(desc, bdev_io);
5784 
5785 	return 0;
5786 }
5787 
5788 int
5789 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5790 		 struct iovec *iov, int iovcnt,
5791 		 uint64_t offset, uint64_t len,
5792 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
5793 {
5794 	uint64_t offset_blocks, num_blocks;
5795 
5796 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5797 				 len, &num_blocks) != 0) {
5798 		return -EINVAL;
5799 	}
5800 
5801 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5802 }
5803 
5804 int
5805 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5806 			struct iovec *iov, int iovcnt,
5807 			uint64_t offset_blocks, uint64_t num_blocks,
5808 			spdk_bdev_io_completion_cb cb, void *cb_arg)
5809 {
5810 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5811 
5812 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5813 					  num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
5814 					  cb, cb_arg);
5815 }
5816 
5817 int
5818 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5819 				struct iovec *iov, int iovcnt, void *md_buf,
5820 				uint64_t offset_blocks, uint64_t num_blocks,
5821 				spdk_bdev_io_completion_cb cb, void *cb_arg)
5822 {
5823 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5824 
5825 	if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
5826 		return -EINVAL;
5827 	}
5828 
5829 	if (md_buf && !_is_buf_allocated(iov)) {
5830 		return -EINVAL;
5831 	}
5832 
5833 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5834 					  num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
5835 					  cb, cb_arg);
5836 }
5837 
5838 int
5839 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5840 			    struct iovec *iov, int iovcnt,
5841 			    uint64_t offset_blocks, uint64_t num_blocks,
5842 			    spdk_bdev_io_completion_cb cb, void *cb_arg,
5843 			    struct spdk_bdev_ext_io_opts *opts)
5844 {
5845 	struct spdk_memory_domain *domain = NULL;
5846 	struct spdk_accel_sequence *seq = NULL;
5847 	void *domain_ctx = NULL, *md = NULL;
5848 	uint32_t dif_check_flags = 0;
5849 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5850 	uint32_t nvme_cdw12_raw = 0;
5851 	uint32_t nvme_cdw13_raw = 0;
5852 
5853 	if (opts) {
5854 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5855 			return -EINVAL;
5856 		}
5857 		md = opts->metadata;
5858 		domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
5859 		domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
5860 		seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
5861 		nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0);
5862 		nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0);
5863 		if (md) {
5864 			if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
5865 				return -EINVAL;
5866 			}
5867 
5868 			if (spdk_unlikely(!_is_buf_allocated(iov))) {
5869 				return -EINVAL;
5870 			}
5871 
5872 			if (spdk_unlikely(seq != NULL)) {
5873 				return -EINVAL;
5874 			}
5875 		}
5876 	}
5877 
5878 	dif_check_flags = bdev->dif_check_flags &
5879 			  ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
5880 
5881 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks,
5882 					  domain, domain_ctx, seq, dif_check_flags,
5883 					  nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg);
5884 }
5885 
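/* COMPARE emulation: when the bdev module does not support
 * SPDK_BDEV_IO_TYPE_COMPARE natively, the compare is emulated by reading the
 * target blocks into an internally allocated buffer and memcmp()ing them (and
 * the separate metadata buffer, if any) against the caller's buffers.  A
 * mismatch completes the parent I/O with SPDK_BDEV_IO_STATUS_MISCOMPARE.
 */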
5886 static void
5887 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5888 {
5889 	struct spdk_bdev_io *parent_io = cb_arg;
5890 	struct spdk_bdev *bdev = parent_io->bdev;
5891 	uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
5892 	int i, rc = 0;
5893 
5894 	if (!success) {
5895 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5896 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5897 		spdk_bdev_free_io(bdev_io);
5898 		return;
5899 	}
5900 
5901 	for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
5902 		rc = memcmp(read_buf,
5903 			    parent_io->u.bdev.iovs[i].iov_base,
5904 			    parent_io->u.bdev.iovs[i].iov_len);
5905 		if (rc) {
5906 			break;
5907 		}
5908 		read_buf += parent_io->u.bdev.iovs[i].iov_len;
5909 	}
5910 
5911 	if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
5912 		rc = memcmp(bdev_io->u.bdev.md_buf,
5913 			    parent_io->u.bdev.md_buf,
5914 			    spdk_bdev_get_md_size(bdev));
5915 	}
5916 
5917 	spdk_bdev_free_io(bdev_io);
5918 
5919 	if (rc == 0) {
5920 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5921 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
5922 	} else {
5923 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
5924 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5925 	}
5926 }
5927 
5928 static void
5929 bdev_compare_do_read(void *_bdev_io)
5930 {
5931 	struct spdk_bdev_io *bdev_io = _bdev_io;
5932 	int rc;
5933 
5934 	rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
5935 				   spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
5936 				   bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5937 				   bdev_compare_do_read_done, bdev_io);
5938 
5939 	if (rc == -ENOMEM) {
5940 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
5941 	} else if (rc != 0) {
5942 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5943 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5944 	}
5945 }
5946 
5947 static int
5948 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5949 			     struct iovec *iov, int iovcnt, void *md_buf,
5950 			     uint64_t offset_blocks, uint64_t num_blocks,
5951 			     spdk_bdev_io_completion_cb cb, void *cb_arg)
5952 {
5953 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5954 	struct spdk_bdev_io *bdev_io;
5955 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5956 
5957 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5958 		return -EINVAL;
5959 	}
5960 
5961 	bdev_io = bdev_channel_get_io(channel);
5962 	if (!bdev_io) {
5963 		return -ENOMEM;
5964 	}
5965 
5966 	bdev_io->internal.ch = channel;
5967 	bdev_io->internal.desc = desc;
5968 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5969 	bdev_io->u.bdev.iovs = iov;
5970 	bdev_io->u.bdev.iovcnt = iovcnt;
5971 	bdev_io->u.bdev.md_buf = md_buf;
5972 	bdev_io->u.bdev.num_blocks = num_blocks;
5973 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5974 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5975 	bdev_io->u.bdev.memory_domain = NULL;
5976 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5977 	bdev_io->u.bdev.accel_sequence = NULL;
5978 
5979 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5980 		bdev_io_submit(bdev_io);
5981 		return 0;
5982 	}
5983 
5984 	bdev_compare_do_read(bdev_io);
5985 
5986 	return 0;
5987 }
5988 
5989 int
5990 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5991 			  struct iovec *iov, int iovcnt,
5992 			  uint64_t offset_blocks, uint64_t num_blocks,
5993 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5994 {
5995 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5996 					    num_blocks, cb, cb_arg);
5997 }
5998 
5999 int
6000 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6001 				  struct iovec *iov, int iovcnt, void *md_buf,
6002 				  uint64_t offset_blocks, uint64_t num_blocks,
6003 				  spdk_bdev_io_completion_cb cb, void *cb_arg)
6004 {
6005 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6006 		return -EINVAL;
6007 	}
6008 
6009 	if (md_buf && !_is_buf_allocated(iov)) {
6010 		return -EINVAL;
6011 	}
6012 
6013 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
6014 					    num_blocks, cb, cb_arg);
6015 }
6016 
6017 static int
6018 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6019 			    void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6020 			    spdk_bdev_io_completion_cb cb, void *cb_arg)
6021 {
6022 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6023 	struct spdk_bdev_io *bdev_io;
6024 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6025 
6026 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6027 		return -EINVAL;
6028 	}
6029 
6030 	bdev_io = bdev_channel_get_io(channel);
6031 	if (!bdev_io) {
6032 		return -ENOMEM;
6033 	}
6034 
6035 	bdev_io->internal.ch = channel;
6036 	bdev_io->internal.desc = desc;
6037 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
6038 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6039 	bdev_io->u.bdev.iovs[0].iov_base = buf;
6040 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
6041 	bdev_io->u.bdev.iovcnt = 1;
6042 	bdev_io->u.bdev.md_buf = md_buf;
6043 	bdev_io->u.bdev.num_blocks = num_blocks;
6044 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6045 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6046 	bdev_io->u.bdev.memory_domain = NULL;
6047 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6048 	bdev_io->u.bdev.accel_sequence = NULL;
6049 
6050 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
6051 		bdev_io_submit(bdev_io);
6052 		return 0;
6053 	}
6054 
6055 	bdev_compare_do_read(bdev_io);
6056 
6057 	return 0;
6058 }
6059 
6060 int
6061 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6062 			 void *buf, uint64_t offset_blocks, uint64_t num_blocks,
6063 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
6064 {
6065 	return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
6066 					   cb, cb_arg);
6067 }
6068 
6069 int
6070 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6071 				 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6072 				 spdk_bdev_io_completion_cb cb, void *cb_arg)
6073 {
6074 	struct iovec iov = {
6075 		.iov_base = buf,
6076 	};
6077 
6078 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6079 		return -EINVAL;
6080 	}
6081 
6082 	if (md_buf && !_is_buf_allocated(&iov)) {
6083 		return -EINVAL;
6084 	}
6085 
6086 	return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
6087 					   cb, cb_arg);
6088 }
6089 
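/* COMPARE_AND_WRITE emulation: when the bdev module does not support the fused
 * command natively, the LBA range is locked first, a COMPARE of the
 * compare_iov buffers is issued, and only on a match is the WRITE of the
 * write_iov buffers submitted.  The range is unlocked and the parent I/O is
 * completed once the write (or whichever step failed) finishes.
 */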
6090 static void
6091 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status)
6092 {
6093 	struct spdk_bdev_io *bdev_io = ctx;
6094 
6095 	if (unlock_status) {
6096 		SPDK_ERRLOG("LBA range unlock failed\n");
6097 	}
6098 
6099 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
6100 			     false, bdev_io->internal.caller_ctx);
6101 }
6102 
6103 static void
6104 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
6105 {
6106 	bdev_io->internal.status = status;
6107 
6108 	bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
6109 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6110 			      bdev_comparev_and_writev_blocks_unlocked, bdev_io);
6111 }
6112 
6113 static void
6114 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6115 {
6116 	struct spdk_bdev_io *parent_io = cb_arg;
6117 
6118 	if (!success) {
6119 		SPDK_ERRLOG("Compare and write operation failed\n");
6120 	}
6121 
6122 	spdk_bdev_free_io(bdev_io);
6123 
6124 	bdev_comparev_and_writev_blocks_unlock(parent_io,
6125 					       success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
6126 }
6127 
6128 static void
6129 bdev_compare_and_write_do_write(void *_bdev_io)
6130 {
6131 	struct spdk_bdev_io *bdev_io = _bdev_io;
6132 	int rc;
6133 
6134 	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
6135 				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
6136 				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
6137 				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6138 				     bdev_compare_and_write_do_write_done, bdev_io);
6139 
6140 
6141 	if (rc == -ENOMEM) {
6142 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
6143 	} else if (rc != 0) {
6144 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6145 	}
6146 }
6147 
6148 static void
6149 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6150 {
6151 	struct spdk_bdev_io *parent_io = cb_arg;
6152 
6153 	spdk_bdev_free_io(bdev_io);
6154 
6155 	if (!success) {
6156 		bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
6157 		return;
6158 	}
6159 
6160 	bdev_compare_and_write_do_write(parent_io);
6161 }
6162 
6163 static void
6164 bdev_compare_and_write_do_compare(void *_bdev_io)
6165 {
6166 	struct spdk_bdev_io *bdev_io = _bdev_io;
6167 	int rc;
6168 
6169 	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
6170 				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
6171 				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6172 				       bdev_compare_and_write_do_compare_done, bdev_io);
6173 
6174 	if (rc == -ENOMEM) {
6175 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
6176 	} else if (rc != 0) {
6177 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
6178 	}
6179 }
6180 
6181 static void
6182 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status)
6183 {
6184 	struct spdk_bdev_io *bdev_io = ctx;
6185 
6186 	if (status) {
6187 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
6188 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
6189 		return;
6190 	}
6191 
6192 	bdev_compare_and_write_do_compare(bdev_io);
6193 }
6194 
6195 int
6196 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6197 				     struct iovec *compare_iov, int compare_iovcnt,
6198 				     struct iovec *write_iov, int write_iovcnt,
6199 				     uint64_t offset_blocks, uint64_t num_blocks,
6200 				     spdk_bdev_io_completion_cb cb, void *cb_arg)
6201 {
6202 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6203 	struct spdk_bdev_io *bdev_io;
6204 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6205 
6206 	if (!desc->write) {
6207 		return -EBADF;
6208 	}
6209 
6210 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6211 		return -EINVAL;
6212 	}
6213 
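	/* The fused compare-and-write may not span more blocks than the bdev's
	 * atomic compare & write unit (acwu). */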
6214 	if (num_blocks > bdev->acwu) {
6215 		return -EINVAL;
6216 	}
6217 
6218 	bdev_io = bdev_channel_get_io(channel);
6219 	if (!bdev_io) {
6220 		return -ENOMEM;
6221 	}
6222 
6223 	bdev_io->internal.ch = channel;
6224 	bdev_io->internal.desc = desc;
6225 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
6226 	bdev_io->u.bdev.iovs = compare_iov;
6227 	bdev_io->u.bdev.iovcnt = compare_iovcnt;
6228 	bdev_io->u.bdev.fused_iovs = write_iov;
6229 	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
6230 	bdev_io->u.bdev.md_buf = NULL;
6231 	bdev_io->u.bdev.num_blocks = num_blocks;
6232 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6233 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6234 	bdev_io->u.bdev.memory_domain = NULL;
6235 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6236 	bdev_io->u.bdev.accel_sequence = NULL;
6237 
6238 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
6239 		bdev_io_submit(bdev_io);
6240 		return 0;
6241 	}
6242 
6243 	return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
6244 				   bdev_comparev_and_writev_blocks_locked, bdev_io);
6245 }
6246 
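/* Zero-copy flow: spdk_bdev_zcopy_start() asks the bdev module to provide
 * buffers covering the given block range, optionally populated with the
 * current on-media data (populate == true); spdk_bdev_zcopy_end() later either
 * commits the buffer contents back to the bdev (commit == true) or simply
 * releases the buffers.  Only bdevs that report support for
 * SPDK_BDEV_IO_TYPE_ZCOPY accept these requests (-ENOTSUP otherwise).
 */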
6247 int
6248 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6249 		      struct iovec *iov, int iovcnt,
6250 		      uint64_t offset_blocks, uint64_t num_blocks,
6251 		      bool populate,
6252 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
6253 {
6254 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6255 	struct spdk_bdev_io *bdev_io;
6256 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6257 
6258 	if (!desc->write) {
6259 		return -EBADF;
6260 	}
6261 
6262 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6263 		return -EINVAL;
6264 	}
6265 
6266 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
6267 		return -ENOTSUP;
6268 	}
6269 
6270 	bdev_io = bdev_channel_get_io(channel);
6271 	if (!bdev_io) {
6272 		return -ENOMEM;
6273 	}
6274 
6275 	bdev_io->internal.ch = channel;
6276 	bdev_io->internal.desc = desc;
6277 	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
6278 	bdev_io->u.bdev.num_blocks = num_blocks;
6279 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6280 	bdev_io->u.bdev.iovs = iov;
6281 	bdev_io->u.bdev.iovcnt = iovcnt;
6282 	bdev_io->u.bdev.md_buf = NULL;
6283 	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
6284 	bdev_io->u.bdev.zcopy.commit = 0;
6285 	bdev_io->u.bdev.zcopy.start = 1;
6286 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6287 	bdev_io->u.bdev.memory_domain = NULL;
6288 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6289 	bdev_io->u.bdev.accel_sequence = NULL;
6290 
6291 	bdev_io_submit(bdev_io);
6292 
6293 	return 0;
6294 }
6295 
6296 int
6297 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
6298 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
6299 {
6300 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
6301 		return -EINVAL;
6302 	}
6303 
6304 	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
6305 	bdev_io->u.bdev.zcopy.start = 0;
6306 	bdev_io->internal.caller_ctx = cb_arg;
6307 	bdev_io->internal.cb = cb;
6308 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6309 
6310 	bdev_io_submit(bdev_io);
6311 
6312 	return 0;
6313 }
6314 
6315 int
6316 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6317 		       uint64_t offset, uint64_t len,
6318 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6319 {
6320 	uint64_t offset_blocks, num_blocks;
6321 
6322 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6323 				 len, &num_blocks) != 0) {
6324 		return -EINVAL;
6325 	}
6326 
6327 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6328 }
6329 
6330 int
6331 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6332 			      uint64_t offset_blocks, uint64_t num_blocks,
6333 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6334 {
6335 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6336 	struct spdk_bdev_io *bdev_io;
6337 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6338 
6339 	if (!desc->write) {
6340 		return -EBADF;
6341 	}
6342 
6343 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6344 		return -EINVAL;
6345 	}
6346 
6347 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6348 	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6349 		return -ENOTSUP;
6350 	}
6351 
6352 	bdev_io = bdev_channel_get_io(channel);
6353 
6354 	if (!bdev_io) {
6355 		return -ENOMEM;
6356 	}
6357 
6358 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6359 	bdev_io->internal.ch = channel;
6360 	bdev_io->internal.desc = desc;
6361 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6362 	bdev_io->u.bdev.num_blocks = num_blocks;
6363 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6364 	bdev_io->u.bdev.memory_domain = NULL;
6365 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6366 	bdev_io->u.bdev.accel_sequence = NULL;
6367 
6368 	/* If the write_zeroes size is large and should be split, use the generic split
6369 	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6370 	 *
6371 	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6372 	 * or emulate it using regular write requests otherwise.
6373 	 */
6374 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) ||
6375 	    bdev_io->internal.f.split) {
6376 		bdev_io_submit(bdev_io);
6377 		return 0;
6378 	}
6379 
6380 	assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
6381 
6382 	return bdev_write_zero_buffer(bdev_io);
6383 }
6384 
6385 int
6386 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6387 		uint64_t offset, uint64_t nbytes,
6388 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6389 {
6390 	uint64_t offset_blocks, num_blocks;
6391 
6392 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6393 				 nbytes, &num_blocks) != 0) {
6394 		return -EINVAL;
6395 	}
6396 
6397 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6398 }
6399 
6400 static void
6401 bdev_io_complete_cb(void *ctx)
6402 {
6403 	struct spdk_bdev_io *bdev_io = ctx;
6404 
6405 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6406 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
6407 }
6408 
6409 int
6410 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6411 		       uint64_t offset_blocks, uint64_t num_blocks,
6412 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6413 {
6414 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6415 	struct spdk_bdev_io *bdev_io;
6416 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6417 
6418 	if (!desc->write) {
6419 		return -EBADF;
6420 	}
6421 
6422 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6423 		return -EINVAL;
6424 	}
6425 
6426 	bdev_io = bdev_channel_get_io(channel);
6427 	if (!bdev_io) {
6428 		return -ENOMEM;
6429 	}
6430 
6431 	bdev_io->internal.ch = channel;
6432 	bdev_io->internal.desc = desc;
6433 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
6434 
6435 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6436 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
6437 	bdev_io->u.bdev.iovs[0].iov_len = 0;
6438 	bdev_io->u.bdev.iovcnt = 1;
6439 
6440 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6441 	bdev_io->u.bdev.num_blocks = num_blocks;
6442 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6443 	bdev_io->u.bdev.memory_domain = NULL;
6444 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6445 	bdev_io->u.bdev.accel_sequence = NULL;
6446 
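	/* An unmap of zero blocks has nothing to do; complete it successfully
	 * right away instead of sending it to the module. */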
6447 	if (num_blocks == 0) {
6448 		spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
6449 		return 0;
6450 	}
6451 
6452 	bdev_io_submit(bdev_io);
6453 	return 0;
6454 }
6455 
6456 int
6457 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6458 		uint64_t offset, uint64_t length,
6459 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6460 {
6461 	uint64_t offset_blocks, num_blocks;
6462 
6463 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6464 				 length, &num_blocks) != 0) {
6465 		return -EINVAL;
6466 	}
6467 
6468 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6469 }
6470 
6471 int
6472 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6473 		       uint64_t offset_blocks, uint64_t num_blocks,
6474 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6475 {
6476 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6477 	struct spdk_bdev_io *bdev_io;
6478 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6479 
6480 	if (!desc->write) {
6481 		return -EBADF;
6482 	}
6483 
6484 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6485 		return -EINVAL;
6486 	}
6487 
6488 	bdev_io = bdev_channel_get_io(channel);
6489 	if (!bdev_io) {
6490 		return -ENOMEM;
6491 	}
6492 
6493 	bdev_io->internal.ch = channel;
6494 	bdev_io->internal.desc = desc;
6495 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6496 	bdev_io->u.bdev.iovs = NULL;
6497 	bdev_io->u.bdev.iovcnt = 0;
6498 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6499 	bdev_io->u.bdev.num_blocks = num_blocks;
6500 	bdev_io->u.bdev.memory_domain = NULL;
6501 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6502 	bdev_io->u.bdev.accel_sequence = NULL;
6503 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6504 
6505 	bdev_io_submit(bdev_io);
6506 	return 0;
6507 }
6508 
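/* Reset handling: each channel queues incoming resets and only one reset per
 * bdev is in progress at a time.  The active reset first freezes every channel
 * (aborting nomem, buffer-wait and QoS-queued I/O).  If reset_io_drain_timeout
 * is zero the reset is then submitted to the module immediately; otherwise
 * outstanding I/O is polled for up to reset_io_drain_timeout seconds, and the
 * reset is submitted only if I/O is still outstanding when that time expires.
 * If the channels drain first, the reset completes successfully without ever
 * reaching the module.
 */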
6509 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6510 
6511 static void
6512 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6513 {
6514 	struct spdk_bdev_channel *ch = _ctx;
6515 	struct spdk_bdev_io *bdev_io;
6516 
6517 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6518 
6519 	if (status == -EBUSY) {
6520 		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6521 			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6522 							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6523 		} else {
6524 			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6525 
6526 			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6527 				/* If outstanding IOs are still present and reset_io_drain_timeout
6528 				 * seconds passed, start the reset. */
6529 				bdev_io_submit_reset(bdev_io);
6530 			} else {
6531 				/* We still have an in-progress memory domain pull/push or we're
6532 				 * executing an accel sequence.  Since we cannot abort either of those
6533 				 * operations, fail the reset request. */
6534 				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6535 			}
6536 		}
6537 	} else {
6538 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6539 		SPDK_DEBUGLOG(bdev,
6540 			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6541 			      ch->bdev->name);
6542 		/* Mark the completion status as a SUCCESS and complete the reset. */
6543 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6544 	}
6545 }
6546 
6547 static void
6548 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6549 				struct spdk_io_channel *io_ch, void *_ctx)
6550 {
6551 	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6552 	int status = 0;
6553 
6554 	if (cur_ch->io_outstanding > 0 ||
6555 	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6556 	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6557 		/* If a channel has outstanding IO, set the status to -EBUSY. This will stop
6558 		 * further iteration over the rest of the channels and pass non-zero status
6559 		 * to the callback function. */
6560 		status = -EBUSY;
6561 	}
6562 	spdk_bdev_for_each_channel_continue(i, status);
6563 }
6564 
6565 static int
6566 bdev_reset_poll_for_outstanding_io(void *ctx)
6567 {
6568 	struct spdk_bdev_channel *ch = ctx;
6569 	struct spdk_bdev_io *bdev_io;
6570 
6571 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6572 
6573 	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
6574 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6575 				   bdev_reset_check_outstanding_io_done);
6576 
6577 	return SPDK_POLLER_BUSY;
6578 }
6579 
6580 static void
6581 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
6582 {
6583 	struct spdk_bdev_channel *ch = _ctx;
6584 	struct spdk_bdev_io *bdev_io;
6585 
6586 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6587 
6588 	if (bdev->reset_io_drain_timeout == 0) {
6589 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6590 
6591 		bdev_io_submit_reset(bdev_io);
6592 		return;
6593 	}
6594 
6595 	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
6596 			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
6597 
6598 	/* When bdev->reset_io_drain_timeout is non-zero, submit the reset to the
6599 	 * underlying module only if outstanding I/O still remains after
6600 	 * reset_io_drain_timeout seconds have passed. */
6601 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6602 				   bdev_reset_check_outstanding_io_done);
6603 }
6604 
6605 static void
6606 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6607 			  struct spdk_io_channel *ch, void *_ctx)
6608 {
6609 	struct spdk_bdev_channel	*channel;
6610 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
6611 	struct spdk_bdev_shared_resource *shared_resource;
6612 	bdev_io_tailq_t			tmp_queued;
6613 
6614 	TAILQ_INIT(&tmp_queued);
6615 
6616 	channel = __io_ch_to_bdev_ch(ch);
6617 	shared_resource = channel->shared_resource;
6618 	mgmt_channel = shared_resource->mgmt_ch;
6619 
6620 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
6621 
6622 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
6623 		TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link);
6624 	}
6625 
6626 	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
6627 	bdev_abort_all_buf_io(mgmt_channel, channel);
6628 	bdev_abort_all_queued_io(&tmp_queued, channel);
6629 
6630 	spdk_bdev_for_each_channel_continue(i, 0);
6631 }
6632 
6633 static void
6634 bdev_start_reset(void *ctx)
6635 {
6636 	struct spdk_bdev_channel *ch = ctx;
6637 
6638 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
6639 				   bdev_reset_freeze_channel_done);
6640 }
6641 
6642 static void
6643 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
6644 {
6645 	struct spdk_bdev *bdev = ch->bdev;
6646 
6647 	assert(!TAILQ_EMPTY(&ch->queued_resets));
6648 
6649 	spdk_spin_lock(&bdev->internal.spinlock);
6650 	if (bdev->internal.reset_in_progress == NULL) {
6651 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
6652 		/*
6653 		 * Take a channel reference for the target bdev for the life of this
6654 		 *  reset.  This guards against the channel getting destroyed while
6655 		 *  spdk_bdev_for_each_channel() calls related to this reset IO are in
6656 		 *  progress.  We will release the reference when this reset is
6657 		 *  completed.
6658 		 */
6659 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
6660 		bdev_start_reset(ch);
6661 	}
6662 	spdk_spin_unlock(&bdev->internal.spinlock);
6663 }
6664 
6665 int
6666 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6667 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6668 {
6669 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6670 	struct spdk_bdev_io *bdev_io;
6671 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6672 
6673 	bdev_io = bdev_channel_get_io(channel);
6674 	if (!bdev_io) {
6675 		return -ENOMEM;
6676 	}
6677 
6678 	bdev_io->internal.ch = channel;
6679 	bdev_io->internal.desc = desc;
6680 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6681 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
6682 	bdev_io->u.reset.ch_ref = NULL;
6683 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6684 
6685 	spdk_spin_lock(&bdev->internal.spinlock);
6686 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
6687 	spdk_spin_unlock(&bdev->internal.spinlock);
6688 
6689 	bdev_ch_add_to_io_submitted(bdev_io);
6690 
6691 	bdev_channel_start_reset(channel);
6692 
6693 	return 0;
6694 }
6695 
6696 void
6697 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6698 		      struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode)
6699 {
6700 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6701 
6702 	bdev_get_io_stat(stat, channel->stat);
6703 	spdk_bdev_reset_io_stat(stat, reset_mode);
6704 }
6705 
6706 static void
6707 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6708 {
6709 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6710 
6711 	bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
6712 			    bdev_iostat_ctx->cb_arg, 0);
6713 	free(bdev_iostat_ctx);
6714 }
6715 
6716 static void
6717 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6718 			   struct spdk_io_channel *ch, void *_ctx)
6719 {
6720 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6721 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6722 
6723 	spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat);
6724 	spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode);
6725 	spdk_bdev_for_each_channel_continue(i, 0);
6726 }
6727 
6728 void
6729 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
6730 			  enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg)
6731 {
6732 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
6733 
6734 	assert(bdev != NULL);
6735 	assert(stat != NULL);
6736 	assert(cb != NULL);
6737 
6738 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
6739 	if (bdev_iostat_ctx == NULL) {
6740 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
6741 		cb(bdev, stat, cb_arg, -ENOMEM);
6742 		return;
6743 	}
6744 
6745 	bdev_iostat_ctx->stat = stat;
6746 	bdev_iostat_ctx->cb = cb;
6747 	bdev_iostat_ctx->cb_arg = cb_arg;
6748 	bdev_iostat_ctx->reset_mode = reset_mode;
6749 
6750 	/* Start with the statistics from previously deleted channels. */
6751 	spdk_spin_lock(&bdev->internal.spinlock);
6752 	bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat);
6753 	spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode);
6754 	spdk_spin_unlock(&bdev->internal.spinlock);
6755 
6756 	/* Then iterate and add the statistics from each existing channel. */
6757 	spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
6758 				   bdev_get_device_stat_done);
6759 }
6760 
6761 struct bdev_iostat_reset_ctx {
6762 	enum spdk_bdev_reset_stat_mode mode;
6763 	bdev_reset_device_stat_cb cb;
6764 	void *cb_arg;
6765 };
6766 
6767 static void
6768 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6769 {
6770 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6771 
6772 	ctx->cb(bdev, ctx->cb_arg, 0);
6773 
6774 	free(ctx);
6775 }
6776 
6777 static void
6778 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6779 			     struct spdk_io_channel *ch, void *_ctx)
6780 {
6781 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6782 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6783 
6784 	spdk_bdev_reset_io_stat(channel->stat, ctx->mode);
6785 
6786 	spdk_bdev_for_each_channel_continue(i, 0);
6787 }
6788 
6789 void
6790 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode,
6791 		       bdev_reset_device_stat_cb cb, void *cb_arg)
6792 {
6793 	struct bdev_iostat_reset_ctx *ctx;
6794 
6795 	assert(bdev != NULL);
6796 	assert(cb != NULL);
6797 
6798 	ctx = calloc(1, sizeof(*ctx));
6799 	if (ctx == NULL) {
6800 		SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
6801 		cb(bdev, cb_arg, -ENOMEM);
6802 		return;
6803 	}
6804 
6805 	ctx->mode = mode;
6806 	ctx->cb = cb;
6807 	ctx->cb_arg = cb_arg;
6808 
6809 	spdk_spin_lock(&bdev->internal.spinlock);
6810 	spdk_bdev_reset_io_stat(bdev->internal.stat, mode);
6811 	spdk_spin_unlock(&bdev->internal.spinlock);
6812 
6813 	spdk_bdev_for_each_channel(bdev,
6814 				   bdev_reset_each_channel_stat,
6815 				   ctx,
6816 				   bdev_reset_device_stat_done);
6817 }
6818 
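/*
 * Submit an NVMe admin command as a passthrough I/O.  The descriptor must have been
 * opened for writing and the bdev must support SPDK_BDEV_IO_TYPE_NVME_ADMIN;
 * otherwise -EBADF or -ENOTSUP is returned.  -ENOMEM is returned when no spdk_bdev_io
 * can be allocated from the channel, in which case the caller may retry via
 * spdk_bdev_queue_io_wait().
 */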
6819 int
6820 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6821 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6822 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6823 {
6824 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6825 	struct spdk_bdev_io *bdev_io;
6826 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6827 
6828 	if (!desc->write) {
6829 		return -EBADF;
6830 	}
6831 
6832 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
6833 		return -ENOTSUP;
6834 	}
6835 
6836 	bdev_io = bdev_channel_get_io(channel);
6837 	if (!bdev_io) {
6838 		return -ENOMEM;
6839 	}
6840 
6841 	bdev_io->internal.ch = channel;
6842 	bdev_io->internal.desc = desc;
6843 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
6844 	bdev_io->u.nvme_passthru.cmd = *cmd;
6845 	bdev_io->u.nvme_passthru.buf = buf;
6846 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6847 	bdev_io->u.nvme_passthru.md_buf = NULL;
6848 	bdev_io->u.nvme_passthru.md_len = 0;
6849 
6850 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6851 
6852 	bdev_io_submit(bdev_io);
6853 	return 0;
6854 }
6855 
6856 int
6857 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6858 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6859 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
6860 {
6861 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6862 	struct spdk_bdev_io *bdev_io;
6863 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6864 
6865 	if (!desc->write) {
6866 		/*
6867 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6868 		 *  to easily determine if the command is a read or write, but for now just
6869 		 *  do not allow io_passthru with a read-only descriptor.
6870 		 */
6871 		return -EBADF;
6872 	}
6873 
6874 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
6875 		return -ENOTSUP;
6876 	}
6877 
6878 	bdev_io = bdev_channel_get_io(channel);
6879 	if (!bdev_io) {
6880 		return -ENOMEM;
6881 	}
6882 
6883 	bdev_io->internal.ch = channel;
6884 	bdev_io->internal.desc = desc;
6885 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
6886 	bdev_io->u.nvme_passthru.cmd = *cmd;
6887 	bdev_io->u.nvme_passthru.buf = buf;
6888 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6889 	bdev_io->u.nvme_passthru.md_buf = NULL;
6890 	bdev_io->u.nvme_passthru.md_len = 0;
6891 
6892 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6893 
6894 	bdev_io_submit(bdev_io);
6895 	return 0;
6896 }
6897 
6898 int
6899 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6900 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
6901 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6902 {
6903 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6904 	struct spdk_bdev_io *bdev_io;
6905 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6906 
6907 	if (!desc->write) {
6908 		/*
6909 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6910 		 *  to easily determine if the command is a read or write, but for now just
6911 		 *  do not allow io_passthru with a read-only descriptor.
6912 		 */
6913 		return -EBADF;
6914 	}
6915 
6916 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
6917 		return -ENOTSUP;
6918 	}
6919 
6920 	bdev_io = bdev_channel_get_io(channel);
6921 	if (!bdev_io) {
6922 		return -ENOMEM;
6923 	}
6924 
6925 	bdev_io->internal.ch = channel;
6926 	bdev_io->internal.desc = desc;
6927 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
6928 	bdev_io->u.nvme_passthru.cmd = *cmd;
6929 	bdev_io->u.nvme_passthru.buf = buf;
6930 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6931 	bdev_io->u.nvme_passthru.md_buf = md_buf;
6932 	bdev_io->u.nvme_passthru.md_len = md_len;
6933 
6934 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6935 
6936 	bdev_io_submit(bdev_io);
6937 	return 0;
6938 }
6939 
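/*
 * Scatter-gather variant of the NVMe I/O passthrough.  When 'md_buf' is provided the
 * bdev must support SPDK_BDEV_IO_TYPE_NVME_IO_MD; otherwise plain
 * SPDK_BDEV_IO_TYPE_NVME_IO support is sufficient.  As with the other passthrough
 * helpers, a read-only descriptor is rejected with -EBADF.
 */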
6940 int
6941 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc,
6942 			       struct spdk_io_channel *ch,
6943 			       const struct spdk_nvme_cmd *cmd,
6944 			       struct iovec *iov, int iovcnt, size_t nbytes,
6945 			       void *md_buf, size_t md_len,
6946 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
6947 {
6948 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6949 	struct spdk_bdev_io *bdev_io;
6950 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6951 
6952 	if (!desc->write) {
6953 		/*
6954 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6955 		 * to easily determine if the command is a read or write, but for now just
6956 		 * do not allow io_passthru with a read-only descriptor.
6957 		 */
6958 		return -EBADF;
6959 	}
6960 
6961 	if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
6962 		return -ENOTSUP;
6963 	} else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
6964 		return -ENOTSUP;
6965 	}
6966 
6967 	bdev_io = bdev_channel_get_io(channel);
6968 	if (!bdev_io) {
6969 		return -ENOMEM;
6970 	}
6971 
6972 	bdev_io->internal.ch = channel;
6973 	bdev_io->internal.desc = desc;
6974 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD;
6975 	bdev_io->u.nvme_passthru.cmd = *cmd;
6976 	bdev_io->u.nvme_passthru.iovs = iov;
6977 	bdev_io->u.nvme_passthru.iovcnt = iovcnt;
6978 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6979 	bdev_io->u.nvme_passthru.md_buf = md_buf;
6980 	bdev_io->u.nvme_passthru.md_len = md_len;
6981 
6982 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6983 
6984 	bdev_io_submit(bdev_io);
6985 	return 0;
6986 }
6987 
6988 static void bdev_abort_retry(void *ctx);
6989 static void bdev_abort(struct spdk_bdev_io *parent_io);
6990 
6991 static void
6992 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6993 {
6994 	struct spdk_bdev_channel *channel = bdev_io->internal.ch;
6995 	struct spdk_bdev_io *parent_io = cb_arg;
6996 	struct spdk_bdev_io *bio_to_abort, *tmp_io;
6997 
6998 	bio_to_abort = bdev_io->u.abort.bio_to_abort;
6999 
7000 	spdk_bdev_free_io(bdev_io);
7001 
7002 	if (!success) {
7003 		/* Check if the target I/O completed in the meantime. */
7004 		TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
7005 			if (tmp_io == bio_to_abort) {
7006 				break;
7007 			}
7008 		}
7009 
7010 		/* If the target I/O still exists, set the parent to failed. */
7011 		if (tmp_io != NULL) {
7012 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7013 		}
7014 	}
7015 
7016 	assert(parent_io->internal.f.split);
7017 
7018 	parent_io->internal.split.outstanding--;
7019 	if (parent_io->internal.split.outstanding == 0) {
7020 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7021 			bdev_abort_retry(parent_io);
7022 		} else {
7023 			bdev_io_complete(parent_io);
7024 		}
7025 	}
7026 }
7027 
7028 static int
7029 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
7030 	      struct spdk_bdev_io *bio_to_abort,
7031 	      spdk_bdev_io_completion_cb cb, void *cb_arg)
7032 {
7033 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7034 	struct spdk_bdev_io *bdev_io;
7035 
7036 	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
7037 	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
7038 		/* TODO: Abort reset or abort request. */
7039 		return -ENOTSUP;
7040 	}
7041 
7042 	bdev_io = bdev_channel_get_io(channel);
7043 	if (bdev_io == NULL) {
7044 		return -ENOMEM;
7045 	}
7046 
7047 	bdev_io->internal.ch = channel;
7048 	bdev_io->internal.desc = desc;
7049 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7050 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7051 
7052 	if (bio_to_abort->internal.f.split) {
7053 		assert(bdev_io_should_split(bio_to_abort));
7054 		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
7055 
7056 		/* The parent abort request is not submitted directly, but to manage its
7057 		 * execution, add it to the submitted list here.
7058 		 */
7059 		bdev_io->internal.submit_tsc = spdk_get_ticks();
7060 		bdev_ch_add_to_io_submitted(bdev_io);
7061 
7062 		bdev_abort(bdev_io);
7063 
7064 		return 0;
7065 	}
7066 
7067 	bdev_io->u.abort.bio_to_abort = bio_to_abort;
7068 
7069 	/* Submit the abort request to the underlying bdev module. */
7070 	bdev_io_submit(bdev_io);
7071 
7072 	return 0;
7073 }
7074 
7075 static bool
7076 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
7077 {
7078 	struct spdk_bdev_io *iter;
7079 
7080 	TAILQ_FOREACH(iter, tailq, internal.link) {
7081 		if (iter == bdev_io) {
7082 			return true;
7083 		}
7084 	}
7085 
7086 	return false;
7087 }
7088 
7089 static uint32_t
7090 _bdev_abort(struct spdk_bdev_io *parent_io)
7091 {
7092 	struct spdk_bdev_desc *desc = parent_io->internal.desc;
7093 	struct spdk_bdev_channel *channel = parent_io->internal.ch;
7094 	void *bio_cb_arg;
7095 	struct spdk_bdev_io *bio_to_abort;
7096 	uint32_t matched_ios;
7097 	int rc;
7098 
7099 	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
7100 
7101 	/* matched_ios is returned and kept by the caller.
7102 	 *
7103 	 * This function is used in two cases: 1) the same cb_arg is used for
7104 	 * multiple I/Os, and 2) a single large I/O was split into smaller ones.
7105 	 * Incrementing split_outstanding directly here could confuse readers,
7106 	 * especially in the 1st case.
7107 	 *
7108 	 * Completion of an I/O abort is processed only after the stack unwinds, so
7109 	 * returning the count and letting the caller set split_outstanding works as expected.
7110 	 */
7111 	matched_ios = 0;
7112 	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7113 
7114 	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
7115 		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
7116 			continue;
7117 		}
7118 
7119 		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
7120 			/* Any I/O which was submitted after this abort command should be excluded. */
7121 			continue;
7122 		}
7123 
7124 		/* We can't abort a request that's being pushed/pulled or executed by accel */
7125 		if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
7126 		    bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
7127 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7128 			break;
7129 		}
7130 
7131 		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
7132 		if (rc != 0) {
7133 			if (rc == -ENOMEM) {
7134 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
7135 			} else {
7136 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7137 			}
7138 			break;
7139 		}
7140 		matched_ios++;
7141 	}
7142 
7143 	return matched_ios;
7144 }
7145 
7146 static void
7147 bdev_abort_retry(void *ctx)
7148 {
7149 	struct spdk_bdev_io *parent_io = ctx;
7150 	uint32_t matched_ios;
7151 
7152 	matched_ios = _bdev_abort(parent_io);
7153 
7154 	if (matched_ios == 0) {
7155 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7156 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7157 		} else {
7158 			/* On retry, finding no target I/O is a success because it means
7159 			 * the target I/Os completed in the meantime.
7160 			 */
7161 			bdev_io_complete(parent_io);
7162 		}
7163 		return;
7164 	}
7165 
7166 	/* Use split_outstanding to manage the progress of aborting I/Os. */
7167 	parent_io->internal.f.split = true;
7168 	parent_io->internal.split.outstanding = matched_ios;
7169 }
7170 
7171 static void
7172 bdev_abort(struct spdk_bdev_io *parent_io)
7173 {
7174 	uint32_t matched_ios;
7175 
7176 	matched_ios = _bdev_abort(parent_io);
7177 
7178 	if (matched_ios == 0) {
7179 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7180 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7181 		} else {
7182 			/* Finding no target I/O on the initial attempt is a failure. */
7183 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7184 			bdev_io_complete(parent_io);
7185 		}
7186 		return;
7187 	}
7188 
7189 	/* Use split_outstanding to manage the progress of aborting I/Os. */
7190 	parent_io->internal.f.split = true;
7191 	parent_io->internal.split.outstanding = matched_ios;
7192 }
7193 
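/*
 * Abort all I/Os on this channel that were submitted with 'bio_cb_arg' as their
 * completion callback argument.  The abort completes successfully only if every
 * matched I/O could be aborted (or had already completed); see _bdev_abort() for the
 * matching rules.  Illustrative usage, assuming a hypothetical module that tags its
 * I/Os with a per-request context 'req':
 *
 *     rc = spdk_bdev_abort(desc, ch, req, abort_done_cb, req);
 *     if (rc == -ENOMEM) {
 *             ... queue a retry with spdk_bdev_queue_io_wait() ...
 *     }
 *
 * 'abort_done_cb' and 'req' above are placeholders, not names from this file.
 */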
7194 int
7195 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7196 		void *bio_cb_arg,
7197 		spdk_bdev_io_completion_cb cb, void *cb_arg)
7198 {
7199 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7200 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7201 	struct spdk_bdev_io *bdev_io;
7202 
7203 	if (bio_cb_arg == NULL) {
7204 		return -EINVAL;
7205 	}
7206 
7207 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
7208 		return -ENOTSUP;
7209 	}
7210 
7211 	bdev_io = bdev_channel_get_io(channel);
7212 	if (bdev_io == NULL) {
7213 		return -ENOMEM;
7214 	}
7215 
7216 	bdev_io->internal.ch = channel;
7217 	bdev_io->internal.desc = desc;
7218 	bdev_io->internal.submit_tsc = spdk_get_ticks();
7219 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7220 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7221 
7222 	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
7223 
7224 	/* Parent abort request is not submitted directly, but to manage its execution,
7225 	 * add it to the submitted list here.
7226 	 */
7227 	bdev_ch_add_to_io_submitted(bdev_io);
7228 
7229 	bdev_abort(bdev_io);
7230 
7231 	return 0;
7232 }
7233 
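/*
 * Queue a wait entry to be notified once an spdk_bdev_io becomes available again,
 * typically after a submission call returned -ENOMEM.  'entry->bdev' must match
 * 'bdev' and the per-thread spdk_bdev_io cache must be empty, i.e. the caller really
 * did fail to get an spdk_bdev_io on this thread.  Illustrative retry pattern
 * (hypothetical 'ctx' and 'resubmit_cb' names):
 *
 *     ctx->bdev_io_wait.bdev = bdev;
 *     ctx->bdev_io_wait.cb_fn = resubmit_cb;
 *     ctx->bdev_io_wait.cb_arg = ctx;
 *     spdk_bdev_queue_io_wait(bdev, ch, &ctx->bdev_io_wait);
 */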
7234 int
7235 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
7236 			struct spdk_bdev_io_wait_entry *entry)
7237 {
7238 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7239 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
7240 
7241 	if (bdev != entry->bdev) {
7242 		SPDK_ERRLOG("bdevs do not match\n");
7243 		return -EINVAL;
7244 	}
7245 
7246 	if (mgmt_ch->per_thread_cache_count > 0) {
7247 		SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
7248 		return -EINVAL;
7249 	}
7250 
7251 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
7252 	return 0;
7253 }
7254 
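/*
 * Fold a completed I/O into the per-channel statistics.  Successful I/Os update the
 * byte, operation and latency counters for their type (zcopy counts only the start
 * phase, as either a read or a write).  Failed I/Os are instead tallied per error
 * status in the bdev-level io_error counters, which are shared and therefore
 * protected by the bdev spinlock.
 */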
7255 static inline void
7256 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
7257 {
7258 	enum spdk_bdev_io_status io_status = bdev_io->internal.status;
7259 	struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
7260 	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
7261 	uint32_t blocklen = bdev_io->bdev->blocklen;
7262 
7263 	if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7264 		switch (bdev_io->type) {
7265 		case SPDK_BDEV_IO_TYPE_READ:
7266 			io_stat->bytes_read += num_blocks * blocklen;
7267 			io_stat->num_read_ops++;
7268 			io_stat->read_latency_ticks += tsc_diff;
7269 			if (io_stat->max_read_latency_ticks < tsc_diff) {
7270 				io_stat->max_read_latency_ticks = tsc_diff;
7271 			}
7272 			if (io_stat->min_read_latency_ticks > tsc_diff) {
7273 				io_stat->min_read_latency_ticks = tsc_diff;
7274 			}
7275 			break;
7276 		case SPDK_BDEV_IO_TYPE_WRITE:
7277 			io_stat->bytes_written += num_blocks * blocklen;
7278 			io_stat->num_write_ops++;
7279 			io_stat->write_latency_ticks += tsc_diff;
7280 			if (io_stat->max_write_latency_ticks < tsc_diff) {
7281 				io_stat->max_write_latency_ticks = tsc_diff;
7282 			}
7283 			if (io_stat->min_write_latency_ticks > tsc_diff) {
7284 				io_stat->min_write_latency_ticks = tsc_diff;
7285 			}
7286 			break;
7287 		case SPDK_BDEV_IO_TYPE_UNMAP:
7288 			io_stat->bytes_unmapped += num_blocks * blocklen;
7289 			io_stat->num_unmap_ops++;
7290 			io_stat->unmap_latency_ticks += tsc_diff;
7291 			if (io_stat->max_unmap_latency_ticks < tsc_diff) {
7292 				io_stat->max_unmap_latency_ticks = tsc_diff;
7293 			}
7294 			if (io_stat->min_unmap_latency_ticks > tsc_diff) {
7295 				io_stat->min_unmap_latency_ticks = tsc_diff;
7296 			}
7297 			break;
7298 		case SPDK_BDEV_IO_TYPE_ZCOPY:
7299 			/* Track the data in the start phase only */
7300 			if (bdev_io->u.bdev.zcopy.start) {
7301 				if (bdev_io->u.bdev.zcopy.populate) {
7302 					io_stat->bytes_read += num_blocks * blocklen;
7303 					io_stat->num_read_ops++;
7304 					io_stat->read_latency_ticks += tsc_diff;
7305 					if (io_stat->max_read_latency_ticks < tsc_diff) {
7306 						io_stat->max_read_latency_ticks = tsc_diff;
7307 					}
7308 					if (io_stat->min_read_latency_ticks > tsc_diff) {
7309 						io_stat->min_read_latency_ticks = tsc_diff;
7310 					}
7311 				} else {
7312 					io_stat->bytes_written += num_blocks * blocklen;
7313 					io_stat->num_write_ops++;
7314 					io_stat->write_latency_ticks += tsc_diff;
7315 					if (io_stat->max_write_latency_ticks < tsc_diff) {
7316 						io_stat->max_write_latency_ticks = tsc_diff;
7317 					}
7318 					if (io_stat->min_write_latency_ticks > tsc_diff) {
7319 						io_stat->min_write_latency_ticks = tsc_diff;
7320 					}
7321 				}
7322 			}
7323 			break;
7324 		case SPDK_BDEV_IO_TYPE_COPY:
7325 			io_stat->bytes_copied += num_blocks * blocklen;
7326 			io_stat->num_copy_ops++;
7327 			io_stat->copy_latency_ticks += tsc_diff;
7328 			if (io_stat->max_copy_latency_ticks < tsc_diff) {
7329 				io_stat->max_copy_latency_ticks = tsc_diff;
7330 			}
7331 			if (io_stat->min_copy_latency_ticks > tsc_diff) {
7332 				io_stat->min_copy_latency_ticks = tsc_diff;
7333 			}
7334 			break;
7335 		default:
7336 			break;
7337 		}
7338 	} else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) {
7339 		io_stat = bdev_io->bdev->internal.stat;
7340 		assert(io_stat->io_error != NULL);
7341 
7342 		spdk_spin_lock(&bdev_io->bdev->internal.spinlock);
7343 		io_stat->io_error->error_status[-io_status - 1]++;
7344 		spdk_spin_unlock(&bdev_io->bdev->internal.spinlock);
7345 	}
7346 
7347 #ifdef SPDK_CONFIG_VTUNE
7348 	uint64_t now_tsc = spdk_get_ticks();
7349 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
7350 		uint64_t data[5];
7351 		struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
7352 
7353 		data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
7354 		data[1] = io_stat->bytes_read - prev_stat->bytes_read;
7355 		data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
7356 		data[3] = io_stat->bytes_written - prev_stat->bytes_written;
7357 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
7358 			  bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
7359 
7360 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
7361 				   __itt_metadata_u64, 5, data);
7362 
7363 		memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
7364 		bdev_io->internal.ch->start_tsc = now_tsc;
7365 	}
7366 #endif
7367 }
7368 
7369 static inline void
7370 _bdev_io_complete(void *ctx)
7371 {
7372 	struct spdk_bdev_io *bdev_io = ctx;
7373 
7374 	if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) {
7375 		assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7376 		spdk_accel_sequence_abort(bdev_io->internal.accel_sequence);
7377 	}
7378 
7379 	assert(bdev_io->internal.cb != NULL);
7380 	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
7381 
7382 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
7383 			     bdev_io->internal.caller_ctx);
7384 }
7385 
7386 static inline void
7387 bdev_io_complete(void *ctx)
7388 {
7389 	struct spdk_bdev_io *bdev_io = ctx;
7390 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7391 	uint64_t tsc, tsc_diff;
7392 
7393 	if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) {
7394 		/*
7395 		 * Defer completion to avoid potential infinite recursion if the
7396 		 * user's completion callback issues a new I/O.
7397 		 */
7398 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7399 				     bdev_io_complete, bdev_io);
7400 		return;
7401 	}
7402 
7403 	tsc = spdk_get_ticks();
7404 	tsc_diff = tsc - bdev_io->internal.submit_tsc;
7405 
7406 	bdev_ch_remove_from_io_submitted(bdev_io);
7407 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io,
7408 			      bdev_io->internal.caller_ctx, bdev_ch->queue_depth);
7409 
7410 	if (bdev_ch->histogram) {
7411 		if (bdev_io->bdev->internal.histogram_io_type == 0 ||
7412 		    bdev_io->bdev->internal.histogram_io_type == bdev_io->type) {
7413 			/*
7414 			 * Tally all I/O types if the histogram_io_type is set to 0.
7415 			 */
7416 			spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff);
7417 		}
7418 	}
7419 
7420 	bdev_io_update_io_stat(bdev_io, tsc_diff);
7421 	_bdev_io_complete(bdev_io);
7422 }
7423 
7424 /* The difference between this function and bdev_io_complete() is that this should be called to
7425  * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the
7426  * io_submitted list and don't have submit_tsc updated.
7427  */
7428 static inline void
7429 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)
7430 {
7431 	/* Since the IO hasn't been submitted, it is bound to have failed */
7432 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7433 
7434 	/* At this point we don't know if the IO is completed from submission context or not, but,
7435 	 * since this is an error path, we can always do an spdk_thread_send_msg(). */
7436 	spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7437 			     _bdev_io_complete, bdev_io);
7438 }
7439 
7440 static void bdev_destroy_cb(void *io_device);
7441 
7442 static void
7443 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
7444 {
7445 	struct spdk_bdev_io *bdev_io = _ctx;
7446 
7447 	if (bdev_io->u.reset.ch_ref != NULL) {
7448 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
7449 		bdev_io->u.reset.ch_ref = NULL;
7450 	}
7451 
7452 	bdev_io_complete(bdev_io);
7453 
7454 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
7455 	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
7456 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7457 	}
7458 }
7459 
7460 static void
7461 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7462 		      struct spdk_io_channel *_ch, void *_ctx)
7463 {
7464 	struct spdk_bdev_io *bdev_io = _ctx;
7465 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7466 	struct spdk_bdev_io *queued_reset;
7467 
7468 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
7469 	while (!TAILQ_EMPTY(&ch->queued_resets)) {
7470 		queued_reset = TAILQ_FIRST(&ch->queued_resets);
7471 		TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
7472 		spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
7473 	}
7474 
7475 	spdk_bdev_for_each_channel_continue(i, 0);
7476 }
7477 
7478 static void
7479 bdev_io_complete_sequence_cb(void *ctx, int status)
7480 {
7481 	struct spdk_bdev_io *bdev_io = ctx;
7482 
7483 	/* u.bdev.accel_sequence should have already been cleared at this point */
7484 	assert(bdev_io->u.bdev.accel_sequence == NULL);
7485 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
7486 	bdev_io->internal.f.has_accel_sequence = false;
7487 
7488 	if (spdk_unlikely(status != 0)) {
7489 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
7490 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7491 	}
7492 
7493 	bdev_io_complete(bdev_io);
7494 }
7495 
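/*
 * Main completion entry point called by bdev modules.  A reset completion may also
 * unfreeze the channels and flush their queued resets.  For other I/O types the
 * outstanding count is dropped first; on success any pending accel sequence is
 * executed and any bounce buffer is pushed back before the user callback runs, and
 * a NOMEM status may instead requeue the I/O for retry.
 */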
7496 void
7497 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
7498 {
7499 	struct spdk_bdev *bdev = bdev_io->bdev;
7500 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7501 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
7502 
7503 	if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) {
7504 		SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n",
7505 			    spdk_bdev_get_module_name(bdev),
7506 			    bdev_io_status_get_string(bdev_io->internal.status));
7507 		assert(false);
7508 	}
7509 	bdev_io->internal.status = status;
7510 
7511 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
7512 		bool unlock_channels = false;
7513 
7514 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
7515 			SPDK_ERRLOG("NOMEM returned for reset\n");
7516 		}
7517 		spdk_spin_lock(&bdev->internal.spinlock);
7518 		if (bdev_io == bdev->internal.reset_in_progress) {
7519 			bdev->internal.reset_in_progress = NULL;
7520 			unlock_channels = true;
7521 		}
7522 		spdk_spin_unlock(&bdev->internal.spinlock);
7523 
7524 		if (unlock_channels) {
7525 			spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
7526 						   bdev_reset_complete);
7527 			return;
7528 		}
7529 	} else {
7530 		bdev_io_decrement_outstanding(bdev_ch, shared_resource);
7531 		if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7532 			if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
7533 				bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
7534 				return;
7535 			} else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf &&
7536 						 !bdev_io_use_accel_sequence(bdev_io))) {
7537 				_bdev_io_push_bounce_data_buffer(bdev_io,
7538 								 _bdev_io_complete_push_bounce_done);
7539 				/* bdev IO will be completed in the callback */
7540 				return;
7541 			}
7542 		}
7543 
7544 		if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) {
7545 			return;
7546 		}
7547 	}
7548 
7549 	bdev_io_complete(bdev_io);
7550 }
7551 
7552 void
7553 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
7554 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
7555 {
7556 	enum spdk_bdev_io_status status;
7557 
7558 	if (sc == SPDK_SCSI_STATUS_GOOD) {
7559 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7560 	} else {
7561 		status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
7562 		bdev_io->internal.error.scsi.sc = sc;
7563 		bdev_io->internal.error.scsi.sk = sk;
7564 		bdev_io->internal.error.scsi.asc = asc;
7565 		bdev_io->internal.error.scsi.ascq = ascq;
7566 	}
7567 
7568 	spdk_bdev_io_complete(bdev_io, status);
7569 }
7570 
7571 void
7572 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
7573 			     int *sc, int *sk, int *asc, int *ascq)
7574 {
7575 	assert(sc != NULL);
7576 	assert(sk != NULL);
7577 	assert(asc != NULL);
7578 	assert(ascq != NULL);
7579 
7580 	switch (bdev_io->internal.status) {
7581 	case SPDK_BDEV_IO_STATUS_SUCCESS:
7582 		*sc = SPDK_SCSI_STATUS_GOOD;
7583 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
7584 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7585 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7586 		break;
7587 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7588 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
7589 		break;
7590 	case SPDK_BDEV_IO_STATUS_MISCOMPARE:
7591 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7592 		*sk = SPDK_SCSI_SENSE_MISCOMPARE;
7593 		*asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
7594 		*ascq = bdev_io->internal.error.scsi.ascq;
7595 		break;
7596 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7597 		*sc = bdev_io->internal.error.scsi.sc;
7598 		*sk = bdev_io->internal.error.scsi.sk;
7599 		*asc = bdev_io->internal.error.scsi.asc;
7600 		*ascq = bdev_io->internal.error.scsi.ascq;
7601 		break;
7602 	default:
7603 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7604 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
7605 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7606 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7607 		break;
7608 	}
7609 }
7610 
7611 void
7612 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
7613 {
7614 	enum spdk_bdev_io_status status;
7615 
7616 	if (aio_result == 0) {
7617 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7618 	} else {
7619 		status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
7620 	}
7621 
7622 	bdev_io->internal.error.aio_result = aio_result;
7623 
7624 	spdk_bdev_io_complete(bdev_io, status);
7625 }
7626 
7627 void
7628 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
7629 {
7630 	assert(aio_result != NULL);
7631 
7632 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
7633 		*aio_result = bdev_io->internal.error.aio_result;
7634 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7635 		*aio_result = 0;
7636 	} else {
7637 		*aio_result = -EIO;
7638 	}
7639 }
7640 
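/*
 * Complete an I/O with an NVMe status.  GENERIC/SUCCESS maps to
 * SPDK_BDEV_IO_STATUS_SUCCESS, GENERIC/ABORTED_BY_REQUEST to
 * SPDK_BDEV_IO_STATUS_ABORTED and everything else to SPDK_BDEV_IO_STATUS_NVME_ERROR.
 * The raw cdw0/sct/sc values are preserved and can be read back with
 * spdk_bdev_io_get_nvme_status().
 */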
7641 void
7642 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
7643 {
7644 	enum spdk_bdev_io_status status;
7645 
7646 	if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) {
7647 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7648 	} else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
7649 		status = SPDK_BDEV_IO_STATUS_ABORTED;
7650 	} else {
7651 		status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
7652 	}
7653 
7654 	bdev_io->internal.error.nvme.cdw0 = cdw0;
7655 	bdev_io->internal.error.nvme.sct = sct;
7656 	bdev_io->internal.error.nvme.sc = sc;
7657 
7658 	spdk_bdev_io_complete(bdev_io, status);
7659 }
7660 
7661 void
7662 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
7663 {
7664 	assert(sct != NULL);
7665 	assert(sc != NULL);
7666 	assert(cdw0 != NULL);
7667 
7668 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
7669 		*sct = SPDK_NVME_SCT_GENERIC;
7670 		*sc = SPDK_NVME_SC_SUCCESS;
7671 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7672 			*cdw0 = 0;
7673 		} else {
7674 			*cdw0 = 1U;
7675 		}
7676 		return;
7677 	}
7678 
7679 	if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7680 		*sct = SPDK_NVME_SCT_GENERIC;
7681 		*sc = SPDK_NVME_SC_SUCCESS;
7682 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7683 		*sct = bdev_io->internal.error.nvme.sct;
7684 		*sc = bdev_io->internal.error.nvme.sc;
7685 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7686 		*sct = SPDK_NVME_SCT_GENERIC;
7687 		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7688 	} else {
7689 		*sct = SPDK_NVME_SCT_GENERIC;
7690 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7691 	}
7692 
7693 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7694 }
7695 
7696 void
7697 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
7698 				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
7699 {
7700 	assert(first_sct != NULL);
7701 	assert(first_sc != NULL);
7702 	assert(second_sct != NULL);
7703 	assert(second_sc != NULL);
7704 	assert(cdw0 != NULL);
7705 
7706 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7707 		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
7708 		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
7709 			*first_sct = bdev_io->internal.error.nvme.sct;
7710 			*first_sc = bdev_io->internal.error.nvme.sc;
7711 			*second_sct = SPDK_NVME_SCT_GENERIC;
7712 			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7713 		} else {
7714 			*first_sct = SPDK_NVME_SCT_GENERIC;
7715 			*first_sc = SPDK_NVME_SC_SUCCESS;
7716 			*second_sct = bdev_io->internal.error.nvme.sct;
7717 			*second_sc = bdev_io->internal.error.nvme.sc;
7718 		}
7719 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7720 		*first_sct = SPDK_NVME_SCT_GENERIC;
7721 		*first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7722 		*second_sct = SPDK_NVME_SCT_GENERIC;
7723 		*second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7724 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7725 		*first_sct = SPDK_NVME_SCT_GENERIC;
7726 		*first_sc = SPDK_NVME_SC_SUCCESS;
7727 		*second_sct = SPDK_NVME_SCT_GENERIC;
7728 		*second_sc = SPDK_NVME_SC_SUCCESS;
7729 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
7730 		*first_sct = SPDK_NVME_SCT_GENERIC;
7731 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7732 		*second_sct = SPDK_NVME_SCT_GENERIC;
7733 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7734 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
7735 		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
7736 		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
7737 		*second_sct = SPDK_NVME_SCT_GENERIC;
7738 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7739 	} else {
7740 		*first_sct = SPDK_NVME_SCT_GENERIC;
7741 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7742 		*second_sct = SPDK_NVME_SCT_GENERIC;
7743 		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7744 	}
7745 
7746 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7747 }
7748 
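/*
 * Propagate the completion status of 'base_io' to 'bdev_io', preserving the
 * protocol-specific error details (NVMe, SCSI or AIO) when present and falling back
 * to the generic status otherwise.  Useful for virtual bdevs that complete their own
 * I/O based on the outcome of an I/O issued to a base bdev.
 */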
7749 void
7750 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io,
7751 				     const struct spdk_bdev_io *base_io)
7752 {
7753 	switch (base_io->internal.status) {
7754 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7755 		spdk_bdev_io_complete_nvme_status(bdev_io,
7756 						  base_io->internal.error.nvme.cdw0,
7757 						  base_io->internal.error.nvme.sct,
7758 						  base_io->internal.error.nvme.sc);
7759 		break;
7760 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7761 		spdk_bdev_io_complete_scsi_status(bdev_io,
7762 						  base_io->internal.error.scsi.sc,
7763 						  base_io->internal.error.scsi.sk,
7764 						  base_io->internal.error.scsi.asc,
7765 						  base_io->internal.error.scsi.ascq);
7766 		break;
7767 	case SPDK_BDEV_IO_STATUS_AIO_ERROR:
7768 		spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result);
7769 		break;
7770 	default:
7771 		spdk_bdev_io_complete(bdev_io, base_io->internal.status);
7772 		break;
7773 	}
7774 }
7775 
7776 struct spdk_thread *
7777 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
7778 {
7779 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
7780 }
7781 
7782 struct spdk_io_channel *
7783 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
7784 {
7785 	return bdev_io->internal.ch->channel;
7786 }
7787 
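/*
 * Common registration path.  Validates the name, allocates the bdev-level I/O
 * statistics, generates a UUID if none was provided and adds it as an alias, applies
 * defaults for write_unit_size, acwu, phys_blocklen, max_copy and max_write_zeroes,
 * registers the io_device and finally publishes the name in the global name tree and
 * the bdev list.  Any failure unwinds the steps taken so far and returns a negative
 * errno.
 */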
7788 static int
7789 bdev_register(struct spdk_bdev *bdev)
7790 {
7791 	char *bdev_name;
7792 	char uuid[SPDK_UUID_STRING_LEN];
7793 	struct spdk_iobuf_opts iobuf_opts;
7794 	int ret;
7795 
7796 	assert(bdev->module != NULL);
7797 
7798 	if (!bdev->name) {
7799 		SPDK_ERRLOG("Bdev name is NULL\n");
7800 		return -EINVAL;
7801 	}
7802 
7803 	if (!strlen(bdev->name)) {
7804 		SPDK_ERRLOG("Bdev name must not be an empty string\n");
7805 		return -EINVAL;
7806 	}
7807 
7808 	/* Users often register their own I/O devices using the bdev name. In
7809 	 * order to avoid conflicts, prepend bdev_. */
7810 	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
7811 	if (!bdev_name) {
7812 		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
7813 		return -ENOMEM;
7814 	}
7815 
7816 	bdev->internal.stat = bdev_alloc_io_stat(true);
7817 	if (!bdev->internal.stat) {
7818 		SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
7819 		free(bdev_name);
7820 		return -ENOMEM;
7821 	}
7822 
7823 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
7824 	bdev->internal.measured_queue_depth = UINT64_MAX;
7825 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
7826 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
7827 	bdev->internal.qd_poller = NULL;
7828 	bdev->internal.qos = NULL;
7829 
7830 	TAILQ_INIT(&bdev->internal.open_descs);
7831 	TAILQ_INIT(&bdev->internal.locked_ranges);
7832 	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
7833 	TAILQ_INIT(&bdev->aliases);
7834 
7835 	/* The UUID may be specified by the user or defined by the bdev itself.
7836 	 * Otherwise it is generated here, so this field is never empty. */
7837 	if (spdk_uuid_is_null(&bdev->uuid)) {
7838 		spdk_uuid_generate(&bdev->uuid);
7839 	}
7840 
7841 	/* Add the UUID alias only if it's different than the name */
7842 	spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7843 	if (strcmp(bdev->name, uuid) != 0) {
7844 		ret = spdk_bdev_alias_add(bdev, uuid);
7845 		if (ret != 0) {
7846 			SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
7847 			bdev_free_io_stat(bdev->internal.stat);
7848 			free(bdev_name);
7849 			return ret;
7850 		}
7851 	}
7852 
7853 	spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts));
7854 	if (spdk_bdev_get_buf_align(bdev) > 1) {
7855 		bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX,
7856 					     iobuf_opts.large_bufsize / bdev->blocklen);
7857 	}
7858 
7859 	/* If the user didn't specify a write unit size, set it to one. */
7860 	if (bdev->write_unit_size == 0) {
7861 		bdev->write_unit_size = 1;
7862 	}
7863 
7864 	/* Set the ACWU value to the write unit size if the bdev module did not set it (i.e. does not support it natively) */
7865 	if (bdev->acwu == 0) {
7866 		bdev->acwu = bdev->write_unit_size;
7867 	}
7868 
7869 	if (bdev->phys_blocklen == 0) {
7870 		bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
7871 	}
7872 
7873 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
7874 		bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
7875 	}
7876 
7877 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
7878 		bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE);
7879 	}
7880 
7881 	bdev->internal.reset_in_progress = NULL;
7882 	bdev->internal.qd_poll_in_progress = false;
7883 	bdev->internal.period = 0;
7884 	bdev->internal.new_period = 0;
7885 	bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name);
7886 
7887 	/*
7888 	 * Initialize the spinlock before registering the IO device because the spinlock
7889 	 * is used in bdev_channel_create().
7890 	 */
7891 	spdk_spin_init(&bdev->internal.spinlock);
7892 
7893 	spdk_io_device_register(__bdev_to_io_dev(bdev),
7894 				bdev_channel_create, bdev_channel_destroy,
7895 				sizeof(struct spdk_bdev_channel),
7896 				bdev_name);
7897 
7898 	/*
7899 	 * Register the bdev name only after the bdev object is fully ready.
7900 	 * Once bdev_name_add() returns, other threads may start using the bdev and
7901 	 * creating IO channels.
7902 	 */
7903 	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
7904 	if (ret != 0) {
7905 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL);
7906 		bdev_free_io_stat(bdev->internal.stat);
7907 		spdk_spin_destroy(&bdev->internal.spinlock);
7908 		free(bdev_name);
7909 		return ret;
7910 	}
7911 
7912 	free(bdev_name);
7913 
7914 	SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
7915 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
7916 
7917 	return 0;
7918 }
7919 
7920 static void
7921 bdev_destroy_cb(void *io_device)
7922 {
7923 	int			rc;
7924 	struct spdk_bdev	*bdev;
7925 	spdk_bdev_unregister_cb	cb_fn;
7926 	void			*cb_arg;
7927 
7928 	bdev = __bdev_from_io_dev(io_device);
7929 
7930 	if (bdev->internal.unregister_td != spdk_get_thread()) {
7931 		spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device);
7932 		return;
7933 	}
7934 
7935 	cb_fn = bdev->internal.unregister_cb;
7936 	cb_arg = bdev->internal.unregister_ctx;
7937 
7938 	spdk_spin_destroy(&bdev->internal.spinlock);
7939 	free(bdev->internal.qos);
7940 	bdev_free_io_stat(bdev->internal.stat);
7941 	spdk_trace_unregister_owner(bdev->internal.trace_id);
7942 
7943 	rc = bdev->fn_table->destruct(bdev->ctxt);
7944 	if (rc < 0) {
7945 		SPDK_ERRLOG("destruct failed\n");
7946 	}
7947 	if (rc <= 0 && cb_fn != NULL) {
7948 		cb_fn(cb_arg, rc);
7949 	}
7950 }
7951 
7952 void
7953 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
7954 {
7955 	if (bdev->internal.unregister_cb != NULL) {
7956 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
7957 	}
7958 }
7959 
7960 static void
7961 _remove_notify(void *arg)
7962 {
7963 	struct spdk_bdev_desc *desc = arg;
7964 
7965 	_event_notify(desc, SPDK_BDEV_EVENT_REMOVE);
7966 }
7967 
7968 /* returns: 0 - bdev removed and ready to be destructed.
7969  *          -EBUSY - bdev can't be destructed yet.  */
7970 static int
7971 bdev_unregister_unsafe(struct spdk_bdev *bdev)
7972 {
7973 	struct spdk_bdev_desc	*desc, *tmp;
7974 	int			rc = 0;
7975 	char			uuid[SPDK_UUID_STRING_LEN];
7976 
7977 	assert(spdk_spin_held(&g_bdev_mgr.spinlock));
7978 	assert(spdk_spin_held(&bdev->internal.spinlock));
7979 
7980 	/* Notify each descriptor about hotremoval */
7981 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
7982 		rc = -EBUSY;
7983 		/*
7984 		 * Defer invocation of the event_cb to a separate message that will
7985 		 *  run later on its thread.  This ensures this context unwinds and
7986 		 *  we don't recursively unregister this bdev again if the event_cb
7987 		 *  immediately closes its descriptor.
7988 		 */
7989 		event_notify(desc, _remove_notify);
7990 	}
7991 
7992 	/* If there are no descriptors, proceed removing the bdev */
7993 	if (rc == 0) {
7994 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
7995 		SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
7996 
7997 		/* Delete the name and the UUID alias */
7998 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7999 		bdev_name_del_unsafe(&bdev->internal.bdev_name);
8000 		bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
8001 
8002 		spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
8003 
8004 		if (bdev->internal.reset_in_progress != NULL) {
8005 			/* If reset is in progress, let the completion callback for reset
8006 			 * unregister the bdev.
8007 			 */
8008 			rc = -EBUSY;
8009 		}
8010 	}
8011 
8012 	return rc;
8013 }
8014 
8015 static void
8016 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8017 			      struct spdk_io_channel *io_ch, void *_ctx)
8018 {
8019 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
8020 
8021 	bdev_channel_abort_queued_ios(bdev_ch);
8022 	spdk_bdev_for_each_channel_continue(i, 0);
8023 }
8024 
8025 static void
8026 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
8027 {
8028 	int rc;
8029 
8030 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8031 	spdk_spin_lock(&bdev->internal.spinlock);
8032 	/*
8033 	 * Set the status to REMOVING only after aborting the channels has completed.
8034 	 * Otherwise, the last spdk_bdev_close() may call spdk_io_device_unregister()
8035 	 * while spdk_bdev_for_each_channel() is still executing, and
8036 	 * spdk_io_device_unregister() may fail.
8037 	 */
8038 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
8039 	rc = bdev_unregister_unsafe(bdev);
8040 	spdk_spin_unlock(&bdev->internal.spinlock);
8041 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8042 
8043 	if (rc == 0) {
8044 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
8045 	}
8046 }
8047 
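/*
 * Start unregistering a bdev.  Must be called from an SPDK thread, otherwise 'cb_fn'
 * is invoked with -ENOTSUP; if an unregister is already in progress it is invoked
 * with -EBUSY.  The bdev is marked UNREGISTERING, queued I/Os are aborted on every
 * channel and open descriptors are notified of the removal; the actual destruction
 * is deferred until the last descriptor is closed and any in-progress reset
 * completes.
 */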
8048 void
8049 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8050 {
8051 	struct spdk_thread	*thread;
8052 
8053 	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
8054 
8055 	thread = spdk_get_thread();
8056 	if (!thread) {
8057 		/* The user called this from a non-SPDK thread. */
8058 		if (cb_fn != NULL) {
8059 			cb_fn(cb_arg, -ENOTSUP);
8060 		}
8061 		return;
8062 	}
8063 
8064 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8065 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8066 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8067 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8068 		if (cb_fn) {
8069 			cb_fn(cb_arg, -EBUSY);
8070 		}
8071 		return;
8072 	}
8073 
8074 	spdk_spin_lock(&bdev->internal.spinlock);
8075 	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
8076 	bdev->internal.unregister_cb = cb_fn;
8077 	bdev->internal.unregister_ctx = cb_arg;
8078 	bdev->internal.unregister_td = thread;
8079 	spdk_spin_unlock(&bdev->internal.spinlock);
8080 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8081 
8082 	spdk_bdev_set_qd_sampling_period(bdev, 0);
8083 
8084 	spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
8085 				   bdev_unregister);
8086 }
8087 
8088 int
8089 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
8090 			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8091 {
8092 	struct spdk_bdev_desc *desc;
8093 	struct spdk_bdev *bdev;
8094 	int rc;
8095 
8096 	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
8097 	if (rc != 0) {
8098 		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
8099 		return rc;
8100 	}
8101 
8102 	bdev = spdk_bdev_desc_get_bdev(desc);
8103 
8104 	if (bdev->module != module) {
8105 		spdk_bdev_close(desc);
8106 		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
8107 			    bdev_name);
8108 		return -ENODEV;
8109 	}
8110 
8111 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
8112 
8113 	spdk_bdev_close(desc);
8114 
8115 	return 0;
8116 }
8117 
8118 static int
8119 bdev_start_qos(struct spdk_bdev *bdev)
8120 {
8121 	struct set_qos_limit_ctx *ctx;
8122 
8123 	/* Enable QoS */
8124 	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
8125 		ctx = calloc(1, sizeof(*ctx));
8126 		if (ctx == NULL) {
8127 			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
8128 			return -ENOMEM;
8129 		}
8130 		ctx->bdev = bdev;
8131 		spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
8132 	}
8133 
8134 	return 0;
8135 }
8136 
8137 static void
8138 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail,
8139 		    struct spdk_bdev *bdev)
8140 {
8141 	enum spdk_bdev_claim_type type;
8142 	const char *typename, *modname;
8143 	extern struct spdk_log_flag SPDK_LOG_bdev;
8144 
8145 	assert(spdk_spin_held(&bdev->internal.spinlock));
8146 
8147 	if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) {
8148 		return;
8149 	}
8150 
8151 	type = bdev->internal.claim_type;
8152 	typename = spdk_bdev_claim_get_name(type);
8153 
8154 	if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) {
8155 		modname = bdev->internal.claim.v1.module->name;
8156 		spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8157 			 bdev->name, detail, typename, modname);
8158 		return;
8159 	}
8160 
8161 	if (claim_type_is_v2(type)) {
8162 		struct spdk_bdev_module_claim *claim;
8163 
8164 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
8165 			modname = claim->module->name;
8166 			spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8167 				 bdev->name, detail, typename, modname);
8168 		}
8169 		return;
8170 	}
8171 
8172 	assert(false);
8173 }
8174 
8175 static int
8176 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
8177 {
8178 	struct spdk_thread *thread;
8179 	int rc = 0;
8180 
8181 	thread = spdk_get_thread();
8182 	if (!thread) {
8183 		SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
8184 		return -ENOTSUP;
8185 	}
8186 
8187 	SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
8188 		      spdk_get_thread());
8189 
8190 	desc->bdev = bdev;
8191 	desc->thread = thread;
8192 	desc->write = write;
8193 
8194 	spdk_spin_lock(&bdev->internal.spinlock);
8195 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8196 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8197 		spdk_spin_unlock(&bdev->internal.spinlock);
8198 		return -ENODEV;
8199 	}
8200 
8201 	if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8202 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8203 		spdk_spin_unlock(&bdev->internal.spinlock);
8204 		return -EPERM;
8205 	}
8206 
8207 	rc = bdev_start_qos(bdev);
8208 	if (rc != 0) {
8209 		SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
8210 		spdk_spin_unlock(&bdev->internal.spinlock);
8211 		return rc;
8212 	}
8213 
8214 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
8215 
8216 	spdk_spin_unlock(&bdev->internal.spinlock);
8217 
8218 	return 0;
8219 }
8220 
8221 static int
8222 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
8223 		struct spdk_bdev_desc **_desc)
8224 {
8225 	struct spdk_bdev_desc *desc;
8226 	unsigned int i;
8227 
8228 	desc = calloc(1, sizeof(*desc));
8229 	if (desc == NULL) {
8230 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
8231 		return -ENOMEM;
8232 	}
8233 
8234 	TAILQ_INIT(&desc->pending_media_events);
8235 	TAILQ_INIT(&desc->free_media_events);
8236 
8237 	desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
8238 	desc->callback.event_fn = event_cb;
8239 	desc->callback.ctx = event_ctx;
8240 	spdk_spin_init(&desc->spinlock);
8241 
8242 	if (bdev->media_events) {
8243 		desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
8244 						   sizeof(*desc->media_events_buffer));
8245 		if (desc->media_events_buffer == NULL) {
8246 			SPDK_ERRLOG("Failed to initialize media event pool\n");
8247 			bdev_desc_free(desc);
8248 			return -ENOMEM;
8249 		}
8250 
8251 		for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) {
8252 			TAILQ_INSERT_TAIL(&desc->free_media_events,
8253 					  &desc->media_events_buffer[i], tailq);
8254 		}
8255 	}
8256 
8257 	if (bdev->fn_table->accel_sequence_supported != NULL) {
8258 		for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
8259 			desc->accel_sequence_supported[i] =
8260 				bdev->fn_table->accel_sequence_supported(bdev->ctxt,
8261 						(enum spdk_bdev_io_type)i);
8262 		}
8263 	}
8264 
8265 	*_desc = desc;
8266 
8267 	return 0;
8268 }
8269 
8270 static int
8271 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8272 	      void *event_ctx, struct spdk_bdev_desc **_desc)
8273 {
8274 	struct spdk_bdev_desc *desc;
8275 	struct spdk_bdev *bdev;
8276 	int rc;
8277 
8278 	bdev = bdev_get_by_name(bdev_name);
8279 
8280 	if (bdev == NULL) {
8281 		SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
8282 		return -ENODEV;
8283 	}
8284 
8285 	rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc);
8286 	if (rc != 0) {
8287 		return rc;
8288 	}
8289 
8290 	rc = bdev_open(bdev, write, desc);
8291 	if (rc != 0) {
8292 		bdev_desc_free(desc);
8293 		desc = NULL;
8294 	}
8295 
8296 	*_desc = desc;
8297 
8298 	return rc;
8299 }
8300 
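/*
 * Open a bdev by name.  'event_cb' is mandatory and is used to deliver asynchronous
 * events such as SPDK_BDEV_EVENT_REMOVE for the returned descriptor.  Illustrative
 * usage (hypothetical 'my_event_cb' and bdev name):
 *
 *     struct spdk_bdev_desc *desc;
 *     int rc = spdk_bdev_open_ext("Malloc0", true, my_event_cb, NULL, &desc);
 *     if (rc == 0) {
 *             ... use desc, then spdk_bdev_close(desc) on the same thread ...
 *     }
 */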
8301 int
8302 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8303 		   void *event_ctx, struct spdk_bdev_desc **_desc)
8304 {
8305 	int rc;
8306 
8307 	if (event_cb == NULL) {
8308 		SPDK_ERRLOG("Missing event callback function\n");
8309 		return -EINVAL;
8310 	}
8311 
8312 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8313 	rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc);
8314 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8315 
8316 	return rc;
8317 }
8318 
8319 struct spdk_bdev_open_async_ctx {
8320 	char					*bdev_name;
8321 	spdk_bdev_event_cb_t			event_cb;
8322 	void					*event_ctx;
8323 	bool					write;
8324 	int					rc;
8325 	spdk_bdev_open_async_cb_t		cb_fn;
8326 	void					*cb_arg;
8327 	struct spdk_bdev_desc			*desc;
8328 	struct spdk_bdev_open_async_opts	opts;
8329 	uint64_t				start_ticks;
8330 	struct spdk_thread			*orig_thread;
8331 	struct spdk_poller			*poller;
8332 	TAILQ_ENTRY(spdk_bdev_open_async_ctx)	tailq;
8333 };
8334 
8335 static void
8336 bdev_open_async_done(void *arg)
8337 {
8338 	struct spdk_bdev_open_async_ctx *ctx = arg;
8339 
8340 	ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg);
8341 
8342 	free(ctx->bdev_name);
8343 	free(ctx);
8344 }
8345 
8346 static void
8347 bdev_open_async_cancel(void *arg)
8348 {
8349 	struct spdk_bdev_open_async_ctx *ctx = arg;
8350 
8351 	assert(ctx->rc == -ESHUTDOWN);
8352 
8353 	spdk_poller_unregister(&ctx->poller);
8354 
8355 	bdev_open_async_done(ctx);
8356 }
8357 
8358 /* This is called when the bdev library finishes at shutdown. */
8359 static void
8360 bdev_open_async_fini(void)
8361 {
8362 	struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx;
8363 
8364 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8365 	TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) {
8366 		TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8367 		/*
8368 		 * We have to move to ctx->orig_thread to unregister ctx->poller.
8369 		 * However, there is a chance that ctx->poller runs before the
8370 		 * message is executed, which could result in bdev_open_async_done()
8371 		 * being called twice. To avoid such a race condition, set ctx->rc to
8372 		 * -ESHUTDOWN.
8373 		 */
8374 		ctx->rc = -ESHUTDOWN;
8375 		spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx);
8376 	}
8377 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8378 }
8379 
8380 static int bdev_open_async(void *arg);
8381 
8382 static void
8383 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx)
8384 {
8385 	uint64_t timeout_ticks;
8386 
8387 	if (ctx->rc == -ESHUTDOWN) {
8388 		/* This context is being canceled. Do nothing. */
8389 		return;
8390 	}
8391 
8392 	ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx,
8393 				&ctx->desc);
8394 	if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) {
8395 		goto exit;
8396 	}
8397 
8398 	timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull;
8399 	if (spdk_get_ticks() >= timeout_ticks) {
8400 		SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name);
8401 		ctx->rc = -ETIMEDOUT;
8402 		goto exit;
8403 	}
8404 
8405 	return;
8406 
8407 exit:
8408 	spdk_poller_unregister(&ctx->poller);
8409 	TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8410 
8411 	/* Completion callback is processed after stack unwinding. */
8412 	spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
8413 }
8414 
8415 static int
8416 bdev_open_async(void *arg)
8417 {
8418 	struct spdk_bdev_open_async_ctx *ctx = arg;
8419 
8420 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8421 
8422 	_bdev_open_async(ctx);
8423 
8424 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8425 
8426 	return SPDK_POLLER_BUSY;
8427 }
8428 
8429 static void
8430 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
8431 			  struct spdk_bdev_open_async_opts *opts_src,
8432 			  size_t size)
8433 {
8434 	assert(opts);
8435 	assert(opts_src);
8436 
8437 	opts->size = size;
8438 
8439 #define SET_FIELD(field) \
8440 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8441 		opts->field = opts_src->field; \
8442 	} \
8443 
8444 	SET_FIELD(timeout_ms);
8445 
8446 	/* Do not remove this statement. Always update it when adding a new field,
8447 	 * and do not forget to add a SET_FIELD statement for the new field. */
8448 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size");
8449 
8450 #undef SET_FIELD
8451 }
8452 
8453 static void
8454 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size)
8455 {
8456 	assert(opts);
8457 
8458 	opts->size = size;
8459 
8460 #define SET_FIELD(field, value) \
8461 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8462 		opts->field = value; \
8463 	} \
8464 
8465 	SET_FIELD(timeout_ms, 0);
8466 
8467 #undef SET_FIELD
8468 }
8469 
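/*
 * Asynchronous variant of spdk_bdev_open_ext().  If the open does not immediately
 * succeed, a poller retries it every 100 ms until it succeeds or opts->timeout_ms
 * elapses (a timeout of 0 means a single attempt).  'open_cb' is always invoked on
 * the calling thread with the resulting descriptor and return code.
 */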
8470 int
8471 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8472 		     void *event_ctx, struct spdk_bdev_open_async_opts *opts,
8473 		     spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg)
8474 {
8475 	struct spdk_bdev_open_async_ctx *ctx;
8476 
8477 	if (event_cb == NULL) {
8478 		SPDK_ERRLOG("Missing event callback function\n");
8479 		return -EINVAL;
8480 	}
8481 
8482 	if (open_cb == NULL) {
8483 		SPDK_ERRLOG("Missing open callback function\n");
8484 		return -EINVAL;
8485 	}
8486 
8487 	if (opts != NULL && opts->size == 0) {
8488 		SPDK_ERRLOG("size in the options structure should not be zero\n");
8489 		return -EINVAL;
8490 	}
8491 
8492 	ctx = calloc(1, sizeof(*ctx));
8493 	if (ctx == NULL) {
8494 		SPDK_ERRLOG("Failed to allocate open context\n");
8495 		return -ENOMEM;
8496 	}
8497 
8498 	ctx->bdev_name = strdup(bdev_name);
8499 	if (ctx->bdev_name == NULL) {
8500 		SPDK_ERRLOG("Failed to duplicate bdev_name\n");
8501 		free(ctx);
8502 		return -ENOMEM;
8503 	}
8504 
8505 	ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000);
8506 	if (ctx->poller == NULL) {
8507 		SPDK_ERRLOG("Failed to register bdev_open_async poller\n");
8508 		free(ctx->bdev_name);
8509 		free(ctx);
8510 		return -ENOMEM;
8511 	}
8512 
8513 	ctx->cb_fn = open_cb;
8514 	ctx->cb_arg = open_cb_arg;
8515 	ctx->write = write;
8516 	ctx->event_cb = event_cb;
8517 	ctx->event_ctx = event_ctx;
8518 	ctx->orig_thread = spdk_get_thread();
8519 	ctx->start_ticks = spdk_get_ticks();
8520 
8521 	bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts));
8522 	if (opts != NULL) {
8523 		bdev_open_async_opts_copy(&ctx->opts, opts, opts->size);
8524 	}
8525 
8526 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8527 
8528 	TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8529 	_bdev_open_async(ctx);
8530 
8531 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8532 
8533 	return 0;
8534 }
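/*
 * Illustrative usage sketch (my_event_cb and my_open_cb are hypothetical caller functions
 * matching spdk_bdev_event_cb_t and spdk_bdev_open_async_cb_t; "Nvme0n1" is an example name):
 *
 *	struct spdk_bdev_open_async_opts opts = { .size = sizeof(opts), .timeout_ms = 5000 };
 *
 *	// The poller retries every 100 ms for up to 5 seconds if the bdev has not appeared yet.
 *	spdk_bdev_open_async("Nvme0n1", false, my_event_cb, NULL, &opts, my_open_cb, NULL);
 */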
8535 
8536 static void
8537 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
8538 {
8539 	int rc;
8540 
8541 	spdk_spin_lock(&bdev->internal.spinlock);
8542 	spdk_spin_lock(&desc->spinlock);
8543 
8544 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
8545 
8546 	desc->closed = true;
8547 
8548 	if (desc->claim != NULL) {
8549 		bdev_desc_release_claims(desc);
8550 	}
8551 
8552 	if (0 == desc->refs) {
8553 		spdk_spin_unlock(&desc->spinlock);
8554 		bdev_desc_free(desc);
8555 	} else {
8556 		spdk_spin_unlock(&desc->spinlock);
8557 	}
8558 
8559 	/* If no more descriptors, kill QoS channel */
8560 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8561 		SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
8562 			      bdev->name, spdk_get_thread());
8563 
8564 		if (bdev_qos_destroy(bdev)) {
8565 			/* There isn't anything we can do to recover here. Just let the
8566 			 * old QoS poller keep running. The QoS handling won't change
8567 			 * cores when the user allocates a new channel, but it won't break. */
8568 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
8569 		}
8570 	}
8571 
8572 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8573 		rc = bdev_unregister_unsafe(bdev);
8574 		spdk_spin_unlock(&bdev->internal.spinlock);
8575 
8576 		if (rc == 0) {
8577 			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
8578 		}
8579 	} else {
8580 		spdk_spin_unlock(&bdev->internal.spinlock);
8581 	}
8582 }
8583 
8584 void
8585 spdk_bdev_close(struct spdk_bdev_desc *desc)
8586 {
8587 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8588 
8589 	SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
8590 		      spdk_get_thread());
8591 
8592 	assert(desc->thread == spdk_get_thread());
8593 
8594 	spdk_poller_unregister(&desc->io_timeout_poller);
8595 
8596 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8597 
8598 	bdev_close(bdev, desc);
8599 
8600 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8601 }
8602 
8603 int32_t
8604 spdk_bdev_get_numa_id(struct spdk_bdev *bdev)
8605 {
8606 	if (bdev->numa.id_valid) {
8607 		return bdev->numa.id;
8608 	} else {
8609 		return SPDK_ENV_NUMA_ID_ANY;
8610 	}
8611 }
8612 
8613 static void
8614 bdev_register_finished(void *arg)
8615 {
8616 	struct spdk_bdev_desc *desc = arg;
8617 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8618 
8619 	spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
8620 
8621 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8622 
8623 	bdev_close(bdev, desc);
8624 
8625 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8626 }
8627 
8628 int
8629 spdk_bdev_register(struct spdk_bdev *bdev)
8630 {
8631 	struct spdk_bdev_desc *desc;
8632 	struct spdk_thread *thread = spdk_get_thread();
8633 	int rc;
8634 
8635 	if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) {
8636 		SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread,
8637 			    thread ? spdk_thread_get_name(thread) : "null");
8638 		return -EINVAL;
8639 	}
8640 
8641 	rc = bdev_register(bdev);
8642 	if (rc != 0) {
8643 		return rc;
8644 	}
8645 
8646 	/* A descriptor is opened to prevent bdev deletion during examination */
8647 	rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8648 	if (rc != 0) {
8649 		spdk_bdev_unregister(bdev, NULL, NULL);
8650 		return rc;
8651 	}
8652 
8653 	rc = bdev_open(bdev, false, desc);
8654 	if (rc != 0) {
8655 		bdev_desc_free(desc);
8656 		spdk_bdev_unregister(bdev, NULL, NULL);
8657 		return rc;
8658 	}
8659 
8660 	/* Examine configuration before initializing I/O */
8661 	bdev_examine(bdev);
8662 
8663 	rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
8664 	if (rc != 0) {
8665 		bdev_close(bdev, desc);
8666 		spdk_bdev_unregister(bdev, NULL, NULL);
8667 	}
8668 
8669 	return rc;
8670 }
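/*
 * Illustrative registration sketch for a bdev module (my_bdev_fn_table and my_module are
 * hypothetical objects a module would define elsewhere; error handling omitted):
 *
 *	struct spdk_bdev *bdev = calloc(1, sizeof(*bdev));
 *
 *	bdev->name = strdup("my_bdev0");
 *	bdev->product_name = "My Product";
 *	bdev->blocklen = 512;
 *	bdev->blockcnt = 1024 * 1024;
 *	bdev->fn_table = &my_bdev_fn_table;
 *	bdev->module = &my_module;
 *	// Must be called on the app thread; examine callbacks run before registration completes.
 *	rc = spdk_bdev_register(bdev);
 */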
8671 
8672 int
8673 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
8674 			    struct spdk_bdev_module *module)
8675 {
8676 	spdk_spin_lock(&bdev->internal.spinlock);
8677 
8678 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8679 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8680 		spdk_spin_unlock(&bdev->internal.spinlock);
8681 		return -EPERM;
8682 	}
8683 
8684 	if (desc && !desc->write) {
8685 		desc->write = true;
8686 	}
8687 
8688 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE;
8689 	bdev->internal.claim.v1.module = module;
8690 
8691 	spdk_spin_unlock(&bdev->internal.spinlock);
8692 	return 0;
8693 }
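/*
 * Illustrative sketch of the legacy (v1) claim flow (my_module is a hypothetical
 * struct spdk_bdev_module; desc comes from a prior open of the same bdev):
 *
 *	rc = spdk_bdev_module_claim_bdev(bdev, desc, &my_module);
 *	if (rc == 0) {
 *		// bdev is now claimed SPDK_BDEV_CLAIM_EXCL_WRITE and desc was promoted to writable.
 *		// ...
 *		spdk_bdev_module_release_bdev(bdev);
 *	}
 */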
8694 
8695 void
8696 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
8697 {
8698 	spdk_spin_lock(&bdev->internal.spinlock);
8699 
8700 	assert(bdev->internal.claim.v1.module != NULL);
8701 	assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE);
8702 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8703 	bdev->internal.claim.v1.module = NULL;
8704 
8705 	spdk_spin_unlock(&bdev->internal.spinlock);
8706 }
8707 
8708 /*
8709  * Start claims v2
8710  */
8711 
8712 const char *
8713 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type)
8714 {
8715 	switch (type) {
8716 	case SPDK_BDEV_CLAIM_NONE:
8717 		return "not_claimed";
8718 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
8719 		return "exclusive_write";
8720 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8721 		return "read_many_write_one";
8722 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8723 		return "read_many_write_none";
8724 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8725 		return "read_many_write_many";
8726 	default:
8727 		break;
8728 	}
8729 	return "invalid_claim";
8730 }
8731 
8732 static bool
8733 claim_type_is_v2(enum spdk_bdev_claim_type type)
8734 {
8735 	switch (type) {
8736 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8737 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8738 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8739 		return true;
8740 	default:
8741 		break;
8742 	}
8743 	return false;
8744 }
8745 
8746 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */
8747 static bool
8748 claim_type_promotes_to_write(enum spdk_bdev_claim_type type)
8749 {
8750 	switch (type) {
8751 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8752 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8753 		return true;
8754 	default:
8755 		break;
8756 	}
8757 	return false;
8758 }
8759 
8760 void
8761 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size)
8762 {
8763 	if (opts == NULL) {
8764 		SPDK_ERRLOG("opts should not be NULL\n");
8765 		assert(opts != NULL);
8766 		return;
8767 	}
8768 	if (size == 0) {
8769 		SPDK_ERRLOG("size should not be zero\n");
8770 		assert(size != 0);
8771 		return;
8772 	}
8773 
8774 	memset(opts, 0, size);
8775 	opts->opts_size = size;
8776 
8777 #define FIELD_OK(field) \
8778         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size
8779 
8780 #define SET_FIELD(field, value) \
8781         if (FIELD_OK(field)) { \
8782                 opts->field = value; \
8783         } \
8784 
8785 	SET_FIELD(shared_claim_key, 0);
8786 
8787 #undef FIELD_OK
8788 #undef SET_FIELD
8789 }
8790 
8791 static int
8792 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst)
8793 {
8794 	if (src->opts_size == 0) {
8795 		SPDK_ERRLOG("size should not be zero\n");
8796 		return -1;
8797 	}
8798 
8799 	memset(dst, 0, sizeof(*dst));
8800 	dst->opts_size = src->opts_size;
8801 
8802 #define FIELD_OK(field) \
8803         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size
8804 
8805 #define SET_FIELD(field) \
8806         if (FIELD_OK(field)) { \
8807                 dst->field = src->field; \
8808         } \
8809 
8810 	if (FIELD_OK(name)) {
8811 		snprintf(dst->name, sizeof(dst->name), "%s", src->name);
8812 	}
8813 
8814 	SET_FIELD(shared_claim_key);
8815 
8816 	/* Do not remove this statement. Update the assert whenever you add a new field,
8817 	 * and also add a corresponding SET_FIELD statement. */
8818 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size");
8819 
8820 #undef FIELD_OK
8821 #undef SET_FIELD
8822 	return 0;
8823 }
8824 
8825 /* Returns 0 if a read-write-once claim can be taken. */
8826 static int
8827 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8828 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8829 {
8830 	struct spdk_bdev *bdev = desc->bdev;
8831 	struct spdk_bdev_desc *open_desc;
8832 
8833 	assert(spdk_spin_held(&bdev->internal.spinlock));
8834 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);
8835 
8836 	if (opts->shared_claim_key != 0) {
8837 		SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
8838 			    bdev->name);
8839 		return -EINVAL;
8840 	}
8841 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8842 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8843 		return -EPERM;
8844 	}
8845 	if (desc->claim != NULL) {
8846 		SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
8847 			       bdev->name, desc->claim->module->name);
8848 		return -EPERM;
8849 	}
8850 	TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8851 		if (desc != open_desc && open_desc->write) {
8852 			SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
8853 				       "another descriptor is open for writing\n",
8854 				       bdev->name);
8855 			return -EPERM;
8856 		}
8857 	}
8858 
8859 	return 0;
8860 }
8861 
8862 /* Returns 0 if a read-only-many claim can be taken. */
8863 static int
8864 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8865 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8866 {
8867 	struct spdk_bdev *bdev = desc->bdev;
8868 	struct spdk_bdev_desc *open_desc;
8869 
8870 	assert(spdk_spin_held(&bdev->internal.spinlock));
8871 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
8872 	assert(desc->claim == NULL);
8873 
8874 	if (desc->write) {
8875 		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
8876 			    bdev->name);
8877 		return -EINVAL;
8878 	}
8879 	if (opts->shared_claim_key != 0) {
8880 		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
8881 		return -EINVAL;
8882 	}
8883 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8884 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8885 			if (open_desc->write) {
8886 				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
8887 					       "another descriptor is open for writing\n",
8888 					       bdev->name);
8889 				return -EPERM;
8890 			}
8891 		}
8892 	}
8893 
8894 	return 0;
8895 }
8896 
8897 /* Returns 0 if a read-write-many claim can be taken. */
8898 static int
8899 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8900 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8901 {
8902 	struct spdk_bdev *bdev = desc->bdev;
8903 	struct spdk_bdev_desc *open_desc;
8904 
8905 	assert(spdk_spin_held(&bdev->internal.spinlock));
8906 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
8907 	assert(desc->claim == NULL);
8908 
8909 	if (opts->shared_claim_key == 0) {
8910 		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
8911 			    bdev->name);
8912 		return -EINVAL;
8913 	}
8914 	switch (bdev->internal.claim_type) {
8915 	case SPDK_BDEV_CLAIM_NONE:
8916 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8917 			if (open_desc == desc) {
8918 				continue;
8919 			}
8920 			if (open_desc->write) {
8921 				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
8922 					       "another descriptor is open for writing without a "
8923 					       "claim\n", bdev->name);
8924 				return -EPERM;
8925 			}
8926 		}
8927 		break;
8928 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8929 		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
8930 			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
8931 			return -EPERM;
8932 		}
8933 		break;
8934 	default:
8935 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8936 		return -EBUSY;
8937 	}
8938 
8939 	return 0;
8940 }
8941 
8942 /* Updates desc and its bdev with a v2 claim. */
8943 static int
8944 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8945 	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8946 {
8947 	struct spdk_bdev *bdev = desc->bdev;
8948 	struct spdk_bdev_module_claim *claim;
8949 
8950 	assert(spdk_spin_held(&bdev->internal.spinlock));
8951 	assert(claim_type_is_v2(type));
8952 	assert(desc->claim == NULL);
8953 
8954 	claim = calloc(1, sizeof(*desc->claim));
8955 	if (claim == NULL) {
8956 		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
8957 		return -ENOMEM;
8958 	}
8959 	claim->module = module;
8960 	claim->desc = desc;
8961 	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
8962 	memcpy(claim->name, opts->name, sizeof(claim->name));
8963 	desc->claim = claim;
8964 
8965 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8966 		bdev->internal.claim_type = type;
8967 		TAILQ_INIT(&bdev->internal.claim.v2.claims);
8968 		bdev->internal.claim.v2.key = opts->shared_claim_key;
8969 	}
8970 	assert(type == bdev->internal.claim_type);
8971 
8972 	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);
8973 
8974 	if (!desc->write && claim_type_promotes_to_write(type)) {
8975 		desc->write = true;
8976 	}
8977 
8978 	return 0;
8979 }
8980 
8981 int
8982 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8983 				 struct spdk_bdev_claim_opts *_opts,
8984 				 struct spdk_bdev_module *module)
8985 {
8986 	struct spdk_bdev *bdev;
8987 	struct spdk_bdev_claim_opts opts;
8988 	int rc = 0;
8989 
8990 	if (desc == NULL) {
8991 		SPDK_ERRLOG("descriptor must not be NULL\n");
8992 		return -EINVAL;
8993 	}
8994 
8995 	bdev = desc->bdev;
8996 
8997 	if (_opts == NULL) {
8998 		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
8999 	} else if (claim_opts_copy(_opts, &opts) != 0) {
9000 		return -EINVAL;
9001 	}
9002 
9003 	spdk_spin_lock(&bdev->internal.spinlock);
9004 
9005 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
9006 	    bdev->internal.claim_type != type) {
9007 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9008 		spdk_spin_unlock(&bdev->internal.spinlock);
9009 		return -EPERM;
9010 	}
9011 
9012 	if (claim_type_is_v2(type) && desc->claim != NULL) {
9013 		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
9014 			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
9015 		spdk_spin_unlock(&bdev->internal.spinlock);
9016 		return -EPERM;
9017 	}
9018 
9019 	switch (type) {
9020 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
9021 		spdk_spin_unlock(&bdev->internal.spinlock);
9022 		return spdk_bdev_module_claim_bdev(bdev, desc, module);
9023 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9024 		rc = claim_verify_rwo(desc, type, &opts, module);
9025 		break;
9026 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9027 		rc = claim_verify_rom(desc, type, &opts, module);
9028 		break;
9029 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9030 		rc = claim_verify_rwm(desc, type, &opts, module);
9031 		break;
9032 	default:
9033 		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
9034 		rc = -ENOTSUP;
9035 	}
9036 
9037 	if (rc == 0) {
9038 		rc = claim_bdev(desc, type, &opts, module);
9039 	}
9040 
9041 	spdk_spin_unlock(&bdev->internal.spinlock);
9042 	return rc;
9043 }
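/*
 * Illustrative sketch of a v2 claim (my_module is hypothetical; desc comes from an earlier
 * open of the bdev; the claim is released automatically when desc is closed):
 *
 *	struct spdk_bdev_claim_opts opts;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	snprintf(opts.name, sizeof(opts.name), "my_claim");
 *	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *					      &opts, &my_module);
 */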
9044 
9045 static void
9046 claim_reset(struct spdk_bdev *bdev)
9047 {
9048 	assert(spdk_spin_held(&bdev->internal.spinlock));
9049 	assert(claim_type_is_v2(bdev->internal.claim_type));
9050 	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));
9051 
9052 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
9053 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
9054 }
9055 
9056 static void
9057 bdev_desc_release_claims(struct spdk_bdev_desc *desc)
9058 {
9059 	struct spdk_bdev *bdev = desc->bdev;
9060 
9061 	assert(spdk_spin_held(&bdev->internal.spinlock));
9062 	assert(claim_type_is_v2(bdev->internal.claim_type));
9063 
9064 	if (bdev->internal.examine_in_progress == 0) {
9065 		TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
9066 		free(desc->claim);
9067 		if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
9068 			claim_reset(bdev);
9069 		}
9070 	} else {
9071 		/* This is a dead claim that will be cleaned up when bdev_examine() is done. */
9072 		desc->claim->module = NULL;
9073 		desc->claim->desc = NULL;
9074 	}
9075 	desc->claim = NULL;
9076 }
9077 
9078 /*
9079  * End claims v2
9080  */
9081 
9082 struct spdk_bdev *
9083 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
9084 {
9085 	assert(desc != NULL);
9086 	return desc->bdev;
9087 }
9088 
9089 int
9090 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
9091 {
9092 	struct spdk_bdev *bdev, *tmp;
9093 	struct spdk_bdev_desc *desc;
9094 	int rc = 0;
9095 
9096 	assert(fn != NULL);
9097 
9098 	spdk_spin_lock(&g_bdev_mgr.spinlock);
9099 	bdev = spdk_bdev_first();
9100 	while (bdev != NULL) {
9101 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
9102 		if (rc != 0) {
9103 			break;
9104 		}
9105 		rc = bdev_open(bdev, false, desc);
9106 		if (rc != 0) {
9107 			bdev_desc_free(desc);
9108 			if (rc == -ENODEV) {
9109 				/* Ignore the error and move to the next bdev. */
9110 				rc = 0;
9111 				bdev = spdk_bdev_next(bdev);
9112 				continue;
9113 			}
9114 			break;
9115 		}
9116 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
9117 
9118 		rc = fn(ctx, bdev);
9119 
9120 		spdk_spin_lock(&g_bdev_mgr.spinlock);
9121 		tmp = spdk_bdev_next(bdev);
9122 		bdev_close(bdev, desc);
9123 		if (rc != 0) {
9124 			break;
9125 		}
9126 		bdev = tmp;
9127 	}
9128 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
9129 
9130 	return rc;
9131 }
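/*
 * Illustrative sketch (count_bdev is a hypothetical callback; it runs while a temporary
 * read-only descriptor is held on each bdev, so the bdev cannot be unregistered mid-callback):
 *
 *	static int count_bdev(void *ctx, struct spdk_bdev *bdev)
 *	{
 *		(*(int *)ctx)++;
 *		return 0;	// a non-zero return stops the iteration and is returned to the caller
 *	}
 *
 *	int count = 0;
 *	spdk_for_each_bdev(&count, count_bdev);
 */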
9132 
9133 int
9134 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
9135 {
9136 	struct spdk_bdev *bdev, *tmp;
9137 	struct spdk_bdev_desc *desc;
9138 	int rc = 0;
9139 
9140 	assert(fn != NULL);
9141 
9142 	spdk_spin_lock(&g_bdev_mgr.spinlock);
9143 	bdev = spdk_bdev_first_leaf();
9144 	while (bdev != NULL) {
9145 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
9146 		if (rc != 0) {
9147 			break;
9148 		}
9149 		rc = bdev_open(bdev, false, desc);
9150 		if (rc != 0) {
9151 			bdev_desc_free(desc);
9152 			if (rc == -ENODEV) {
9153 				/* Ignore the error and move to the next bdev. */
9154 				rc = 0;
9155 				bdev = spdk_bdev_next_leaf(bdev);
9156 				continue;
9157 			}
9158 			break;
9159 		}
9160 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
9161 
9162 		rc = fn(ctx, bdev);
9163 
9164 		spdk_spin_lock(&g_bdev_mgr.spinlock);
9165 		tmp = spdk_bdev_next_leaf(bdev);
9166 		bdev_close(bdev, desc);
9167 		if (rc != 0) {
9168 			break;
9169 		}
9170 		bdev = tmp;
9171 	}
9172 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
9173 
9174 	return rc;
9175 }
9176 
9177 void
9178 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
9179 {
9180 	struct iovec *iovs;
9181 	int iovcnt;
9182 
9183 	if (bdev_io == NULL) {
9184 		return;
9185 	}
9186 
9187 	switch (bdev_io->type) {
9188 	case SPDK_BDEV_IO_TYPE_READ:
9189 	case SPDK_BDEV_IO_TYPE_WRITE:
9190 	case SPDK_BDEV_IO_TYPE_ZCOPY:
9191 		iovs = bdev_io->u.bdev.iovs;
9192 		iovcnt = bdev_io->u.bdev.iovcnt;
9193 		break;
9194 	default:
9195 		iovs = NULL;
9196 		iovcnt = 0;
9197 		break;
9198 	}
9199 
9200 	if (iovp) {
9201 		*iovp = iovs;
9202 	}
9203 	if (iovcntp) {
9204 		*iovcntp = iovcnt;
9205 	}
9206 }
9207 
9208 void *
9209 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
9210 {
9211 	if (bdev_io == NULL) {
9212 		return NULL;
9213 	}
9214 
9215 	if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
9216 		return NULL;
9217 	}
9218 
9219 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
9220 	    bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
9221 		return bdev_io->u.bdev.md_buf;
9222 	}
9223 
9224 	return NULL;
9225 }
9226 
9227 void *
9228 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
9229 {
9230 	if (bdev_io == NULL) {
9231 		assert(false);
9232 		return NULL;
9233 	}
9234 
9235 	return bdev_io->internal.caller_ctx;
9236 }
9237 
9238 void
9239 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
9240 {
9241 
9242 	if (spdk_bdev_module_list_find(bdev_module->name)) {
9243 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
9244 		assert(false);
9245 	}
9246 
9247 	spdk_spin_init(&bdev_module->internal.spinlock);
9248 	TAILQ_INIT(&bdev_module->internal.quiesced_ranges);
9249 
9250 	/*
9251 	 * Modules with examine callbacks must be initialized first, so they are
9252 	 *  ready to handle examine callbacks from later modules that will
9253 	 *  register physical bdevs.
9254 	 */
9255 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
9256 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9257 	} else {
9258 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9259 	}
9260 }
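/*
 * Illustrative sketch (hypothetical module; fields shown are a minimal subset): bdev modules
 * normally reach this function through the SPDK_BDEV_MODULE_REGISTER macro from
 * spdk/bdev_module.h, which registers the module during startup.
 *
 *	static struct spdk_bdev_module my_module = {
 *		.name = "my_module",
 *		.module_init = my_module_init,
 *		.examine_disk = my_examine_disk,	// having an examine callback puts it at the list head
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(my_module, &my_module)
 */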
9261 
9262 struct spdk_bdev_module *
9263 spdk_bdev_module_list_find(const char *name)
9264 {
9265 	struct spdk_bdev_module *bdev_module;
9266 
9267 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
9268 		if (strcmp(name, bdev_module->name) == 0) {
9269 			break;
9270 		}
9271 	}
9272 
9273 	return bdev_module;
9274 }
9275 
9276 static int
9277 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io)
9278 {
9279 	uint64_t num_blocks;
9280 	void *md_buf = NULL;
9281 
9282 	num_blocks = bdev_io->u.bdev.num_blocks;
9283 
9284 	if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
9285 		md_buf = (char *)g_bdev_mgr.zero_buffer +
9286 			 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
9287 	}
9288 
9289 	return bdev_write_blocks_with_md(bdev_io->internal.desc,
9290 					 spdk_io_channel_from_ctx(bdev_io->internal.ch),
9291 					 g_bdev_mgr.zero_buffer, md_buf,
9292 					 bdev_io->u.bdev.offset_blocks, num_blocks,
9293 					 bdev_write_zero_buffer_done, bdev_io);
9294 }
9295 
9296 static void
9297 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9298 {
9299 	struct spdk_bdev_io *parent_io = cb_arg;
9300 
9301 	spdk_bdev_free_io(bdev_io);
9302 
9303 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
9304 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
9305 }
9306 
9307 static void
9308 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
9309 {
9310 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
9311 	ctx->bdev->internal.qos_mod_in_progress = false;
9312 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9313 
9314 	if (ctx->cb_fn) {
9315 		ctx->cb_fn(ctx->cb_arg, status);
9316 	}
9317 	free(ctx);
9318 }
9319 
9320 static void
9321 bdev_disable_qos_done(void *cb_arg)
9322 {
9323 	struct set_qos_limit_ctx *ctx = cb_arg;
9324 	struct spdk_bdev *bdev = ctx->bdev;
9325 	struct spdk_bdev_qos *qos;
9326 
9327 	spdk_spin_lock(&bdev->internal.spinlock);
9328 	qos = bdev->internal.qos;
9329 	bdev->internal.qos = NULL;
9330 	spdk_spin_unlock(&bdev->internal.spinlock);
9331 
9332 	if (qos->thread != NULL) {
9333 		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
9334 		spdk_poller_unregister(&qos->poller);
9335 	}
9336 
9337 	free(qos);
9338 
9339 	bdev_set_qos_limit_done(ctx, 0);
9340 }
9341 
9342 static void
9343 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
9344 {
9345 	struct set_qos_limit_ctx *ctx = _ctx;
9346 	struct spdk_thread *thread;
9347 
9348 	spdk_spin_lock(&bdev->internal.spinlock);
9349 	thread = bdev->internal.qos->thread;
9350 	spdk_spin_unlock(&bdev->internal.spinlock);
9351 
9352 	if (thread != NULL) {
9353 		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
9354 	} else {
9355 		bdev_disable_qos_done(ctx);
9356 	}
9357 }
9358 
9359 static void
9360 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9361 		     struct spdk_io_channel *ch, void *_ctx)
9362 {
9363 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9364 	struct spdk_bdev_io *bdev_io;
9365 
9366 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
9367 
9368 	while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) {
9369 		/* Re-submit the queued I/O. */
9370 		bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io);
9371 		TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link);
9372 		_bdev_io_submit(bdev_io);
9373 	}
9374 
9375 	spdk_bdev_for_each_channel_continue(i, 0);
9376 }
9377 
9378 static void
9379 bdev_update_qos_rate_limit_msg(void *cb_arg)
9380 {
9381 	struct set_qos_limit_ctx *ctx = cb_arg;
9382 	struct spdk_bdev *bdev = ctx->bdev;
9383 
9384 	spdk_spin_lock(&bdev->internal.spinlock);
9385 	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
9386 	spdk_spin_unlock(&bdev->internal.spinlock);
9387 
9388 	bdev_set_qos_limit_done(ctx, 0);
9389 }
9390 
9391 static void
9392 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9393 		    struct spdk_io_channel *ch, void *_ctx)
9394 {
9395 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9396 
9397 	spdk_spin_lock(&bdev->internal.spinlock);
9398 	bdev_enable_qos(bdev, bdev_ch);
9399 	spdk_spin_unlock(&bdev->internal.spinlock);
9400 	spdk_bdev_for_each_channel_continue(i, 0);
9401 }
9402 
9403 static void
9404 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
9405 {
9406 	struct set_qos_limit_ctx *ctx = _ctx;
9407 
9408 	bdev_set_qos_limit_done(ctx, status);
9409 }
9410 
9411 static void
9412 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
9413 {
9414 	int i;
9415 
9416 	assert(bdev->internal.qos != NULL);
9417 
9418 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9419 		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9420 			bdev->internal.qos->rate_limits[i].limit = limits[i];
9421 
9422 			if (limits[i] == 0) {
9423 				bdev->internal.qos->rate_limits[i].limit =
9424 					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
9425 			}
9426 		}
9427 	}
9428 }
9429 
9430 void
9431 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
9432 			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
9433 {
9434 	struct set_qos_limit_ctx	*ctx;
9435 	uint32_t			limit_set_complement;
9436 	uint64_t			min_limit_per_sec;
9437 	int				i;
9438 	bool				disable_rate_limit = true;
9439 
9440 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9441 		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9442 			continue;
9443 		}
9444 
9445 		if (limits[i] > 0) {
9446 			disable_rate_limit = false;
9447 		}
9448 
9449 		if (bdev_qos_is_iops_rate_limit(i) == true) {
9450 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
9451 		} else {
9452 			if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) {
9453 				SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, "
9454 					     "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC);
9455 				limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC;
9456 			}
9457 			/* Change from megabyte to byte rate limit */
9458 			limits[i] = limits[i] * 1024 * 1024;
9459 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
9460 		}
9461 
9462 		limit_set_complement = limits[i] % min_limit_per_sec;
9463 		if (limit_set_complement) {
9464 			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
9465 				    limits[i], min_limit_per_sec);
9466 			limits[i] += min_limit_per_sec - limit_set_complement;
9467 			SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
9468 		}
9469 	}
9470 
9471 	ctx = calloc(1, sizeof(*ctx));
9472 	if (ctx == NULL) {
9473 		cb_fn(cb_arg, -ENOMEM);
9474 		return;
9475 	}
9476 
9477 	ctx->cb_fn = cb_fn;
9478 	ctx->cb_arg = cb_arg;
9479 	ctx->bdev = bdev;
9480 
9481 	spdk_spin_lock(&bdev->internal.spinlock);
9482 	if (bdev->internal.qos_mod_in_progress) {
9483 		spdk_spin_unlock(&bdev->internal.spinlock);
9484 		free(ctx);
9485 		cb_fn(cb_arg, -EAGAIN);
9486 		return;
9487 	}
9488 	bdev->internal.qos_mod_in_progress = true;
9489 
9490 	if (disable_rate_limit == true && bdev->internal.qos) {
9491 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9492 			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
9493 			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
9494 			     bdev->internal.qos->rate_limits[i].limit !=
9495 			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
9496 				disable_rate_limit = false;
9497 				break;
9498 			}
9499 		}
9500 	}
9501 
9502 	if (disable_rate_limit == false) {
9503 		if (bdev->internal.qos == NULL) {
9504 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
9505 			if (!bdev->internal.qos) {
9506 				spdk_spin_unlock(&bdev->internal.spinlock);
9507 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
9508 				bdev_set_qos_limit_done(ctx, -ENOMEM);
9509 				return;
9510 			}
9511 		}
9512 
9513 		if (bdev->internal.qos->thread == NULL) {
9514 			/* Enabling */
9515 			bdev_set_qos_rate_limits(bdev, limits);
9516 
9517 			spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
9518 						   bdev_enable_qos_done);
9519 		} else {
9520 			/* Updating */
9521 			bdev_set_qos_rate_limits(bdev, limits);
9522 
9523 			spdk_thread_send_msg(bdev->internal.qos->thread,
9524 					     bdev_update_qos_rate_limit_msg, ctx);
9525 		}
9526 	} else {
9527 		if (bdev->internal.qos != NULL) {
9528 			bdev_set_qos_rate_limits(bdev, limits);
9529 
9530 			/* Disabling */
9531 			spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
9532 						   bdev_disable_qos_msg_done);
9533 		} else {
9534 			spdk_spin_unlock(&bdev->internal.spinlock);
9535 			bdev_set_qos_limit_done(ctx, 0);
9536 			return;
9537 		}
9538 	}
9539 
9540 	spdk_spin_unlock(&bdev->internal.spinlock);
9541 }
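/*
 * Illustrative sketch (my_qos_done is a hypothetical callback matching cb_fn; the enum
 * designators are assumed from spdk/bdev.h). Byte limits are given in MB/s and converted
 * above: e.g. 100 MB/s becomes 100 * 1024 * 1024 = 104857600 bytes/s, which is already a
 * multiple of SPDK_BDEV_QOS_MIN_BYTES_PER_SEC, so no rounding is applied. Passing 0 for a
 * slot disables that limit.
 *
 *	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {
 *		[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000,	// 10k IOPS
 *		[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100,	// 100 MB/s
 *		[SPDK_BDEV_QOS_R_BPS_RATE_LIMIT] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
 *		[SPDK_BDEV_QOS_W_BPS_RATE_LIMIT] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
 *	};
 *
 *	spdk_bdev_set_qos_rate_limits(bdev, limits, my_qos_done, NULL);
 */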
9542 
9543 struct spdk_bdev_histogram_ctx {
9544 	spdk_bdev_histogram_status_cb cb_fn;
9545 	void *cb_arg;
9546 	struct spdk_bdev *bdev;
9547 	int status;
9548 };
9549 
9550 static void
9551 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9552 {
9553 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
9554 
9555 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
9556 	ctx->bdev->internal.histogram_in_progress = false;
9557 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9558 	ctx->cb_fn(ctx->cb_arg, ctx->status);
9559 	free(ctx);
9560 }
9561 
9562 static void
9563 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9564 			       struct spdk_io_channel *_ch, void *_ctx)
9565 {
9566 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9567 
9568 	if (ch->histogram != NULL) {
9569 		spdk_histogram_data_free(ch->histogram);
9570 		ch->histogram = NULL;
9571 	}
9572 	spdk_bdev_for_each_channel_continue(i, 0);
9573 }
9574 
9575 static void
9576 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9577 {
9578 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
9579 
9580 	if (status != 0) {
9581 		ctx->status = status;
9582 		ctx->bdev->internal.histogram_enabled = false;
9583 		spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
9584 					   bdev_histogram_disable_channel_cb);
9585 	} else {
9586 		spdk_spin_lock(&ctx->bdev->internal.spinlock);
9587 		ctx->bdev->internal.histogram_in_progress = false;
9588 		spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9589 		ctx->cb_fn(ctx->cb_arg, ctx->status);
9590 		free(ctx);
9591 	}
9592 }
9593 
9594 static void
9595 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9596 			      struct spdk_io_channel *_ch, void *_ctx)
9597 {
9598 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9599 	int status = 0;
9600 
9601 	if (ch->histogram == NULL) {
9602 		ch->histogram = spdk_histogram_data_alloc();
9603 		if (ch->histogram == NULL) {
9604 			status = -ENOMEM;
9605 		}
9606 	}
9607 
9608 	spdk_bdev_for_each_channel_continue(i, status);
9609 }
9610 
9611 void
9612 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
9613 			       void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts)
9614 {
9615 	struct spdk_bdev_histogram_ctx *ctx;
9616 
9617 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
9618 	if (ctx == NULL) {
9619 		cb_fn(cb_arg, -ENOMEM);
9620 		return;
9621 	}
9622 
9623 	ctx->bdev = bdev;
9624 	ctx->status = 0;
9625 	ctx->cb_fn = cb_fn;
9626 	ctx->cb_arg = cb_arg;
9627 
9628 	spdk_spin_lock(&bdev->internal.spinlock);
9629 	if (bdev->internal.histogram_in_progress) {
9630 		spdk_spin_unlock(&bdev->internal.spinlock);
9631 		free(ctx);
9632 		cb_fn(cb_arg, -EAGAIN);
9633 		return;
9634 	}
9635 
9636 	bdev->internal.histogram_in_progress = true;
9637 	spdk_spin_unlock(&bdev->internal.spinlock);
9638 
9639 	bdev->internal.histogram_enabled = enable;
9640 	bdev->internal.histogram_io_type = opts->io_type;
9641 
9642 	if (enable) {
9643 		/* Allocate histogram for each channel */
9644 		spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
9645 					   bdev_histogram_enable_channel_cb);
9646 	} else {
9647 		spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
9648 					   bdev_histogram_disable_channel_cb);
9649 	}
9650 }
9651 
9652 void
9653 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size)
9654 {
9655 	if (opts == NULL) {
9656 		SPDK_ERRLOG("opts should not be NULL\n");
9657 		assert(opts != NULL);
9658 		return;
9659 	}
9660 	if (size == 0) {
9661 		SPDK_ERRLOG("size should not be zero\n");
9662 		assert(size != 0);
9663 		return;
9664 	}
9665 
9666 	memset(opts, 0, size);
9667 	opts->size = size;
9668 
9669 #define FIELD_OK(field) \
9670         offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size
9671 
9672 #define SET_FIELD(field, value) \
9673         if (FIELD_OK(field)) { \
9674                 opts->field = value; \
9675         } \
9676 
9677 	SET_FIELD(io_type, 0);
9678 
9679 	/* Do not remove this statement. Update the assert whenever you add a new field,
9680 	 * and also add a corresponding SET_FIELD statement. */
9681 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size");
9682 
9683 #undef FIELD_OK
9684 #undef SET_FIELD
9685 }
9686 
9687 void
9688 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
9689 			   void *cb_arg, bool enable)
9690 {
9691 	struct spdk_bdev_enable_histogram_opts opts;
9692 
9693 	spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts));
9694 	spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts);
9695 }
9696 
9697 struct spdk_bdev_histogram_data_ctx {
9698 	spdk_bdev_histogram_data_cb cb_fn;
9699 	void *cb_arg;
9700 	struct spdk_bdev *bdev;
9701 	/** merged histogram data from all channels */
9702 	struct spdk_histogram_data	*histogram;
9703 };
9704 
9705 static void
9706 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9707 {
9708 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
9709 
9710 	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
9711 	free(ctx);
9712 }
9713 
9714 static void
9715 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9716 			   struct spdk_io_channel *_ch, void *_ctx)
9717 {
9718 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9719 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
9720 	int status = 0;
9721 
9722 	if (ch->histogram == NULL) {
9723 		status = -EFAULT;
9724 	} else {
9725 		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
9726 	}
9727 
9728 	spdk_bdev_for_each_channel_continue(i, status);
9729 }
9730 
9731 void
9732 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
9733 			spdk_bdev_histogram_data_cb cb_fn,
9734 			void *cb_arg)
9735 {
9736 	struct spdk_bdev_histogram_data_ctx *ctx;
9737 
9738 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
9739 	if (ctx == NULL) {
9740 		cb_fn(cb_arg, -ENOMEM, NULL);
9741 		return;
9742 	}
9743 
9744 	ctx->bdev = bdev;
9745 	ctx->cb_fn = cb_fn;
9746 	ctx->cb_arg = cb_arg;
9747 
9748 	ctx->histogram = histogram;
9749 
9750 	spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
9751 				   bdev_histogram_get_channel_cb);
9752 }
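/*
 * Illustrative sketch of collecting a merged histogram (my_hist_cb is a hypothetical
 * callback matching spdk_bdev_histogram_data_cb; the histogram must outlive the call):
 *
 *	// Per-channel histograms must have been enabled first, e.g. via
 *	// spdk_bdev_histogram_enable(bdev, my_enable_cb, NULL, true).
 *	struct spdk_histogram_data *histogram = spdk_histogram_data_alloc();
 *
 *	spdk_bdev_histogram_get(bdev, histogram, my_hist_cb, NULL);
 *	// my_hist_cb receives the merged data; free it with spdk_histogram_data_free() when done.
 */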
9753 
9754 void
9755 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
9756 				void *cb_arg)
9757 {
9758 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9759 	int status = 0;
9760 
9761 	assert(cb_fn != NULL);
9762 
9763 	if (bdev_ch->histogram == NULL) {
9764 		status = -EFAULT;
9765 	}
9766 	cb_fn(cb_arg, status, bdev_ch->histogram);
9767 }
9768 
9769 size_t
9770 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
9771 			   size_t max_events)
9772 {
9773 	struct media_event_entry *entry;
9774 	size_t num_events = 0;
9775 
9776 	for (; num_events < max_events; ++num_events) {
9777 		entry = TAILQ_FIRST(&desc->pending_media_events);
9778 		if (entry == NULL) {
9779 			break;
9780 		}
9781 
9782 		events[num_events] = entry->event;
9783 		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
9784 		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
9785 	}
9786 
9787 	return num_events;
9788 }
9789 
9790 int
9791 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
9792 			    size_t num_events)
9793 {
9794 	struct spdk_bdev_desc *desc;
9795 	struct media_event_entry *entry;
9796 	size_t event_id;
9797 	int rc = 0;
9798 
9799 	assert(bdev->media_events);
9800 
9801 	spdk_spin_lock(&bdev->internal.spinlock);
9802 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9803 		if (desc->write) {
9804 			break;
9805 		}
9806 	}
9807 
9808 	if (desc == NULL || desc->media_events_buffer == NULL) {
9809 		rc = -ENODEV;
9810 		goto out;
9811 	}
9812 
9813 	for (event_id = 0; event_id < num_events; ++event_id) {
9814 		entry = TAILQ_FIRST(&desc->free_media_events);
9815 		if (entry == NULL) {
9816 			break;
9817 		}
9818 
9819 		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
9820 		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
9821 		entry->event = events[event_id];
9822 	}
9823 
9824 	rc = event_id;
9825 out:
9826 	spdk_spin_unlock(&bdev->internal.spinlock);
9827 	return rc;
9828 }
9829 
9830 static void
9831 _media_management_notify(void *arg)
9832 {
9833 	struct spdk_bdev_desc *desc = arg;
9834 
9835 	_event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
9836 }
9837 
9838 void
9839 spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
9840 {
9841 	struct spdk_bdev_desc *desc;
9842 
9843 	spdk_spin_lock(&bdev->internal.spinlock);
9844 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9845 		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
9846 			event_notify(desc, _media_management_notify);
9847 		}
9848 	}
9849 	spdk_spin_unlock(&bdev->internal.spinlock);
9850 }
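/*
 * Illustrative producer/consumer sketch for media events (array sizes are arbitrary):
 *
 *	// Producer (bdev module): queue events on the writable descriptor, then notify.
 *	struct spdk_bdev_media_event events[2];	// filled in by the module
 *	int num = spdk_bdev_push_media_events(bdev, events, 2);
 *	if (num > 0) {
 *		spdk_bdev_notify_media_management(bdev);
 *	}
 *
 *	// Consumer (descriptor owner): drain events from its SPDK_BDEV_EVENT_MEDIA_MANAGEMENT handler.
 *	struct spdk_bdev_media_event buf[8];
 *	size_t n = spdk_bdev_get_media_events(desc, buf, 8);
 */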
9851 
9852 struct locked_lba_range_ctx {
9853 	struct lba_range		range;
9854 	struct lba_range		*current_range;
9855 	struct lba_range		*owner_range;
9856 	struct spdk_poller		*poller;
9857 	lock_range_cb			cb_fn;
9858 	void				*cb_arg;
9859 };
9860 
9861 static void
9862 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9863 {
9864 	struct locked_lba_range_ctx *ctx = _ctx;
9865 
9866 	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
9867 	free(ctx);
9868 }
9869 
9870 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
9871 		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
9872 
9873 static void
9874 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9875 {
9876 	struct locked_lba_range_ctx *ctx = _ctx;
9877 
9878 	if (status == -ENOMEM) {
9879 		/* One of the channels could not allocate a range object.
9880 		 * So we have to go back and clean up any ranges that were
9881 		 * allocated successfully before we return error status to
9882 		 * the caller.  We can reuse the unlock function to do that
9883 		 * clean up.
9884 		 */
9885 		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9886 					   bdev_lock_error_cleanup_cb);
9887 		return;
9888 	}
9889 
9890 	/* All channels have locked this range and no I/O overlapping the range
9891 	 * is outstanding!  Set the owner_ch for the range object for the
9892 	 * locking channel, so that this channel will know that it is allowed
9893 	 * to write to this range.
9894 	 */
9895 	if (ctx->owner_range != NULL) {
9896 		ctx->owner_range->owner_ch = ctx->range.owner_ch;
9897 	}
9898 
9899 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9900 
9901 	/* Don't free the ctx here.  Its range is in the bdev's global list of
9902 	 * locked ranges still, and will be removed and freed when this range
9903 	 * is later unlocked.
9904 	 */
9905 }
9906 
9907 static int
9908 bdev_lock_lba_range_check_io(void *_i)
9909 {
9910 	struct spdk_bdev_channel_iter *i = _i;
9911 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
9912 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9913 	struct locked_lba_range_ctx *ctx = i->ctx;
9914 	struct lba_range *range = ctx->current_range;
9915 	struct spdk_bdev_io *bdev_io;
9916 
9917 	spdk_poller_unregister(&ctx->poller);
9918 
9919 	/* The range is now in the locked_ranges, so no new IO can be submitted to this
9920 	 * range.  But we need to wait until all outstanding IO overlapping with this range
9921 	 * has completed.
9922 	 */
9923 	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
9924 		if (bdev_io_range_is_locked(bdev_io, range)) {
9925 			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
9926 			return SPDK_POLLER_BUSY;
9927 		}
9928 	}
9929 
9930 	spdk_bdev_for_each_channel_continue(i, 0);
9931 	return SPDK_POLLER_BUSY;
9932 }
9933 
9934 static void
9935 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9936 				struct spdk_io_channel *_ch, void *_ctx)
9937 {
9938 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9939 	struct locked_lba_range_ctx *ctx = _ctx;
9940 	struct lba_range *range;
9941 
9942 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9943 		if (range->length == ctx->range.length &&
9944 		    range->offset == ctx->range.offset &&
9945 		    range->locked_ctx == ctx->range.locked_ctx) {
9946 			/* This range already exists on this channel, so don't add
9947 			 * it again.  This can happen when a new channel is created
9948 			 * while the for_each_channel operation is in progress.
9949 			 * Do not check for outstanding I/O in that case, since the
9950 			 * range was locked before any I/O could be submitted to the
9951 			 * new channel.
9952 			 */
9953 			spdk_bdev_for_each_channel_continue(i, 0);
9954 			return;
9955 		}
9956 	}
9957 
9958 	range = calloc(1, sizeof(*range));
9959 	if (range == NULL) {
9960 		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
9961 		return;
9962 	}
9963 
9964 	range->length = ctx->range.length;
9965 	range->offset = ctx->range.offset;
9966 	range->locked_ctx = ctx->range.locked_ctx;
9967 	range->quiesce = ctx->range.quiesce;
9968 	ctx->current_range = range;
9969 	if (ctx->range.owner_ch == ch) {
9970 		/* This is the range object for the channel that will hold
9971 		 * the lock.  Store it in the ctx object so that we can easily
9972 		 * set its owner_ch after the lock is finally acquired.
9973 		 */
9974 		ctx->owner_range = range;
9975 	}
9976 	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
9977 	bdev_lock_lba_range_check_io(i);
9978 }
9979 
9980 static void
9981 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
9982 {
9983 	assert(spdk_get_thread() == ctx->range.owner_thread);
9984 	assert(ctx->range.owner_ch == NULL ||
9985 	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);
9986 
9987 	/* We will add a copy of this range to each channel now. */
9988 	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
9989 				   bdev_lock_lba_range_cb);
9990 }
9991 
9992 static bool
9993 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
9994 {
9995 	struct lba_range *r;
9996 
9997 	TAILQ_FOREACH(r, tailq, tailq) {
9998 		if (bdev_lba_range_overlapped(range, r)) {
9999 			return true;
10000 		}
10001 	}
10002 	return false;
10003 }
10004 
10005 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status);
10006 
10007 static int
10008 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
10009 		     uint64_t offset, uint64_t length,
10010 		     lock_range_cb cb_fn, void *cb_arg)
10011 {
10012 	struct locked_lba_range_ctx *ctx;
10013 
10014 	ctx = calloc(1, sizeof(*ctx));
10015 	if (ctx == NULL) {
10016 		return -ENOMEM;
10017 	}
10018 
10019 	ctx->range.offset = offset;
10020 	ctx->range.length = length;
10021 	ctx->range.owner_thread = spdk_get_thread();
10022 	ctx->range.owner_ch = ch;
10023 	ctx->range.locked_ctx = cb_arg;
10024 	ctx->range.bdev = bdev;
10025 	ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked);
10026 	ctx->cb_fn = cb_fn;
10027 	ctx->cb_arg = cb_arg;
10028 
10029 	spdk_spin_lock(&bdev->internal.spinlock);
10030 	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
10031 		/* There is an active lock overlapping with this range.
10032 		 * Put it on the pending list until this range no
10033 		 * longer overlaps with another.
10034 		 */
10035 		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
10036 	} else {
10037 		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
10038 		bdev_lock_lba_range_ctx(bdev, ctx);
10039 	}
10040 	spdk_spin_unlock(&bdev->internal.spinlock);
10041 	return 0;
10042 }
10043 
10044 static int
10045 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10046 		    uint64_t offset, uint64_t length,
10047 		    lock_range_cb cb_fn, void *cb_arg)
10048 {
10049 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10050 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10051 
10052 	if (cb_arg == NULL) {
10053 		SPDK_ERRLOG("cb_arg must not be NULL\n");
10054 		return -EINVAL;
10055 	}
10056 
10057 	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
10058 }
10059 
10060 static void
10061 bdev_lock_lba_range_ctx_msg(void *_ctx)
10062 {
10063 	struct locked_lba_range_ctx *ctx = _ctx;
10064 
10065 	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
10066 }
10067 
10068 static void
10069 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10070 {
10071 	struct locked_lba_range_ctx *ctx = _ctx;
10072 	struct locked_lba_range_ctx *pending_ctx;
10073 	struct lba_range *range, *tmp;
10074 
10075 	spdk_spin_lock(&bdev->internal.spinlock);
10076 	/* Check if there are any pending locked ranges that overlap with this range
10077 	 * that was just unlocked.  If there are, check that the pending range does not overlap
10078 	 * with any other locked range before calling bdev_lock_lba_range_ctx, which will start
10079 	 * the lock process.
10080 	 */
10081 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
10082 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
10083 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
10084 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
10085 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10086 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
10087 			spdk_thread_send_msg(pending_ctx->range.owner_thread,
10088 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
10089 		}
10090 	}
10091 	spdk_spin_unlock(&bdev->internal.spinlock);
10092 
10093 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
10094 	free(ctx);
10095 }
10096 
10097 static void
10098 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10099 				  struct spdk_io_channel *_ch, void *_ctx)
10100 {
10101 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10102 	struct locked_lba_range_ctx *ctx = _ctx;
10103 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
10104 	struct spdk_bdev_io *bdev_io;
10105 	struct lba_range *range;
10106 
10107 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10108 		if (ctx->range.offset == range->offset &&
10109 		    ctx->range.length == range->length &&
10110 		    ctx->range.locked_ctx == range->locked_ctx) {
10111 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
10112 			free(range);
10113 			break;
10114 		}
10115 	}
10116 
10117 	/* Note: we should almost always be able to assert that the range specified
10118 	 * was found.  But there are some very rare corner cases where a new channel
10119 	 * gets created simultaneously with a range unlock, where this function
10120 	 * would execute on that new channel and wouldn't have the range.
10121 	 * We also use this to clean up range allocations when a later allocation
10122 	 * fails in the locking path.
10123 	 * So we can't actually assert() here.
10124 	 */
10125 
10126 	/* Swap the locked IO into a temporary list, and then try to submit them again.
10127 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
10128 	 * with the range that was just unlocked, but this isn't a performance path so
10129 	 * we go for simplicity here.
10130 	 */
10131 	TAILQ_INIT(&io_locked);
10132 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
10133 	while (!TAILQ_EMPTY(&io_locked)) {
10134 		bdev_io = TAILQ_FIRST(&io_locked);
10135 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
10136 		bdev_io_submit(bdev_io);
10137 	}
10138 
10139 	spdk_bdev_for_each_channel_continue(i, 0);
10140 }
10141 
10142 static int
10143 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
10144 		       lock_range_cb cb_fn, void *cb_arg)
10145 {
10146 	struct locked_lba_range_ctx *ctx;
10147 	struct lba_range *range;
10148 
10149 	spdk_spin_lock(&bdev->internal.spinlock);
10150 	/* To start the unlock process, we find the range in the bdev's locked_ranges
10151 	 * and remove it. This ensures new channels don't inherit the locked range.
10152 	 * Then we will send a message to each channel to remove the range from its
10153 	 * per-channel list.
10154 	 */
10155 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
10156 		if (range->offset == offset && range->length == length &&
10157 		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
10158 			break;
10159 		}
10160 	}
10161 	if (range == NULL) {
10162 		assert(false);
10163 		spdk_spin_unlock(&bdev->internal.spinlock);
10164 		return -EINVAL;
10165 	}
10166 	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
10167 	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10168 	spdk_spin_unlock(&bdev->internal.spinlock);
10169 
10170 	ctx->cb_fn = cb_fn;
10171 	ctx->cb_arg = cb_arg;
10172 
10173 	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
10174 				   bdev_unlock_lba_range_cb);
10175 	return 0;
10176 }
10177 
10178 static int
10179 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10180 		      uint64_t offset, uint64_t length,
10181 		      lock_range_cb cb_fn, void *cb_arg)
10182 {
10183 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10184 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10185 	struct lba_range *range;
10186 	bool range_found = false;
10187 
10188 	/* Let's make sure the specified channel actually has a lock on
10189 	 * the specified range.  Note that the range must match exactly.
10190 	 */
10191 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10192 		if (range->offset == offset && range->length == length &&
10193 		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
10194 			range_found = true;
10195 			break;
10196 		}
10197 	}
10198 
10199 	if (!range_found) {
10200 		return -EINVAL;
10201 	}
10202 
10203 	return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
10204 }
10205 
10206 struct bdev_quiesce_ctx {
10207 	spdk_bdev_quiesce_cb cb_fn;
10208 	void *cb_arg;
10209 };
10210 
10211 static void
10212 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
10213 {
10214 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10215 
10216 	if (quiesce_ctx->cb_fn != NULL) {
10217 		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10218 	}
10219 
10220 	free(quiesce_ctx);
10221 }
10222 
10223 static void
10224 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
10225 {
10226 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10227 	struct spdk_bdev_module *module = range->bdev->module;
10228 
10229 	if (status != 0) {
10230 		if (quiesce_ctx->cb_fn != NULL) {
10231 			quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10232 		}
10233 		free(quiesce_ctx);
10234 		return;
10235 	}
10236 
10237 	spdk_spin_lock(&module->internal.spinlock);
10238 	TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
10239 	spdk_spin_unlock(&module->internal.spinlock);
10240 
10241 	if (quiesce_ctx->cb_fn != NULL) {
10242 		/* copy the context in case the range is unlocked by the callback */
10243 		struct bdev_quiesce_ctx tmp = *quiesce_ctx;
10244 
10245 		quiesce_ctx->cb_fn = NULL;
10246 		quiesce_ctx->cb_arg = NULL;
10247 
10248 		tmp.cb_fn(tmp.cb_arg, status);
10249 	}
10250 	/* quiesce_ctx will be freed on unquiesce */
10251 }
10252 
10253 static int
10254 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10255 		   uint64_t offset, uint64_t length,
10256 		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
10257 		   bool unquiesce)
10258 {
10259 	struct bdev_quiesce_ctx *quiesce_ctx;
10260 	int rc;
10261 
10262 	if (module != bdev->module) {
10263 		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
10264 		return -EINVAL;
10265 	}
10266 
10267 	if (!bdev_io_valid_blocks(bdev, offset, length)) {
10268 		return -EINVAL;
10269 	}
10270 
10271 	if (unquiesce) {
10272 		struct lba_range *range;
10273 
10274 		/* Make sure the specified range is actually quiesced in the specified module and
10275 		 * then remove it from the list. Note that the range must match exactly.
10276 		 */
10277 		spdk_spin_lock(&module->internal.spinlock);
10278 		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
10279 			if (range->bdev == bdev && range->offset == offset && range->length == length) {
10280 				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
10281 				break;
10282 			}
10283 		}
10284 		spdk_spin_unlock(&module->internal.spinlock);
10285 
10286 		if (range == NULL) {
10287 			SPDK_ERRLOG("The range to unquiesce was not found.\n");
10288 			return -EINVAL;
10289 		}
10290 
10291 		quiesce_ctx = range->locked_ctx;
10292 		quiesce_ctx->cb_fn = cb_fn;
10293 		quiesce_ctx->cb_arg = cb_arg;
10294 
10295 		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
10296 	} else {
10297 		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
10298 		if (quiesce_ctx == NULL) {
10299 			return -ENOMEM;
10300 		}
10301 
10302 		quiesce_ctx->cb_fn = cb_fn;
10303 		quiesce_ctx->cb_arg = cb_arg;
10304 
10305 		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
10306 		if (rc != 0) {
10307 			free(quiesce_ctx);
10308 		}
10309 	}
10310 
10311 	return rc;
10312 }
10313 
10314 int
10315 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10316 		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10317 {
10318 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
10319 }
10320 
10321 int
10322 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10323 		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10324 {
10325 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
10326 }
10327 
10328 int
10329 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10330 			uint64_t offset, uint64_t length,
10331 			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10332 {
10333 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
10334 }
10335 
10336 int
10337 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10338 			  uint64_t offset, uint64_t length,
10339 			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10340 {
10341 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
10342 }
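
/*
 * Example usage (illustrative sketch only; "my_module", "bdev" and the callbacks
 * are hypothetical): a bdev module pauses all I/O to its bdev, performs some
 * maintenance, and then resumes I/O.
 *
 *	static void
 *	unquiesce_done(void *ctx, int status)
 *	{
 *		// I/O to the bdev flows again
 *	}
 *
 *	static void
 *	quiesce_done(void *ctx, int status)
 *	{
 *		if (status == 0) {
 *			// outstanding I/O has completed and new I/O is held back;
 *			// do the maintenance work, then resume:
 *			spdk_bdev_unquiesce(bdev, &my_module, unquiesce_done, NULL);
 *		}
 *	}
 *
 *	rc = spdk_bdev_quiesce(bdev, &my_module, quiesce_done, NULL);
 */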
10343 
10344 int
10345 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
10346 			     int array_size)
10347 {
10348 	if (!bdev) {
10349 		return -EINVAL;
10350 	}
10351 
10352 	if (bdev->fn_table->get_memory_domains) {
10353 		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
10354 	}
10355 
10356 	return 0;
10357 }
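
/*
 * Example usage (illustrative sketch; variable names are hypothetical and this
 * assumes the usual convention that the return value is the total number of
 * memory domains, which may exceed the array size passed in):
 *
 *	int cnt = spdk_bdev_get_memory_domains(bdev, NULL, 0);
 *
 *	if (cnt > 0) {
 *		struct spdk_memory_domain **domains = calloc(cnt, sizeof(*domains));
 *
 *		if (domains != NULL) {
 *			spdk_bdev_get_memory_domains(bdev, domains, cnt);
 *			// inspect domains[0..cnt-1], then release the array
 *			free(domains);
 *		}
 *	}
 */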
10358 
10359 struct spdk_bdev_for_each_io_ctx {
10360 	void *ctx;
10361 	spdk_bdev_io_fn fn;
10362 	spdk_bdev_for_each_io_cb cb;
10363 };
10364 
10365 static void
10366 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10367 			 struct spdk_io_channel *io_ch, void *_ctx)
10368 {
10369 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10370 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
10371 	struct spdk_bdev_io *bdev_io;
10372 	int rc = 0;
10373 
10374 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
10375 		rc = ctx->fn(ctx->ctx, bdev_io);
10376 		if (rc != 0) {
10377 			break;
10378 		}
10379 	}
10380 
10381 	spdk_bdev_for_each_channel_continue(i, rc);
10382 }
10383 
10384 static void
10385 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
10386 {
10387 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10388 
10389 	ctx->cb(ctx->ctx, status);
10390 
10391 	free(ctx);
10392 }
10393 
10394 void
10395 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
10396 			   spdk_bdev_for_each_io_cb cb)
10397 {
10398 	struct spdk_bdev_for_each_io_ctx *ctx;
10399 
10400 	assert(fn != NULL && cb != NULL);
10401 
10402 	ctx = calloc(1, sizeof(*ctx));
10403 	if (ctx == NULL) {
10404 		SPDK_ERRLOG("Failed to allocate context.\n");
10405 		cb(_ctx, -ENOMEM);
10406 		return;
10407 	}
10408 
10409 	ctx->ctx = _ctx;
10410 	ctx->fn = fn;
10411 	ctx->cb = cb;
10412 
10413 	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
10414 				   bdev_for_each_io_done);
10415 }
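
/*
 * Example usage (illustrative sketch; the callbacks and the counter are
 * hypothetical): count every I/O currently submitted to a bdev.
 *
 *	static uint64_t g_io_count;
 *
 *	static int
 *	count_io(void *ctx, struct spdk_bdev_io *bdev_io)
 *	{
 *		(*(uint64_t *)ctx)++;
 *		return 0;	// a non-zero return value stops the iteration
 *	}
 *
 *	static void
 *	count_io_done(void *ctx, int status)
 *	{
 *		SPDK_NOTICELOG("%" PRIu64 " I/Os in flight\n", *(uint64_t *)ctx);
 *	}
 *
 *	spdk_bdev_for_each_bdev_io(bdev, &g_io_count, count_io, count_io_done);
 */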
10416 
10417 void
10418 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
10419 {
10420 	spdk_for_each_channel_continue(iter->i, status);
10421 }
10422 
10423 static struct spdk_bdev *
10424 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
10425 {
10426 	void *io_device = spdk_io_channel_iter_get_io_device(i);
10427 
10428 	return __bdev_from_io_dev(io_device);
10429 }
10430 
10431 static void
10432 bdev_each_channel_msg(struct spdk_io_channel_iter *i)
10433 {
10434 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10435 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10436 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
10437 
10438 	iter->i = i;
10439 	iter->fn(iter, bdev, ch, iter->ctx);
10440 }
10441 
10442 static void
10443 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
10444 {
10445 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10446 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10447 
10448 	iter->i = i;
10449 	iter->cpl(bdev, iter->ctx, status);
10450 
10451 	free(iter);
10452 }
10453 
10454 void
10455 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
10456 			   void *ctx, spdk_bdev_for_each_channel_done cpl)
10457 {
10458 	struct spdk_bdev_channel_iter *iter;
10459 
10460 	assert(bdev != NULL && fn != NULL && ctx != NULL);
10461 
10462 	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
10463 	if (iter == NULL) {
10464 		SPDK_ERRLOG("Unable to allocate iterator\n");
10465 		assert(false);
10466 		return;
10467 	}
10468 
10469 	iter->fn = fn;
10470 	iter->cpl = cpl;
10471 	iter->ctx = ctx;
10472 
10473 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
10474 			      iter, bdev_each_channel_cpl);
10475 }
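
/*
 * Example usage (illustrative sketch; the callbacks and "my_ctx" are hypothetical):
 * visit every I/O channel of a bdev, doing the per-channel work on the thread that
 * owns each channel. Note that a non-NULL context is required.
 *
 *	static void
 *	flush_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *		      struct spdk_io_channel *ch, void *ctx)
 *	{
 *		// per-channel work runs on the channel's thread; when done,
 *		// continue the iteration (a non-zero status aborts it)
 *		spdk_bdev_for_each_channel_continue(i, 0);
 *	}
 *
 *	static void
 *	flush_done(struct spdk_bdev *bdev, void *ctx, int status)
 *	{
 *		// called once after all channels have been visited
 *	}
 *
 *	spdk_bdev_for_each_channel(bdev, flush_channel, my_ctx, flush_done);
 */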
10476 
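/* The helpers below emulate SPDK_BDEV_IO_TYPE_COPY for bdevs that do not support
 * it natively: a bounce buffer covering the whole request is obtained, the source
 * range is read into it, and the buffer is then written to the destination range.
 */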
10477 static void
10478 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10479 {
10480 	struct spdk_bdev_io *parent_io = cb_arg;
10481 
10482 	spdk_bdev_free_io(bdev_io);
10483 
10484 	/* Check return status of write */
10485 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
10486 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
10487 }
10488 
10489 static void
10490 bdev_copy_do_write(void *_bdev_io)
10491 {
10492 	struct spdk_bdev_io *bdev_io = _bdev_io;
10493 	int rc;
10494 
10495 	/* Write blocks */
10496 	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
10497 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
10498 					    bdev_io->u.bdev.iovs[0].iov_base,
10499 					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
10500 					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);
10501 
10502 	if (rc == -ENOMEM) {
10503 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
10504 	} else if (rc != 0) {
10505 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10506 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10507 	}
10508 }
10509 
10510 static void
10511 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10512 {
10513 	struct spdk_bdev_io *parent_io = cb_arg;
10514 
10515 	spdk_bdev_free_io(bdev_io);
10516 
10517 	/* Check return status of read */
10518 	if (!success) {
10519 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10520 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
10521 		return;
10522 	}
10523 
10524 	/* Do write */
10525 	bdev_copy_do_write(parent_io);
10526 }
10527 
10528 static void
10529 bdev_copy_do_read(void *_bdev_io)
10530 {
10531 	struct spdk_bdev_io *bdev_io = _bdev_io;
10532 	int rc;
10533 
10534 	/* Read blocks */
10535 	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
10536 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
10537 					   bdev_io->u.bdev.iovs[0].iov_base,
10538 					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
10539 					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);
10540 
10541 	if (rc == -ENOMEM) {
10542 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
10543 	} else if (rc != 0) {
10544 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10545 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10546 	}
10547 }
10548 
10549 static void
10550 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
10551 {
10552 	if (!success) {
10553 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10554 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10555 		return;
10556 	}
10557 
10558 	bdev_copy_do_read(bdev_io);
10559 }
10560 
10561 int
10562 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
10563 		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
10564 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
10565 {
10566 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10567 	struct spdk_bdev_io *bdev_io;
10568 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
10569 
10570 	if (!desc->write) {
10571 		return -EBADF;
10572 	}
10573 
10574 	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
10575 	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
10576 		SPDK_DEBUGLOG(bdev,
10577 			      "Invalid offset or number of blocks: dst %" PRIu64 ", src %" PRIu64 ", count %" PRIu64 "\n",
10578 			      dst_offset_blocks, src_offset_blocks, num_blocks);
10579 		return -EINVAL;
10580 	}
10581 
10582 	bdev_io = bdev_channel_get_io(channel);
10583 	if (!bdev_io) {
10584 		return -ENOMEM;
10585 	}
10586 
10587 	bdev_io->internal.ch = channel;
10588 	bdev_io->internal.desc = desc;
10589 	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
10590 
10591 	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
10592 	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
10593 	bdev_io->u.bdev.num_blocks = num_blocks;
10594 	bdev_io->u.bdev.memory_domain = NULL;
10595 	bdev_io->u.bdev.memory_domain_ctx = NULL;
10596 	bdev_io->u.bdev.iovs = NULL;
10597 	bdev_io->u.bdev.iovcnt = 0;
10598 	bdev_io->u.bdev.md_buf = NULL;
10599 	bdev_io->u.bdev.accel_sequence = NULL;
10600 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
10601 
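	/* A copy whose source and destination offsets match, or whose length is zero,
	 * is a no-op: complete it immediately without submitting any I/O.
	 */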
10602 	if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) {
10603 		spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
10604 		return 0;
10605 	}
10606 
10608 	/* If the copy size is large and should be split, use the generic split logic
10609 	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported.
10610 	 *
10611 	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported, or
10612 	 * emulate it using regular read and write requests.
10613 	 */
10614 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
10615 	    bdev_io->internal.f.split) {
10616 		bdev_io_submit(bdev_io);
10617 		return 0;
10618 	}
10619 
10620 	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));
10621 
10622 	return 0;
10623 }
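
/*
 * Example usage (illustrative sketch; the descriptor, channel and callback are
 * hypothetical): copy 16 blocks starting at block 0 to block 1024 on the same bdev.
 *
 *	static void
 *	copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_copy_blocks(desc, io_ch, 1024, 0, 16, copy_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// no spdk_bdev_io was available; retry later, e.g. via
 *		// spdk_bdev_queue_io_wait()
 *	}
 */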
10624 
10625 SPDK_LOG_REGISTER_COMPONENT(bdev)
10626 
10627 static void
10628 bdev_trace(void)
10629 {
10630 	struct spdk_trace_tpoint_opts opts[] = {
10631 		{
10632 			"BDEV_IO_START", TRACE_BDEV_IO_START,
10633 			OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1,
10634 			{
10635 				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
10636 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
10637 				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
10638 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
10639 			}
10640 		},
10641 		{
10642 			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
10643 			OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0,
10644 			{
10645 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
10646 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
10647 			}
10648 		},
10649 		{
10650 			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
10651 			OWNER_TYPE_BDEV, OBJECT_NONE, 0,
10652 			{
10653 				{ "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
10654 			}
10655 		},
10656 		{
10657 			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
10658 			OWNER_TYPE_BDEV, OBJECT_NONE, 0,
10659 			{
10660 				{ "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
10661 			}
10662 		},
10663 	};
10664 
10666 	spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b');
10667 	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
10668 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
10669 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
10670 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
10671 	spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0);
10672 	spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0);
10673 	spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0);
10674 	spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0);
10675 }
10676 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
10677