xref: /spdk/lib/bdev/bdev.c (revision 52a4134875252629d5d87a15dc337c6bfe0b3746)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 
11 #include "spdk/accel.h"
12 #include "spdk/config.h"
13 #include "spdk/env.h"
14 #include "spdk/thread.h"
15 #include "spdk/likely.h"
16 #include "spdk/queue.h"
17 #include "spdk/nvme_spec.h"
18 #include "spdk/scsi_spec.h"
19 #include "spdk/notify.h"
20 #include "spdk/util.h"
21 #include "spdk/trace.h"
22 #include "spdk/dma.h"
23 
24 #include "spdk/bdev_module.h"
25 #include "spdk/log.h"
26 #include "spdk/string.h"
27 
28 #include "bdev_internal.h"
29 #include "spdk_internal/trace_defs.h"
30 #include "spdk_internal/assert.h"
31 
32 #ifdef SPDK_CONFIG_VTUNE
33 #include "ittnotify.h"
34 #include "ittnotify_types.h"
35 int __itt_init_ittlib(const char *, __itt_group_id);
36 #endif
37 
38 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
39 #define SPDK_BDEV_IO_CACHE_SIZE			256
40 #define SPDK_BDEV_AUTO_EXAMINE			true
41 #define BUF_SMALL_CACHE_SIZE			128
42 #define BUF_LARGE_CACHE_SIZE			16
43 #define NOMEM_THRESHOLD_COUNT			8
44 
45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
53 
54 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command
55  * when splitting into children requests at a time.
56  */
57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
59 
60 /* The maximum number of children requests for a COPY command
61  * when splitting into children requests at a time.
62  */
63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
64 
65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
66 	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
67 #ifdef DEBUG
68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
69 	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
70 #else
71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
72 #endif
73 
74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
75 				const char *detail, struct spdk_bdev *bdev);
76 
77 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
78 				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
79 				    };
80 
81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
82 
83 RB_HEAD(bdev_name_tree, spdk_bdev_name);
84 
85 static int
86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
87 {
88 	return strcmp(name1->name, name2->name);
89 }
90 
91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
92 
93 struct spdk_bdev_mgr {
94 	struct spdk_mempool *bdev_io_pool;
95 
96 	void *zero_buffer;
97 
98 	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
99 
100 	struct spdk_bdev_list bdevs;
101 	struct bdev_name_tree bdev_names;
102 
103 	bool init_complete;
104 	bool module_init_complete;
105 
106 	struct spdk_spinlock spinlock;
107 
108 	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;
109 
110 #ifdef SPDK_CONFIG_VTUNE
111 	__itt_domain	*domain;
112 #endif
113 };
114 
115 static struct spdk_bdev_mgr g_bdev_mgr = {
116 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
117 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
118 	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
119 	.init_complete = false,
120 	.module_init_complete = false,
121 	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
122 };
123 
124 static void
125 __attribute__((constructor))
126 _bdev_init(void)
127 {
128 	spdk_spin_init(&g_bdev_mgr.spinlock);
129 }
130 
131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);
132 
133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
134 
135 struct lba_range {
136 	struct spdk_bdev		*bdev;
137 	uint64_t			offset;
138 	uint64_t			length;
139 	bool				quiesce;
140 	void				*locked_ctx;
141 	struct spdk_thread		*owner_thread;
142 	struct spdk_bdev_channel	*owner_ch;
143 	TAILQ_ENTRY(lba_range)		tailq;
144 	TAILQ_ENTRY(lba_range)		tailq_module;
145 };
146 
147 static struct spdk_bdev_opts	g_bdev_opts = {
148 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
149 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
150 	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
151 	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
152 	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
153 };
154 
155 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
156 static void			*g_init_cb_arg = NULL;
157 
158 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
159 static void			*g_fini_cb_arg = NULL;
160 static struct spdk_thread	*g_fini_thread = NULL;
161 
162 struct spdk_bdev_qos_limit {
163 	/** IOs or bytes allowed per second (i.e., 1s). */
164 	uint64_t limit;
165 
166 	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
167 	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
168 	 *  some bytes are remaining, but the I/O is bigger than that amount. The
169 	 *  excess will be deducted from the next timeslice.
170 	 */
171 	int64_t remaining_this_timeslice;
172 
173 	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
174 	uint32_t min_per_timeslice;
175 
176 	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
177 	uint32_t max_per_timeslice;
178 
179 	/** Function to check whether to queue the IO.
180 	 * If The IO is allowed to pass, the quota will be reduced correspondingly.
181 	 */
182 	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
183 
184 	/** Function to rewind the quota once the IO was allowed to be sent by this
185 	 * limit but queued due to one of the further limits.
186 	 */
187 	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
188 };
189 
190 struct spdk_bdev_qos {
191 	/** Types of structure of rate limits. */
192 	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
193 
194 	/** The channel that all I/O are funneled through. */
195 	struct spdk_bdev_channel *ch;
196 
197 	/** The thread on which the poller is running. */
198 	struct spdk_thread *thread;
199 
200 	/** Size of a timeslice in tsc ticks. */
201 	uint64_t timeslice_size;
202 
203 	/** Timestamp of start of last timeslice. */
204 	uint64_t last_timeslice;
205 
206 	/** Poller that processes queued I/O commands each time slice. */
207 	struct spdk_poller *poller;
208 };
209 
210 struct spdk_bdev_mgmt_channel {
211 	/*
212 	 * Each thread keeps a cache of bdev_io - this allows
213 	 *  bdev threads which are *not* DPDK threads to still
214 	 *  benefit from a per-thread bdev_io cache.  Without
215 	 *  this, non-DPDK threads fetching from the mempool
216 	 *  incur a cmpxchg on get and put.
217 	 */
218 	bdev_io_stailq_t per_thread_cache;
219 	uint32_t	per_thread_cache_count;
220 	uint32_t	bdev_io_cache_size;
221 
222 	struct spdk_iobuf_channel iobuf;
223 
224 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
225 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
226 };
227 
228 /*
229  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
230  * will queue here their IO that awaits retry. It makes it possible to retry sending
231  * IO to one bdev after IO from other bdev completes.
232  */
233 struct spdk_bdev_shared_resource {
234 	/* The bdev management channel */
235 	struct spdk_bdev_mgmt_channel *mgmt_ch;
236 
237 	/*
238 	 * Count of I/O submitted to bdev module and waiting for completion.
239 	 * Incremented before submit_request() is called on an spdk_bdev_io.
240 	 */
241 	uint64_t		io_outstanding;
242 
243 	/*
244 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
245 	 *  on this channel.
246 	 */
247 	bdev_io_tailq_t		nomem_io;
248 
249 	/*
250 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
251 	 */
252 	uint64_t		nomem_threshold;
253 
254 	/*
255 	 * Indicate whether aborting nomem I/Os is in progress.
256 	 * If true, we should not touch the nomem_io list on I/O completions.
257 	 */
258 	bool			nomem_abort_in_progress;
259 
260 	/* I/O channel allocated by a bdev module */
261 	struct spdk_io_channel	*shared_ch;
262 
263 	struct spdk_poller	*nomem_poller;
264 
265 	/* Refcount of bdev channels using this resource */
266 	uint32_t		ref;
267 
268 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
269 };
270 
271 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
272 #define BDEV_CH_QOS_ENABLED		(1 << 1)
273 
274 struct spdk_bdev_channel {
275 	struct spdk_bdev	*bdev;
276 
277 	/* The channel for the underlying device */
278 	struct spdk_io_channel	*channel;
279 
280 	/* Accel channel */
281 	struct spdk_io_channel	*accel_channel;
282 
283 	/* Per io_device per thread data */
284 	struct spdk_bdev_shared_resource *shared_resource;
285 
286 	struct spdk_bdev_io_stat *stat;
287 
288 	/*
289 	 * Count of I/O submitted to the underlying dev module through this channel
290 	 * and waiting for completion.
291 	 */
292 	uint64_t		io_outstanding;
293 
294 	/*
295 	 * List of all submitted I/Os including I/O that are generated via splitting.
296 	 */
297 	bdev_io_tailq_t		io_submitted;
298 
299 	/*
300 	 * List of spdk_bdev_io that are currently queued because they write to a locked
301 	 * LBA range.
302 	 */
303 	bdev_io_tailq_t		io_locked;
304 
305 	/* List of I/Os with accel sequence being currently executed */
306 	bdev_io_tailq_t		io_accel_exec;
307 
308 	/* List of I/Os doing memory domain pull/push */
309 	bdev_io_tailq_t		io_memory_domain;
310 
311 	uint32_t		flags;
312 
313 	/* Counts number of bdev_io in the io_submitted TAILQ */
314 	uint16_t		queue_depth;
315 
316 	uint16_t		trace_id;
317 
318 	struct spdk_histogram_data *histogram;
319 
320 #ifdef SPDK_CONFIG_VTUNE
321 	uint64_t		start_tsc;
322 	uint64_t		interval_tsc;
323 	__itt_string_handle	*handle;
324 	struct spdk_bdev_io_stat *prev_stat;
325 #endif
326 
327 	lba_range_tailq_t	locked_ranges;
328 
329 	/** List of I/Os queued by QoS. */
330 	bdev_io_tailq_t		qos_queued_io;
331 };
332 
333 struct media_event_entry {
334 	struct spdk_bdev_media_event	event;
335 	TAILQ_ENTRY(media_event_entry)	tailq;
336 };
337 
338 #define MEDIA_EVENT_POOL_SIZE 64
339 
340 struct spdk_bdev_desc {
341 	struct spdk_bdev		*bdev;
342 	bool				write;
343 	bool				memory_domains_supported;
344 	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
345 	struct spdk_bdev_open_opts	opts;
346 	struct spdk_thread		*thread;
347 	struct {
348 		spdk_bdev_event_cb_t event_fn;
349 		void *ctx;
350 	}				callback;
351 	bool				closed;
352 	struct spdk_spinlock		spinlock;
353 	uint32_t			refs;
354 	TAILQ_HEAD(, media_event_entry)	pending_media_events;
355 	TAILQ_HEAD(, media_event_entry)	free_media_events;
356 	struct media_event_entry	*media_events_buffer;
357 	TAILQ_ENTRY(spdk_bdev_desc)	link;
358 
359 	uint64_t		timeout_in_sec;
360 	spdk_bdev_io_timeout_cb	cb_fn;
361 	void			*cb_arg;
362 	struct spdk_poller	*io_timeout_poller;
363 	struct spdk_bdev_module_claim	*claim;
364 };
365 
366 struct spdk_bdev_iostat_ctx {
367 	struct spdk_bdev_io_stat *stat;
368 	enum spdk_bdev_reset_stat_mode reset_mode;
369 	spdk_bdev_get_device_stat_cb cb;
370 	void *cb_arg;
371 };
372 
373 struct set_qos_limit_ctx {
374 	void (*cb_fn)(void *cb_arg, int status);
375 	void *cb_arg;
376 	struct spdk_bdev *bdev;
377 };
378 
379 struct spdk_bdev_channel_iter {
380 	spdk_bdev_for_each_channel_msg fn;
381 	spdk_bdev_for_each_channel_done cpl;
382 	struct spdk_io_channel_iter *i;
383 	void *ctx;
384 };
385 
386 struct spdk_bdev_io_error_stat {
387 	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
388 };
389 
390 enum bdev_io_retry_state {
391 	BDEV_IO_RETRY_STATE_INVALID,
392 	BDEV_IO_RETRY_STATE_PULL,
393 	BDEV_IO_RETRY_STATE_PULL_MD,
394 	BDEV_IO_RETRY_STATE_SUBMIT,
395 	BDEV_IO_RETRY_STATE_PUSH,
396 	BDEV_IO_RETRY_STATE_PUSH_MD,
397 	BDEV_IO_RETRY_STATE_GET_ACCEL_BUF,
398 };
399 
400 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
401 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
402 #define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
403 #define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
404 
405 static inline void bdev_io_complete(void *ctx);
406 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
407 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
408 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
409 static void _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io);
410 
411 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
412 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
413 
414 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
415 				struct spdk_io_channel *ch, void *_ctx);
416 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
417 
418 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
419 				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
420 				     uint64_t num_blocks,
421 				     struct spdk_memory_domain *domain, void *domain_ctx,
422 				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
423 				     spdk_bdev_io_completion_cb cb, void *cb_arg);
424 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
425 				      struct iovec *iov, int iovcnt, void *md_buf,
426 				      uint64_t offset_blocks, uint64_t num_blocks,
427 				      struct spdk_memory_domain *domain, void *domain_ctx,
428 				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
429 				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
430 				      spdk_bdev_io_completion_cb cb, void *cb_arg);
431 
432 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
433 			       uint64_t offset, uint64_t length,
434 			       lock_range_cb cb_fn, void *cb_arg);
435 
436 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
437 				 uint64_t offset, uint64_t length,
438 				 lock_range_cb cb_fn, void *cb_arg);
439 
440 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
441 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
442 
443 static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
444 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
445 static void claim_reset(struct spdk_bdev *bdev);
446 
447 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);
448 
449 static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);
450 
451 #define bdev_get_ext_io_opt(opts, field, defval) \
452 	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))
453 
454 static inline void
455 bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
456 {
457 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
458 	bdev_io->internal.ch->queue_depth++;
459 }
460 
461 static inline void
462 bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
463 {
464 	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
465 	bdev_io->internal.ch->queue_depth--;
466 }
467 
468 void
469 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
470 {
471 	if (!opts) {
472 		SPDK_ERRLOG("opts should not be NULL\n");
473 		return;
474 	}
475 
476 	if (!opts_size) {
477 		SPDK_ERRLOG("opts_size should not be zero value\n");
478 		return;
479 	}
480 
481 	opts->opts_size = opts_size;
482 
483 #define SET_FIELD(field) \
484 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
485 		opts->field = g_bdev_opts.field; \
486 	} \
487 
488 	SET_FIELD(bdev_io_pool_size);
489 	SET_FIELD(bdev_io_cache_size);
490 	SET_FIELD(bdev_auto_examine);
491 	SET_FIELD(iobuf_small_cache_size);
492 	SET_FIELD(iobuf_large_cache_size);
493 
494 	/* Do not remove this statement, you should always update this statement when you adding a new field,
495 	 * and do not forget to add the SET_FIELD statement for your added field. */
496 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
497 
498 #undef SET_FIELD
499 }
500 
501 int
502 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
503 {
504 	uint32_t min_pool_size;
505 
506 	if (!opts) {
507 		SPDK_ERRLOG("opts cannot be NULL\n");
508 		return -1;
509 	}
510 
511 	if (!opts->opts_size) {
512 		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
513 		return -1;
514 	}
515 
516 	/*
517 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
518 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
519 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
520 	 */
521 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
522 	if (opts->bdev_io_pool_size < min_pool_size) {
523 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
524 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
525 			    spdk_thread_get_count());
526 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
527 		return -1;
528 	}
529 
530 #define SET_FIELD(field) \
531         if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
532                 g_bdev_opts.field = opts->field; \
533         } \
534 
535 	SET_FIELD(bdev_io_pool_size);
536 	SET_FIELD(bdev_io_cache_size);
537 	SET_FIELD(bdev_auto_examine);
538 	SET_FIELD(iobuf_small_cache_size);
539 	SET_FIELD(iobuf_large_cache_size);
540 
541 	g_bdev_opts.opts_size = opts->opts_size;
542 
543 #undef SET_FIELD
544 
545 	return 0;
546 }
547 
548 static struct spdk_bdev *
549 bdev_get_by_name(const char *bdev_name)
550 {
551 	struct spdk_bdev_name find;
552 	struct spdk_bdev_name *res;
553 
554 	find.name = (char *)bdev_name;
555 	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
556 	if (res != NULL) {
557 		return res->bdev;
558 	}
559 
560 	return NULL;
561 }
562 
563 struct spdk_bdev *
564 spdk_bdev_get_by_name(const char *bdev_name)
565 {
566 	struct spdk_bdev *bdev;
567 
568 	spdk_spin_lock(&g_bdev_mgr.spinlock);
569 	bdev = bdev_get_by_name(bdev_name);
570 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
571 
572 	return bdev;
573 }
574 
575 struct bdev_io_status_string {
576 	enum spdk_bdev_io_status status;
577 	const char *str;
578 };
579 
580 static const struct bdev_io_status_string bdev_io_status_strings[] = {
581 	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
582 	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
583 	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
584 	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
585 	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
586 	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
587 	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
588 	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
589 	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
590 	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
591 };
592 
593 static const char *
594 bdev_io_status_get_string(enum spdk_bdev_io_status status)
595 {
596 	uint32_t i;
597 
598 	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
599 		if (bdev_io_status_strings[i].status == status) {
600 			return bdev_io_status_strings[i].str;
601 		}
602 	}
603 
604 	return "reserved";
605 }
606 
607 struct spdk_bdev_wait_for_examine_ctx {
608 	struct spdk_poller              *poller;
609 	spdk_bdev_wait_for_examine_cb	cb_fn;
610 	void				*cb_arg;
611 };
612 
613 static bool bdev_module_all_actions_completed(void);
614 
615 static int
616 bdev_wait_for_examine_cb(void *arg)
617 {
618 	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
619 
620 	if (!bdev_module_all_actions_completed()) {
621 		return SPDK_POLLER_IDLE;
622 	}
623 
624 	spdk_poller_unregister(&ctx->poller);
625 	ctx->cb_fn(ctx->cb_arg);
626 	free(ctx);
627 
628 	return SPDK_POLLER_BUSY;
629 }
630 
631 int
632 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
633 {
634 	struct spdk_bdev_wait_for_examine_ctx *ctx;
635 
636 	ctx = calloc(1, sizeof(*ctx));
637 	if (ctx == NULL) {
638 		return -ENOMEM;
639 	}
640 	ctx->cb_fn = cb_fn;
641 	ctx->cb_arg = cb_arg;
642 	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
643 
644 	return 0;
645 }
646 
647 struct spdk_bdev_examine_item {
648 	char *name;
649 	TAILQ_ENTRY(spdk_bdev_examine_item) link;
650 };
651 
652 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
653 
654 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
655 			g_bdev_examine_allowlist);
656 
657 static inline bool
658 bdev_examine_allowlist_check(const char *name)
659 {
660 	struct spdk_bdev_examine_item *item;
661 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
662 		if (strcmp(name, item->name) == 0) {
663 			return true;
664 		}
665 	}
666 	return false;
667 }
668 
669 static inline void
670 bdev_examine_allowlist_remove(const char *name)
671 {
672 	struct spdk_bdev_examine_item *item;
673 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
674 		if (strcmp(name, item->name) == 0) {
675 			TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
676 			free(item->name);
677 			free(item);
678 			break;
679 		}
680 	}
681 }
682 
683 static inline void
684 bdev_examine_allowlist_free(void)
685 {
686 	struct spdk_bdev_examine_item *item;
687 	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
688 		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
689 		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
690 		free(item->name);
691 		free(item);
692 	}
693 }
694 
695 static inline bool
696 bdev_in_examine_allowlist(struct spdk_bdev *bdev)
697 {
698 	struct spdk_bdev_alias *tmp;
699 	if (bdev_examine_allowlist_check(bdev->name)) {
700 		return true;
701 	}
702 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
703 		if (bdev_examine_allowlist_check(tmp->alias.name)) {
704 			return true;
705 		}
706 	}
707 	return false;
708 }
709 
710 static inline bool
711 bdev_ok_to_examine(struct spdk_bdev *bdev)
712 {
713 	/* Some bdevs may not support the READ command.
714 	 * Do not try to examine them.
715 	 */
716 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
717 		return false;
718 	}
719 
720 	if (g_bdev_opts.bdev_auto_examine) {
721 		return true;
722 	} else {
723 		return bdev_in_examine_allowlist(bdev);
724 	}
725 }
726 
727 static void
728 bdev_examine(struct spdk_bdev *bdev)
729 {
730 	struct spdk_bdev_module *module;
731 	struct spdk_bdev_module_claim *claim, *tmpclaim;
732 	uint32_t action;
733 
734 	if (!bdev_ok_to_examine(bdev)) {
735 		return;
736 	}
737 
738 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
739 		if (module->examine_config) {
740 			spdk_spin_lock(&module->internal.spinlock);
741 			action = module->internal.action_in_progress;
742 			module->internal.action_in_progress++;
743 			spdk_spin_unlock(&module->internal.spinlock);
744 			module->examine_config(bdev);
745 			if (action != module->internal.action_in_progress) {
746 				SPDK_ERRLOG("examine_config for module %s did not call "
747 					    "spdk_bdev_module_examine_done()\n", module->name);
748 			}
749 		}
750 	}
751 
752 	spdk_spin_lock(&bdev->internal.spinlock);
753 
754 	switch (bdev->internal.claim_type) {
755 	case SPDK_BDEV_CLAIM_NONE:
756 		/* Examine by all bdev modules */
757 		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
758 			if (module->examine_disk) {
759 				spdk_spin_lock(&module->internal.spinlock);
760 				module->internal.action_in_progress++;
761 				spdk_spin_unlock(&module->internal.spinlock);
762 				spdk_spin_unlock(&bdev->internal.spinlock);
763 				module->examine_disk(bdev);
764 				spdk_spin_lock(&bdev->internal.spinlock);
765 			}
766 		}
767 		break;
768 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
769 		/* Examine by the one bdev module with a v1 claim */
770 		module = bdev->internal.claim.v1.module;
771 		if (module->examine_disk) {
772 			spdk_spin_lock(&module->internal.spinlock);
773 			module->internal.action_in_progress++;
774 			spdk_spin_unlock(&module->internal.spinlock);
775 			spdk_spin_unlock(&bdev->internal.spinlock);
776 			module->examine_disk(bdev);
777 			return;
778 		}
779 		break;
780 	default:
781 		/* Examine by all bdev modules with a v2 claim */
782 		assert(claim_type_is_v2(bdev->internal.claim_type));
783 		/*
784 		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
785 		 * list, perhaps accessing freed memory. Without protection, this could happen
786 		 * while the lock is dropped during the examine callback.
787 		 */
788 		bdev->internal.examine_in_progress++;
789 
790 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
791 			module = claim->module;
792 
793 			if (module == NULL) {
794 				/* This is a vestigial claim, held by examine_count */
795 				continue;
796 			}
797 
798 			if (module->examine_disk == NULL) {
799 				continue;
800 			}
801 
802 			spdk_spin_lock(&module->internal.spinlock);
803 			module->internal.action_in_progress++;
804 			spdk_spin_unlock(&module->internal.spinlock);
805 
806 			/* Call examine_disk without holding internal.spinlock. */
807 			spdk_spin_unlock(&bdev->internal.spinlock);
808 			module->examine_disk(bdev);
809 			spdk_spin_lock(&bdev->internal.spinlock);
810 		}
811 
812 		assert(bdev->internal.examine_in_progress > 0);
813 		bdev->internal.examine_in_progress--;
814 		if (bdev->internal.examine_in_progress == 0) {
815 			/* Remove any claims that were released during examine_disk */
816 			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
817 				if (claim->desc != NULL) {
818 					continue;
819 				}
820 
821 				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
822 				free(claim);
823 			}
824 			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
825 				claim_reset(bdev);
826 			}
827 		}
828 	}
829 
830 	spdk_spin_unlock(&bdev->internal.spinlock);
831 }
832 
833 int
834 spdk_bdev_examine(const char *name)
835 {
836 	struct spdk_bdev *bdev;
837 	struct spdk_bdev_examine_item *item;
838 	struct spdk_thread *thread = spdk_get_thread();
839 
840 	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
841 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
842 			    thread ? spdk_thread_get_name(thread) : "null");
843 		return -EINVAL;
844 	}
845 
846 	if (g_bdev_opts.bdev_auto_examine) {
847 		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
848 		return -EINVAL;
849 	}
850 
851 	if (bdev_examine_allowlist_check(name)) {
852 		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
853 		return -EEXIST;
854 	}
855 
856 	item = calloc(1, sizeof(*item));
857 	if (!item) {
858 		return -ENOMEM;
859 	}
860 	item->name = strdup(name);
861 	if (!item->name) {
862 		free(item);
863 		return -ENOMEM;
864 	}
865 	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
866 
867 	bdev = spdk_bdev_get_by_name(name);
868 	if (bdev) {
869 		bdev_examine(bdev);
870 	}
871 	return 0;
872 }
873 
874 static inline void
875 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
876 {
877 	struct spdk_bdev_examine_item *item;
878 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
879 		spdk_json_write_object_begin(w);
880 		spdk_json_write_named_string(w, "method", "bdev_examine");
881 		spdk_json_write_named_object_begin(w, "params");
882 		spdk_json_write_named_string(w, "name", item->name);
883 		spdk_json_write_object_end(w);
884 		spdk_json_write_object_end(w);
885 	}
886 }
887 
888 struct spdk_bdev *
889 spdk_bdev_first(void)
890 {
891 	struct spdk_bdev *bdev;
892 
893 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
894 	if (bdev) {
895 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
896 	}
897 
898 	return bdev;
899 }
900 
901 struct spdk_bdev *
902 spdk_bdev_next(struct spdk_bdev *prev)
903 {
904 	struct spdk_bdev *bdev;
905 
906 	bdev = TAILQ_NEXT(prev, internal.link);
907 	if (bdev) {
908 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
909 	}
910 
911 	return bdev;
912 }
913 
914 static struct spdk_bdev *
915 _bdev_next_leaf(struct spdk_bdev *bdev)
916 {
917 	while (bdev != NULL) {
918 		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
919 			return bdev;
920 		} else {
921 			bdev = TAILQ_NEXT(bdev, internal.link);
922 		}
923 	}
924 
925 	return bdev;
926 }
927 
928 struct spdk_bdev *
929 spdk_bdev_first_leaf(void)
930 {
931 	struct spdk_bdev *bdev;
932 
933 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
934 
935 	if (bdev) {
936 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
937 	}
938 
939 	return bdev;
940 }
941 
942 struct spdk_bdev *
943 spdk_bdev_next_leaf(struct spdk_bdev *prev)
944 {
945 	struct spdk_bdev *bdev;
946 
947 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
948 
949 	if (bdev) {
950 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
951 	}
952 
953 	return bdev;
954 }
955 
956 static inline bool
957 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
958 {
959 	return bdev_io->internal.f.has_memory_domain;
960 }
961 
962 static inline bool
963 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
964 {
965 	return bdev_io->internal.f.has_accel_sequence;
966 }
967 
968 static inline uint32_t
969 bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
970 {
971 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
972 
973 	if (spdk_unlikely(desc->opts.hide_metadata)) {
974 		return bdev->blocklen - bdev->md_len;
975 	} else {
976 		return bdev->blocklen;
977 	}
978 }
979 
980 static inline uint32_t
981 bdev_io_get_block_size(struct spdk_bdev_io *bdev_io)
982 {
983 	struct spdk_bdev *bdev = bdev_io->bdev;
984 
985 	if (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT) {
986 		if (bdev->md_len == spdk_dif_pi_format_get_size(bdev->dif_pi_format)) {
987 			return bdev->blocklen - bdev->md_len;
988 		} else {
989 			return bdev->blocklen;
990 		}
991 	}
992 
993 	return bdev_desc_get_block_size(bdev_io->internal.desc);
994 }
995 
996 static inline void
997 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
998 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
999 {
1000 	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
1001 	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
1002 	 * channels we will instead wait for half to complete.
1003 	 */
1004 	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
1005 					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
1006 
1007 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
1008 	bdev_io->internal.retry_state = state;
1009 	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
1010 }
1011 
1012 static inline void
1013 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
1014 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1015 {
1016 	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
1017 	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
1018 	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));
1019 
1020 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
1021 	bdev_io->internal.retry_state = state;
1022 	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
1023 }
1024 
1025 void
1026 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
1027 {
1028 	struct iovec *iovs;
1029 
1030 	if (bdev_io->u.bdev.iovs == NULL) {
1031 		bdev_io->u.bdev.iovs = &bdev_io->iov;
1032 		bdev_io->u.bdev.iovcnt = 1;
1033 	}
1034 
1035 	iovs = bdev_io->u.bdev.iovs;
1036 
1037 	assert(iovs != NULL);
1038 	assert(bdev_io->u.bdev.iovcnt >= 1);
1039 
1040 	iovs[0].iov_base = buf;
1041 	iovs[0].iov_len = len;
1042 }
1043 
1044 void
1045 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1046 {
1047 	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
1048 	bdev_io->u.bdev.md_buf = md_buf;
1049 }
1050 
1051 static bool
1052 _is_buf_allocated(const struct iovec *iovs)
1053 {
1054 	if (iovs == NULL) {
1055 		return false;
1056 	}
1057 
1058 	return iovs[0].iov_base != NULL;
1059 }
1060 
1061 static bool
1062 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
1063 {
1064 	int i;
1065 	uintptr_t iov_base;
1066 
1067 	if (spdk_likely(alignment == 1)) {
1068 		return true;
1069 	}
1070 
1071 	for (i = 0; i < iovcnt; i++) {
1072 		iov_base = (uintptr_t)iovs[i].iov_base;
1073 		if ((iov_base & (alignment - 1)) != 0) {
1074 			return false;
1075 		}
1076 	}
1077 
1078 	return true;
1079 }
1080 
1081 static inline bool
1082 bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1083 {
1084 	return (bdev_io->bdev->md_len != 0) &&
1085 	       (desc->opts.hide_metadata ||
1086 		(bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT));
1087 }
1088 
1089 static inline bool
1090 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
1091 {
1092 	if (!bdev_io_use_accel_sequence(bdev_io)) {
1093 		return false;
1094 	}
1095 
1096 	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
1097 	 * bdev module didn't support accel sequences */
1098 	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
1099 }
1100 
1101 static inline void
1102 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
1103 			      struct spdk_bdev_shared_resource *shared_resource)
1104 {
1105 	bdev_ch->io_outstanding++;
1106 	shared_resource->io_outstanding++;
1107 }
1108 
1109 static inline void
1110 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1111 			      struct spdk_bdev_shared_resource *shared_resource)
1112 {
1113 	assert(bdev_ch->io_outstanding > 0);
1114 	assert(shared_resource->io_outstanding > 0);
1115 	bdev_ch->io_outstanding--;
1116 	shared_resource->io_outstanding--;
1117 }
1118 
1119 static void
1120 bdev_io_submit_sequence_cb(void *ctx, int status)
1121 {
1122 	struct spdk_bdev_io *bdev_io = ctx;
1123 
1124 	assert(bdev_io_use_accel_sequence(bdev_io));
1125 
1126 	bdev_io->u.bdev.accel_sequence = NULL;
1127 	bdev_io->internal.f.has_accel_sequence = false;
1128 
1129 	if (spdk_unlikely(status != 0)) {
1130 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
1131 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1132 		bdev_io_complete_unsubmitted(bdev_io);
1133 		return;
1134 	}
1135 
1136 	bdev_io_submit(bdev_io);
1137 }
1138 
1139 static void
1140 bdev_io_exec_sequence_cb(void *ctx, int status)
1141 {
1142 	struct spdk_bdev_io *bdev_io = ctx;
1143 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1144 
1145 	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1146 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1147 
1148 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1149 		bdev_ch_retry_io(ch);
1150 	}
1151 
1152 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1153 }
1154 
1155 static void
1156 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
1157 {
1158 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1159 
1160 	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1161 	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1162 	assert(bdev_io_use_accel_sequence(bdev_io));
1163 
1164 	/* Since the operations are appended during submission, they're in the opposite order than
1165 	 * how we want to execute them for reads (i.e. we need to execute the most recently added
1166 	 * operation first), so reverse the sequence before executing it.
1167 	 */
1168 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1169 		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
1170 	}
1171 
1172 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1173 	bdev_io_increment_outstanding(ch, ch->shared_resource);
1174 	bdev_io->internal.data_transfer_cpl = cb_fn;
1175 
1176 	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
1177 				   bdev_io_exec_sequence_cb, bdev_io);
1178 }
1179 
1180 static void
1181 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
1182 {
1183 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
1184 	void *buf;
1185 
1186 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1187 		buf = bdev_io->internal.buf.ptr;
1188 		bdev_io->internal.buf.ptr = NULL;
1189 		bdev_io->internal.f.has_buf = false;
1190 		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
1191 		bdev_io->internal.get_aux_buf_cb = NULL;
1192 	} else {
1193 		assert(bdev_io->internal.get_buf_cb != NULL);
1194 		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
1195 		bdev_io->internal.get_buf_cb = NULL;
1196 	}
1197 }
1198 
1199 static void
1200 _bdev_io_pull_buffer_cpl(void *ctx, int rc)
1201 {
1202 	struct spdk_bdev_io *bdev_io = ctx;
1203 
1204 	if (rc) {
1205 		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
1206 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1207 	}
1208 	bdev_io_get_buf_complete(bdev_io, !rc);
1209 }
1210 
1211 static void
1212 bdev_io_pull_md_buf_done(void *ctx, int status)
1213 {
1214 	struct spdk_bdev_io *bdev_io = ctx;
1215 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1216 
1217 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1218 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1219 
1220 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1221 		bdev_ch_retry_io(ch);
1222 	}
1223 
1224 	assert(bdev_io->internal.data_transfer_cpl);
1225 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1226 }
1227 
1228 static void
1229 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
1230 {
1231 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1232 	int rc = 0;
1233 
1234 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1235 		assert(bdev_io->internal.f.has_bounce_buf);
1236 		if (bdev_io_use_memory_domain(bdev_io)) {
1237 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1238 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1239 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1240 							  bdev_io->internal.memory_domain_ctx,
1241 							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
1242 							  &bdev_io->internal.bounce_buf.md_iov, 1,
1243 							  bdev_io_pull_md_buf_done, bdev_io);
1244 			if (rc == 0) {
1245 				/* Continue to submit IO in completion callback */
1246 				return;
1247 			}
1248 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1249 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1250 			if (rc != -ENOMEM) {
1251 				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
1252 					    spdk_memory_domain_get_dma_device_id(
1253 						    bdev_io->internal.memory_domain), rc);
1254 			}
1255 		} else {
1256 			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
1257 			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
1258 			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1259 		}
1260 	}
1261 
1262 	if (spdk_unlikely(rc == -ENOMEM)) {
1263 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
1264 	} else {
1265 		assert(bdev_io->internal.data_transfer_cpl);
1266 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1267 	}
1268 }
1269 
1270 static void
1271 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1272 {
1273 	assert(bdev_io->internal.f.has_bounce_buf);
1274 
1275 	/* save original md_buf */
1276 	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
1277 	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
1278 	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
1279 	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
1280 	/* set bounce md_buf */
1281 	bdev_io->u.bdev.md_buf = md_buf;
1282 
1283 	bdev_io_pull_md_buf(bdev_io);
1284 }
1285 
1286 static void
1287 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
1288 {
1289 	struct spdk_bdev *bdev = bdev_io->bdev;
1290 	uint64_t md_len;
1291 	void *buf;
1292 
1293 	if (spdk_bdev_is_md_separate(bdev)) {
1294 		assert(!bdev_io_use_accel_sequence(bdev_io));
1295 
1296 		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
1297 		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
1298 
1299 		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
1300 
1301 		if (bdev_io->u.bdev.md_buf != NULL) {
1302 			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
1303 			return;
1304 		} else {
1305 			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
1306 		}
1307 	}
1308 
1309 	bdev_io_get_buf_complete(bdev_io, true);
1310 }
1311 
1312 static inline void
1313 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
1314 {
1315 	if (rc) {
1316 		SPDK_ERRLOG("Failed to get data buffer\n");
1317 		assert(bdev_io->internal.data_transfer_cpl);
1318 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1319 		return;
1320 	}
1321 
1322 	_bdev_io_set_md_buf(bdev_io);
1323 }
1324 
1325 static void
1326 bdev_io_pull_data_done_and_track(void *ctx, int status)
1327 {
1328 	struct spdk_bdev_io *bdev_io = ctx;
1329 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1330 
1331 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1332 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1333 
1334 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1335 		bdev_ch_retry_io(ch);
1336 	}
1337 
1338 	bdev_io_pull_data_done(bdev_io, status);
1339 }
1340 
1341 static void
1342 bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
1343 {
1344 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1345 	struct spdk_bdev_desc *desc = bdev_io->internal.desc;
1346 	int rc = 0;
1347 
1348 	assert(bdev_io->internal.f.has_bounce_buf);
1349 
1350 	if (bdev_io_needs_metadata(desc, bdev_io)) {
1351 		assert(bdev_io->bdev->md_interleave);
1352 
1353 		bdev_io->u.bdev.dif_check_flags &= ~SPDK_DIF_FLAGS_NVME_PRACT;
1354 
1355 		if (!bdev_io_use_accel_sequence(bdev_io)) {
1356 			bdev_io->internal.accel_sequence = NULL;
1357 		}
1358 
1359 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1360 			rc = spdk_accel_append_dif_generate_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1361 					bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1362 					bdev_io->u.bdev.memory_domain,
1363 					bdev_io->u.bdev.memory_domain_ctx,
1364 					bdev_io->internal.bounce_buf.orig_iovs,
1365 					bdev_io->internal.bounce_buf.orig_iovcnt,
1366 					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1367 					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1368 					bdev_io->u.bdev.num_blocks,
1369 					&bdev_io->u.bdev.dif_ctx,
1370 					NULL, NULL);
1371 		} else {
1372 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1373 			rc = spdk_accel_append_dif_verify_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1374 							       bdev_io->internal.bounce_buf.orig_iovs,
1375 							       bdev_io->internal.bounce_buf.orig_iovcnt,
1376 							       bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1377 							       bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1378 							       bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1379 							       bdev_io->u.bdev.memory_domain,
1380 							       bdev_io->u.bdev.memory_domain_ctx,
1381 							       bdev_io->u.bdev.num_blocks,
1382 							       &bdev_io->u.bdev.dif_ctx,
1383 							       &bdev_io->u.bdev.dif_err,
1384 							       NULL, NULL);
1385 		}
1386 
1387 		if (spdk_likely(rc == 0)) {
1388 			bdev_io->internal.f.has_accel_sequence = true;
1389 			bdev_io->u.bdev.accel_sequence = bdev_io->internal.accel_sequence;
1390 		} else if (rc != -ENOMEM) {
1391 			SPDK_ERRLOG("Failed to append generate/verify_copy to accel sequence: %p\n",
1392 				    bdev_io->internal.accel_sequence);
1393 		}
1394 	} else if (bdev_io_needs_sequence_exec(desc, bdev_io) ||
1395 		   (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
1396 		/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
1397 		 * sequence, append a copy operation making accel change the src/dst buffers of the previous
1398 		 * operation */
1399 		assert(bdev_io_use_accel_sequence(bdev_io));
1400 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1401 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1402 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1403 						    NULL, NULL,
1404 						    bdev_io->internal.bounce_buf.orig_iovs,
1405 						    bdev_io->internal.bounce_buf.orig_iovcnt,
1406 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1407 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1408 						    NULL, NULL);
1409 		} else {
1410 			/* We need to reverse the src/dst for reads */
1411 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1412 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1413 						    bdev_io->internal.bounce_buf.orig_iovs,
1414 						    bdev_io->internal.bounce_buf.orig_iovcnt,
1415 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
1416 						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
1417 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1418 						    NULL, NULL, NULL, NULL);
1419 		}
1420 
1421 		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
1422 			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
1423 				    bdev_io->internal.accel_sequence);
1424 		}
1425 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1426 		/* if this is write path, copy data from original buffer to bounce buffer */
1427 		if (bdev_io_use_memory_domain(bdev_io)) {
1428 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1429 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1430 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1431 							  bdev_io->internal.memory_domain_ctx,
1432 							  bdev_io->internal.bounce_buf.orig_iovs,
1433 							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1434 							  bdev_io->u.bdev.iovs, 1,
1435 							  bdev_io_pull_data_done_and_track,
1436 							  bdev_io);
1437 			if (rc == 0) {
1438 				/* Continue to submit IO in completion callback */
1439 				return;
1440 			}
1441 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1442 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1443 			if (rc != -ENOMEM) {
1444 				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
1445 					    spdk_memory_domain_get_dma_device_id(
1446 						    bdev_io->internal.memory_domain));
1447 			}
1448 		} else {
1449 			assert(bdev_io->u.bdev.iovcnt == 1);
1450 			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
1451 					      bdev_io->u.bdev.iovs[0].iov_len,
1452 					      bdev_io->internal.bounce_buf.orig_iovs,
1453 					      bdev_io->internal.bounce_buf.orig_iovcnt);
1454 		}
1455 	}
1456 
1457 	if (spdk_unlikely(rc == -ENOMEM)) {
1458 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1459 	} else {
1460 		bdev_io_pull_data_done(bdev_io, rc);
1461 	}
1462 }
1463 
1464 static void
1465 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
1466 			      bdev_copy_bounce_buffer_cpl cpl_cb)
1467 {
1468 	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;
1469 
1470 	assert(bdev_io->internal.f.has_bounce_buf == false);
1471 
1472 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1473 	bdev_io->internal.f.has_bounce_buf = true;
1474 	/* save original iovec */
1475 	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
1476 	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
1477 	/* zero the other data members */
1478 	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
1479 	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
1480 	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
1481 	/* set bounce iov */
1482 	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
1483 	bdev_io->u.bdev.iovcnt = 1;
1484 	/* set bounce buffer for this operation */
1485 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1486 	bdev_io->u.bdev.iovs[0].iov_len = len;
1487 	/* Now we use 1 iov, the split condition could have been changed */
1488 	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
1489 
1490 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1491 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1492 	} else {
1493 		bdev_io_pull_data(bdev_io);
1494 	}
1495 }
1496 
1497 static void
1498 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
1499 {
1500 	struct spdk_bdev *bdev = bdev_io->bdev;
1501 	bool buf_allocated;
1502 	uint64_t alignment;
1503 	void *aligned_buf;
1504 
1505 	bdev_io->internal.buf.ptr = buf;
1506 	bdev_io->internal.f.has_buf = true;
1507 
1508 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1509 		bdev_io_get_buf_complete(bdev_io, true);
1510 		return;
1511 	}
1512 
1513 	alignment = spdk_bdev_get_buf_align(bdev);
1514 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
1515 	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
1516 
1517 	if (buf_allocated) {
1518 		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
1519 		/* Continue in completion callback */
1520 		return;
1521 	} else {
1522 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
1523 	}
1524 
1525 	_bdev_io_set_md_buf(bdev_io);
1526 }
1527 
1528 static inline uint64_t
1529 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
1530 {
1531 	struct spdk_bdev *bdev = bdev_io->bdev;
1532 	uint64_t md_len, alignment;
1533 
1534 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1535 
1536 	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
1537 	alignment = spdk_bdev_get_buf_align(bdev) - 1;
1538 
1539 	return len + alignment + md_len;
1540 }
1541 
1542 static void
1543 bdev_io_put_accel_buf(struct spdk_bdev_io *bdev_io)
1544 {
1545 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1546 
1547 	spdk_accel_put_buf(ch->accel_channel,
1548 			   bdev_io->internal.buf.ptr,
1549 			   bdev_io->u.bdev.memory_domain,
1550 			   bdev_io->u.bdev.memory_domain_ctx);
1551 }
1552 
1553 static void
1554 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1555 {
1556 	struct spdk_bdev_mgmt_channel *ch;
1557 
1558 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1559 	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1560 }
1561 
1562 static void
1563 bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1564 {
1565 	assert(bdev_io->internal.f.has_buf);
1566 
1567 	if (bdev_io->u.bdev.memory_domain == spdk_accel_get_memory_domain()) {
1568 		bdev_io_put_accel_buf(bdev_io);
1569 	} else {
1570 		assert(bdev_io->u.bdev.memory_domain == NULL);
1571 		_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr,
1572 				 bdev_io->internal.buf.len);
1573 	}
1574 	bdev_io->internal.buf.ptr = NULL;
1575 	bdev_io->internal.f.has_buf = false;
1576 }
1577 
1578 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
1579 			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);
1580 
1581 void
1582 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1583 {
1584 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1585 
1586 	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);
1587 
1588 	assert(buf != NULL);
1589 	_bdev_io_put_buf(bdev_io, buf, len);
1590 }
1591 
1592 static inline void
1593 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
1594 		    struct spdk_bdev_io *bdev_io)
1595 {
1596 	/* After a request is submitted to a bdev module, the ownership of an accel sequence
1597 	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
1598 	 * sequence pointer to make sure we won't touch it anymore. */
1599 	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
1600 	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
1601 		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1602 		bdev_io->internal.f.has_accel_sequence = false;
1603 	}
1604 
1605 	/* The generic bdev layer should not pass an I/O with a dif_check_flags set that
1606 	 * the underlying bdev does not support. Add an assert to check this.
1607 	 */
1608 	assert((bdev_io->type != SPDK_BDEV_IO_TYPE_WRITE &&
1609 		bdev_io->type != SPDK_BDEV_IO_TYPE_READ) ||
1610 	       ((bdev_io->u.bdev.dif_check_flags & bdev->dif_check_flags) ==
1611 		bdev_io->u.bdev.dif_check_flags));
1612 
1613 	bdev->fn_table->submit_request(ioch, bdev_io);
1614 }
1615 
1616 static inline void
1617 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
1618 {
1619 	struct spdk_bdev *bdev = bdev_io->bdev;
1620 
1621 	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
1622 	bdev_io->internal.error.nvme.cdw0 = 0;
1623 	bdev_io->num_retries++;
1624 	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1625 }
1626 
1627 static void
1628 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
1629 {
1630 	struct spdk_bdev_io *bdev_io;
1631 
1632 	if (shared_resource->nomem_abort_in_progress) {
1633 		/**
1634 		 * We are aborting nomem I/Os, do not touch nomem_io list now.
1635 		 */
1636 		return;
1637 	}
1638 
1639 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1640 		/*
1641 		 * Allow some more I/O to complete before retrying the nomem_io queue.
1642 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
1643 		 *  the context of a completion, because the resources for the I/O are
1644 		 *  not released until control returns to the bdev poller.  Also, we
1645 		 *  may require several small I/O to complete before a larger I/O
1646 		 *  (that requires splitting) can be submitted.
1647 		 */
1648 		return;
1649 	}
1650 
1651 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1652 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1653 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1654 
1655 		switch (bdev_io->internal.retry_state) {
1656 		case BDEV_IO_RETRY_STATE_SUBMIT:
1657 			bdev_ch_resubmit_io(shared_resource, bdev_io);
1658 			break;
1659 		case BDEV_IO_RETRY_STATE_PULL:
1660 			bdev_io_pull_data(bdev_io);
1661 			break;
1662 		case BDEV_IO_RETRY_STATE_PULL_MD:
1663 			bdev_io_pull_md_buf(bdev_io);
1664 			break;
1665 		case BDEV_IO_RETRY_STATE_PUSH:
1666 			bdev_io_push_bounce_data(bdev_io);
1667 			break;
1668 		case BDEV_IO_RETRY_STATE_PUSH_MD:
1669 			bdev_io_push_bounce_md_buf(bdev_io);
1670 			break;
1671 		case BDEV_IO_RETRY_STATE_GET_ACCEL_BUF:
1672 			_bdev_io_get_accel_buf(bdev_io);
1673 			break;
1674 		default:
1675 			assert(0 && "invalid retry state");
1676 			break;
1677 		}
1678 
1679 		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
1680 			/* This IO completed again with NOMEM status, so break the loop and
1681 			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
1682 			 * always gets requeued at the front of the list, to maintain
1683 			 * ordering.
1684 			 */
1685 			break;
1686 		}
1687 	}
1688 }
1689 
1690 static void
1691 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1692 {
1693 	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
1694 }
1695 
1696 static int
1697 bdev_no_mem_poller(void *ctx)
1698 {
1699 	struct spdk_bdev_shared_resource *shared_resource = ctx;
1700 
1701 	spdk_poller_unregister(&shared_resource->nomem_poller);
1702 
1703 	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1704 		bdev_shared_ch_retry_io(shared_resource);
1705 	}
1706 	/* the retry cb may re-register the poller so double check */
1707 	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
1708 	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
1709 		/* No IOs were submitted, try again */
1710 		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1711 						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1712 	}
1713 
1714 	return SPDK_POLLER_BUSY;
1715 }
1716 
1717 static inline bool
1718 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1719 {
1720 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1721 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1722 
1723 	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1724 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1725 		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);
1726 
1727 		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
1728 			/* Special case when we have nomem IOs and no outstanding IOs which completions
1729 			 * could trigger retry of queued IOs
1730 			 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no
1731 			 * new IOs submitted, e.g. qd==1 */
1732 			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
1733 							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
1734 		}
1735 		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
1736 		 * ownership of that sequence is transferred back to the bdev layer, so we need to
1737 		 * restore internal.accel_sequence to make sure that the sequence is handled
1738 		 * correctly in case the I/O is later aborted. */
1739 		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
1740 		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
1741 			assert(!bdev_io_use_accel_sequence(bdev_io));
1742 			bdev_io->internal.f.has_accel_sequence = true;
1743 			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
1744 		}
1745 
1746 		return true;
1747 	}
1748 
1749 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1750 		bdev_ch_retry_io(bdev_ch);
1751 	}
1752 
1753 	return false;
1754 }
1755 
1756 static void
1757 _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1758 {
1759 	struct spdk_bdev_io *bdev_io = ctx;
1760 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1761 
1762 	if (rc) {
1763 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1764 	}
1765 	/* We want to free the bounce buffer here since we know we're done with it (as opposed
1766 	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
1767 	 */
1768 	bdev_io_put_buf(bdev_io);
1769 
1770 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1771 		bdev_ch_retry_io(ch);
1772 	}
1773 
1774 	/* Continue with IO completion flow */
1775 	bdev_io_complete(bdev_io);
1776 }
1777 
1778 static void
1779 bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
1780 {
1781 	struct spdk_bdev_io *bdev_io = ctx;
1782 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1783 
1784 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1785 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1786 	bdev_io->internal.f.has_bounce_buf = false;
1787 
1788 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1789 		bdev_ch_retry_io(ch);
1790 	}
1791 
1792 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1793 }
1794 
1795 static inline void
1796 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
1797 {
1798 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1799 	int rc = 0;
1800 
1801 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1802 	assert(bdev_io->internal.f.has_bounce_buf);
1803 
1804 	/* do the same for metadata buffer */
1805 	if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) {
1806 		assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1807 
1808 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1809 			if (bdev_io_use_memory_domain(bdev_io)) {
1810 				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1811 				bdev_io_increment_outstanding(ch, ch->shared_resource);
1812 				/* If memory domain is used then we need to call async push function */
1813 				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1814 								  bdev_io->internal.memory_domain_ctx,
1815 								  &bdev_io->internal.bounce_buf.orig_md_iov,
1816 								  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1817 								  &bdev_io->internal.bounce_buf.md_iov, 1,
1818 								  bdev_io_push_bounce_md_buf_done,
1819 								  bdev_io);
1820 				if (rc == 0) {
1821 					/* Continue IO completion in async callback */
1822 					return;
1823 				}
1824 				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1825 				bdev_io_decrement_outstanding(ch, ch->shared_resource);
1826 				if (rc != -ENOMEM) {
1827 					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1828 						    spdk_memory_domain_get_dma_device_id(
1829 							    bdev_io->internal.memory_domain));
1830 				}
1831 			} else {
1832 				memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1833 				       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
1834 			}
1835 		}
1836 	}
1837 
1838 	if (spdk_unlikely(rc == -ENOMEM)) {
1839 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
1840 	} else {
1841 		assert(bdev_io->internal.data_transfer_cpl);
1842 		bdev_io->internal.f.has_bounce_buf = false;
1843 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1844 	}
1845 }
1846 
1847 static inline void
1848 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
1849 {
1850 	assert(bdev_io->internal.data_transfer_cpl);
1851 	if (rc) {
1852 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1853 		return;
1854 	}
1855 
1856 	/* set original buffer for this io */
1857 	bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt;
1858 	bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs;
1859 
1860 	/* We don't set bdev_io->internal.f.has_bounce_buf to false here because
1861 	 * we still need to clear the md buf */
1862 
1863 	bdev_io_push_bounce_md_buf(bdev_io);
1864 }
1865 
1866 static void
1867 bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
1868 {
1869 	struct spdk_bdev_io *bdev_io = ctx;
1870 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1871 
1872 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1873 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1874 
1875 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1876 		bdev_ch_retry_io(ch);
1877 	}
1878 
1879 	bdev_io_push_bounce_data_done(bdev_io, status);
1880 }
1881 
1882 static inline void
1883 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
1884 {
1885 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1886 	int rc = 0;
1887 
1888 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1889 	assert(!bdev_io_use_accel_sequence(bdev_io));
1890 	assert(bdev_io->internal.f.has_bounce_buf);
1891 
1892 	/* if this is read path, copy data from bounce buffer to original buffer */
1893 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1894 		if (bdev_io_use_memory_domain(bdev_io)) {
1895 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1896 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1897 			/* If memory domain is used then we need to call async push function */
1898 			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1899 							  bdev_io->internal.memory_domain_ctx,
1900 							  bdev_io->internal.bounce_buf.orig_iovs,
1901 							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
1902 							  &bdev_io->internal.bounce_buf.iov, 1,
1903 							  bdev_io_push_bounce_data_done_and_track,
1904 							  bdev_io);
1905 			if (rc == 0) {
1906 				/* Continue IO completion in async callback */
1907 				return;
1908 			}
1909 
1910 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1911 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1912 			if (rc != -ENOMEM) {
1913 				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1914 					    spdk_memory_domain_get_dma_device_id(
1915 						    bdev_io->internal.memory_domain));
1916 			}
1917 		} else {
1918 			spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs,
1919 					      bdev_io->internal.bounce_buf.orig_iovcnt,
1920 					      bdev_io->internal.bounce_buf.iov.iov_base,
1921 					      bdev_io->internal.bounce_buf.iov.iov_len);
1922 		}
1923 	}
1924 
1925 	if (spdk_unlikely(rc == -ENOMEM)) {
1926 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
1927 	} else {
1928 		bdev_io_push_bounce_data_done(bdev_io, rc);
1929 	}
1930 }
1931 
1932 static inline void
1933 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1934 {
1935 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1936 	bdev_io_push_bounce_data(bdev_io);
1937 }
1938 
1939 static void
1940 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1941 {
1942 	struct spdk_bdev_io *bdev_io;
1943 
1944 	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1945 	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
1946 }
1947 
1948 static void
1949 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1950 {
1951 	struct spdk_bdev_mgmt_channel *mgmt_ch;
1952 	uint64_t max_len;
1953 	void *buf;
1954 
1955 	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1956 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1957 	max_len = bdev_io_get_max_buf_len(bdev_io, len);
1958 
1959 	if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) {
1960 		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1961 		bdev_io_get_buf_complete(bdev_io, false);
1962 		return;
1963 	}
1964 
1965 	bdev_io->internal.buf.len = len;
1966 	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1967 			     bdev_io_get_iobuf_cb);
1968 	if (buf != NULL) {
1969 		_bdev_io_set_buf(bdev_io, buf, len);
1970 	}
1971 }
1972 
1973 void
1974 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1975 {
1976 	struct spdk_bdev *bdev = bdev_io->bdev;
1977 	uint64_t alignment;
1978 
1979 	assert(cb != NULL);
1980 	bdev_io->internal.get_buf_cb = cb;
1981 
1982 	alignment = spdk_bdev_get_buf_align(bdev);
1983 
1984 	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1985 	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1986 		/* Buffer already present and aligned */
1987 		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1988 		return;
1989 	}
1990 
1991 	bdev_io_get_buf(bdev_io, len);
1992 }
1993 
1994 static void
1995 _bdev_io_get_bounce_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1996 			uint64_t len)
1997 {
1998 	assert(cb != NULL);
1999 	bdev_io->internal.get_buf_cb = cb;
2000 
2001 	bdev_io_get_buf(bdev_io, len);
2002 }
2003 
2004 static void
2005 _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io)
2006 {
2007 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2008 	void *buf;
2009 	int rc;
2010 
2011 	rc = spdk_accel_get_buf(ch->accel_channel,
2012 				bdev_io->internal.buf.len,
2013 				&buf,
2014 				&bdev_io->u.bdev.memory_domain,
2015 				&bdev_io->u.bdev.memory_domain_ctx);
2016 	if (rc != 0) {
2017 		bdev_queue_nomem_io_tail(ch->shared_resource, bdev_io,
2018 					 BDEV_IO_RETRY_STATE_GET_ACCEL_BUF);
2019 		return;
2020 	}
2021 
2022 	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
2023 }
2024 
2025 static inline void
2026 bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
2027 		      uint64_t len)
2028 {
2029 	bdev_io->internal.buf.len = len;
2030 	bdev_io->internal.get_buf_cb = cb;
2031 
2032 	_bdev_io_get_accel_buf(bdev_io);
2033 }
2034 
2035 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf,
2036 			      "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0);
2037 
2038 void
2039 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
2040 {
2041 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
2042 
2043 	SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf);
2044 
2045 	assert(cb != NULL);
2046 	assert(bdev_io->internal.get_aux_buf_cb == NULL);
2047 	bdev_io->internal.get_aux_buf_cb = cb;
2048 	bdev_io_get_buf(bdev_io, len);
2049 }
2050 
2051 static int
2052 bdev_module_get_max_ctx_size(void)
2053 {
2054 	struct spdk_bdev_module *bdev_module;
2055 	int max_bdev_module_size = 0;
2056 
2057 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2058 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
2059 			max_bdev_module_size = bdev_module->get_ctx_size();
2060 		}
2061 	}
2062 
2063 	return max_bdev_module_size;
2064 }
2065 
2066 static void
2067 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2068 {
2069 	if (!bdev->internal.histogram_enabled) {
2070 		return;
2071 	}
2072 
2073 	spdk_json_write_object_begin(w);
2074 	spdk_json_write_named_string(w, "method", "bdev_enable_histogram");
2075 
2076 	spdk_json_write_named_object_begin(w, "params");
2077 	spdk_json_write_named_string(w, "name", bdev->name);
2078 
2079 	spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled);
2080 
2081 	if (bdev->internal.histogram_io_type) {
2082 		spdk_json_write_named_string(w, "opc",
2083 					     spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type));
2084 	}
2085 
2086 	spdk_json_write_object_end(w);
2087 
2088 	spdk_json_write_object_end(w);
2089 }
2090 
2091 static void
2092 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
2093 {
2094 	int i;
2095 	struct spdk_bdev_qos *qos = bdev->internal.qos;
2096 	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
2097 
2098 	if (!qos) {
2099 		return;
2100 	}
2101 
2102 	spdk_bdev_get_qos_rate_limits(bdev, limits);
2103 
2104 	spdk_json_write_object_begin(w);
2105 	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
2106 
2107 	spdk_json_write_named_object_begin(w, "params");
2108 	spdk_json_write_named_string(w, "name", bdev->name);
2109 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2110 		if (limits[i] > 0) {
2111 			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
2112 		}
2113 	}
2114 	spdk_json_write_object_end(w);
2115 
2116 	spdk_json_write_object_end(w);
2117 }
2118 
2119 void
2120 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
2121 {
2122 	struct spdk_bdev_module *bdev_module;
2123 	struct spdk_bdev *bdev;
2124 
2125 	assert(w != NULL);
2126 
2127 	spdk_json_write_array_begin(w);
2128 
2129 	spdk_json_write_object_begin(w);
2130 	spdk_json_write_named_string(w, "method", "bdev_set_options");
2131 	spdk_json_write_named_object_begin(w, "params");
2132 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
2133 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
2134 	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
2135 	spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size);
2136 	spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size);
2137 	spdk_json_write_object_end(w);
2138 	spdk_json_write_object_end(w);
2139 
2140 	bdev_examine_allowlist_config_json(w);
2141 
2142 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2143 		if (bdev_module->config_json) {
2144 			bdev_module->config_json(w);
2145 		}
2146 	}
2147 
2148 	spdk_spin_lock(&g_bdev_mgr.spinlock);
2149 
2150 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
2151 		if (bdev->fn_table->write_config_json) {
2152 			bdev->fn_table->write_config_json(bdev, w);
2153 		}
2154 
2155 		bdev_qos_config_json(bdev, w);
2156 		bdev_enable_histogram_config_json(bdev, w);
2157 	}
2158 
2159 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
2160 
2161 	/* This has to be last RPC in array to make sure all bdevs finished examine */
2162 	spdk_json_write_object_begin(w);
2163 	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
2164 	spdk_json_write_object_end(w);
2165 
2166 	spdk_json_write_array_end(w);
2167 }
2168 
2169 static void
2170 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
2171 {
2172 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2173 	struct spdk_bdev_io *bdev_io;
2174 
2175 	spdk_iobuf_channel_fini(&ch->iobuf);
2176 
2177 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
2178 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2179 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2180 		ch->per_thread_cache_count--;
2181 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2182 	}
2183 
2184 	assert(ch->per_thread_cache_count == 0);
2185 }
2186 
2187 static int
2188 bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
2189 {
2190 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
2191 	struct spdk_bdev_io *bdev_io;
2192 	uint32_t i;
2193 	int rc;
2194 
2195 	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev",
2196 				     g_bdev_opts.iobuf_small_cache_size,
2197 				     g_bdev_opts.iobuf_large_cache_size);
2198 	if (rc != 0) {
2199 		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
2200 		return -1;
2201 	}
2202 
2203 	STAILQ_INIT(&ch->per_thread_cache);
2204 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
2205 
2206 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
2207 	ch->per_thread_cache_count = 0;
2208 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
2209 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2210 		if (bdev_io == NULL) {
2211 			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
2212 			assert(false);
2213 			bdev_mgmt_channel_destroy(io_device, ctx_buf);
2214 			return -1;
2215 		}
2216 		ch->per_thread_cache_count++;
2217 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2218 	}
2219 
2220 	TAILQ_INIT(&ch->shared_resources);
2221 	TAILQ_INIT(&ch->io_wait_queue);
2222 
2223 	return 0;
2224 }
2225 
2226 static void
2227 bdev_init_complete(int rc)
2228 {
2229 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
2230 	void *cb_arg = g_init_cb_arg;
2231 	struct spdk_bdev_module *m;
2232 
2233 	g_bdev_mgr.init_complete = true;
2234 	g_init_cb_fn = NULL;
2235 	g_init_cb_arg = NULL;
2236 
2237 	/*
2238 	 * For modules that need to know when subsystem init is complete,
2239 	 * inform them now.
2240 	 */
2241 	if (rc == 0) {
2242 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2243 			if (m->init_complete) {
2244 				m->init_complete();
2245 			}
2246 		}
2247 	}
2248 
2249 	cb_fn(cb_arg, rc);
2250 }
2251 
2252 static bool
2253 bdev_module_all_actions_completed(void)
2254 {
2255 	struct spdk_bdev_module *m;
2256 
2257 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
2258 		if (m->internal.action_in_progress > 0) {
2259 			return false;
2260 		}
2261 	}
2262 	return true;
2263 }
2264 
2265 static void
2266 bdev_module_action_complete(void)
2267 {
2268 	/*
2269 	 * Don't finish bdev subsystem initialization if
2270 	 * module pre-initialization is still in progress, or
2271 	 * the subsystem been already initialized.
2272 	 */
2273 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
2274 		return;
2275 	}
2276 
2277 	/*
2278 	 * Check all bdev modules for inits/examinations in progress. If any
2279 	 * exist, return immediately since we cannot finish bdev subsystem
2280 	 * initialization until all are completed.
2281 	 */
2282 	if (!bdev_module_all_actions_completed()) {
2283 		return;
2284 	}
2285 
2286 	/*
2287 	 * Modules already finished initialization - now that all
2288 	 * the bdev modules have finished their asynchronous I/O
2289 	 * processing, the entire bdev layer can be marked as complete.
2290 	 */
2291 	bdev_init_complete(0);
2292 }
2293 
2294 static void
2295 bdev_module_action_done(struct spdk_bdev_module *module)
2296 {
2297 	spdk_spin_lock(&module->internal.spinlock);
2298 	assert(module->internal.action_in_progress > 0);
2299 	module->internal.action_in_progress--;
2300 	spdk_spin_unlock(&module->internal.spinlock);
2301 	bdev_module_action_complete();
2302 }
2303 
2304 void
2305 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
2306 {
2307 	assert(module->async_init);
2308 	bdev_module_action_done(module);
2309 }
2310 
2311 void
2312 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
2313 {
2314 	bdev_module_action_done(module);
2315 }
2316 
2317 /** The last initialized bdev module */
2318 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
2319 
2320 static void
2321 bdev_init_failed(void *cb_arg)
2322 {
2323 	struct spdk_bdev_module *module = cb_arg;
2324 
2325 	spdk_spin_lock(&module->internal.spinlock);
2326 	assert(module->internal.action_in_progress > 0);
2327 	module->internal.action_in_progress--;
2328 	spdk_spin_unlock(&module->internal.spinlock);
2329 	bdev_init_complete(-1);
2330 }
2331 
2332 static int
2333 bdev_modules_init(void)
2334 {
2335 	struct spdk_bdev_module *module;
2336 	int rc = 0;
2337 
2338 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2339 		g_resume_bdev_module = module;
2340 		if (module->async_init) {
2341 			spdk_spin_lock(&module->internal.spinlock);
2342 			module->internal.action_in_progress = 1;
2343 			spdk_spin_unlock(&module->internal.spinlock);
2344 		}
2345 		rc = module->module_init();
2346 		if (rc != 0) {
2347 			/* Bump action_in_progress to prevent other modules from completion of modules_init
2348 			 * Send message to defer application shutdown until resources are cleaned up */
2349 			spdk_spin_lock(&module->internal.spinlock);
2350 			module->internal.action_in_progress = 1;
2351 			spdk_spin_unlock(&module->internal.spinlock);
2352 			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
2353 			return rc;
2354 		}
2355 	}
2356 
2357 	g_resume_bdev_module = NULL;
2358 	return 0;
2359 }
2360 
2361 void
2362 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
2363 {
2364 	int rc = 0;
2365 	char mempool_name[32];
2366 
2367 	assert(cb_fn != NULL);
2368 
2369 	g_init_cb_fn = cb_fn;
2370 	g_init_cb_arg = cb_arg;
2371 
2372 	spdk_notify_type_register("bdev_register");
2373 	spdk_notify_type_register("bdev_unregister");
2374 
2375 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
2376 
2377 	rc = spdk_iobuf_register_module("bdev");
2378 	if (rc != 0) {
2379 		SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
2380 		bdev_init_complete(-1);
2381 		return;
2382 	}
2383 
2384 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
2385 				  g_bdev_opts.bdev_io_pool_size,
2386 				  sizeof(struct spdk_bdev_io) +
2387 				  bdev_module_get_max_ctx_size(),
2388 				  0,
2389 				  SPDK_ENV_NUMA_ID_ANY);
2390 
2391 	if (g_bdev_mgr.bdev_io_pool == NULL) {
2392 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
2393 		bdev_init_complete(-1);
2394 		return;
2395 	}
2396 
2397 	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
2398 					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
2399 	if (!g_bdev_mgr.zero_buffer) {
2400 		SPDK_ERRLOG("create bdev zero buffer failed\n");
2401 		bdev_init_complete(-1);
2402 		return;
2403 	}
2404 
2405 #ifdef SPDK_CONFIG_VTUNE
2406 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
2407 #endif
2408 
2409 	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
2410 				bdev_mgmt_channel_destroy,
2411 				sizeof(struct spdk_bdev_mgmt_channel),
2412 				"bdev_mgr");
2413 
2414 	rc = bdev_modules_init();
2415 	g_bdev_mgr.module_init_complete = true;
2416 	if (rc != 0) {
2417 		SPDK_ERRLOG("bdev modules init failed\n");
2418 		return;
2419 	}
2420 
2421 	bdev_module_action_complete();
2422 }
2423 
2424 static void
2425 bdev_mgr_unregister_cb(void *io_device)
2426 {
2427 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
2428 
2429 	if (g_bdev_mgr.bdev_io_pool) {
2430 		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
2431 			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
2432 				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
2433 				    g_bdev_opts.bdev_io_pool_size);
2434 		}
2435 
2436 		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
2437 	}
2438 
2439 	spdk_free(g_bdev_mgr.zero_buffer);
2440 
2441 	bdev_examine_allowlist_free();
2442 
2443 	cb_fn(g_fini_cb_arg);
2444 	g_fini_cb_fn = NULL;
2445 	g_fini_cb_arg = NULL;
2446 	g_bdev_mgr.init_complete = false;
2447 	g_bdev_mgr.module_init_complete = false;
2448 }
2449 
2450 static void
2451 bdev_module_fini_iter(void *arg)
2452 {
2453 	struct spdk_bdev_module *bdev_module;
2454 
2455 	/* FIXME: Handling initialization failures is broken now,
2456 	 * so we won't even try cleaning up after successfully
2457 	 * initialized modules. if module_init_complete is false,
2458 	 * just call spdk_bdev_mgr_unregister_cb
2459 	 */
2460 	if (!g_bdev_mgr.module_init_complete) {
2461 		bdev_mgr_unregister_cb(NULL);
2462 		return;
2463 	}
2464 
2465 	/* Start iterating from the last touched module */
2466 	if (!g_resume_bdev_module) {
2467 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2468 	} else {
2469 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
2470 					 internal.tailq);
2471 	}
2472 
2473 	while (bdev_module) {
2474 		if (bdev_module->async_fini) {
2475 			/* Save our place so we can resume later. We must
2476 			 * save the variable here, before calling module_fini()
2477 			 * below, because in some cases the module may immediately
2478 			 * call spdk_bdev_module_fini_done() and re-enter
2479 			 * this function to continue iterating. */
2480 			g_resume_bdev_module = bdev_module;
2481 		}
2482 
2483 		if (bdev_module->module_fini) {
2484 			bdev_module->module_fini();
2485 		}
2486 
2487 		if (bdev_module->async_fini) {
2488 			return;
2489 		}
2490 
2491 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
2492 					 internal.tailq);
2493 	}
2494 
2495 	g_resume_bdev_module = NULL;
2496 	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
2497 }
2498 
2499 void
2500 spdk_bdev_module_fini_done(void)
2501 {
2502 	if (spdk_get_thread() != g_fini_thread) {
2503 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
2504 	} else {
2505 		bdev_module_fini_iter(NULL);
2506 	}
2507 }
2508 
2509 static void
2510 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
2511 {
2512 	struct spdk_bdev *bdev = cb_arg;
2513 
2514 	if (bdeverrno && bdev) {
2515 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
2516 			     bdev->name);
2517 
2518 		/*
2519 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
2520 		 *  bdev; try to continue by manually removing this bdev from the list and continue
2521 		 *  with the next bdev in the list.
2522 		 */
2523 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2524 	}
2525 
2526 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2527 		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2528 		/*
2529 		 * Bdev module finish need to be deferred as we might be in the middle of some context
2530 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2531 		 * after returning.
2532 		 */
2533 		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2534 		return;
2535 	}
2536 
2537 	/*
2538 	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
2539 	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2540 	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
2541 	 * base bdevs.
2542 	 *
2543 	 * Also, walk the list in the reverse order.
2544 	 */
2545 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2546 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2547 		spdk_spin_lock(&bdev->internal.spinlock);
2548 		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2549 			LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2550 			spdk_spin_unlock(&bdev->internal.spinlock);
2551 			continue;
2552 		}
2553 		spdk_spin_unlock(&bdev->internal.spinlock);
2554 
2555 		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2556 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2557 		return;
2558 	}
2559 
2560 	/*
2561 	 * If any bdev fails to unclaim underlying bdev properly, we may face the
2562 	 * case of bdev list consisting of claimed bdevs only (if claims are managed
2563 	 * correctly, this would mean there's a loop in the claims graph which is
2564 	 * clearly impossible). Warn and unregister last bdev on the list then.
2565 	 */
2566 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2567 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2568 		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2569 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2570 		return;
2571 	}
2572 }
2573 
2574 static void
2575 bdev_module_fini_start_iter(void *arg)
2576 {
2577 	struct spdk_bdev_module *bdev_module;
2578 
2579 	if (!g_resume_bdev_module) {
2580 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2581 	} else {
2582 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2583 	}
2584 
2585 	while (bdev_module) {
2586 		if (bdev_module->async_fini_start) {
2587 			/* Save our place so we can resume later. We must
2588 			 * save the variable here, before calling fini_start()
2589 			 * below, because in some cases the module may immediately
2590 			 * call spdk_bdev_module_fini_start_done() and re-enter
2591 			 * this function to continue iterating. */
2592 			g_resume_bdev_module = bdev_module;
2593 		}
2594 
2595 		if (bdev_module->fini_start) {
2596 			bdev_module->fini_start();
2597 		}
2598 
2599 		if (bdev_module->async_fini_start) {
2600 			return;
2601 		}
2602 
2603 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
2604 	}
2605 
2606 	g_resume_bdev_module = NULL;
2607 
2608 	bdev_finish_unregister_bdevs_iter(NULL, 0);
2609 }
2610 
2611 void
2612 spdk_bdev_module_fini_start_done(void)
2613 {
2614 	if (spdk_get_thread() != g_fini_thread) {
2615 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
2616 	} else {
2617 		bdev_module_fini_start_iter(NULL);
2618 	}
2619 }
2620 
2621 static void
2622 bdev_finish_wait_for_examine_done(void *cb_arg)
2623 {
2624 	bdev_module_fini_start_iter(NULL);
2625 }
2626 
2627 static void bdev_open_async_fini(void);
2628 
2629 void
2630 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
2631 {
2632 	int rc;
2633 
2634 	assert(cb_fn != NULL);
2635 
2636 	g_fini_thread = spdk_get_thread();
2637 
2638 	g_fini_cb_fn = cb_fn;
2639 	g_fini_cb_arg = cb_arg;
2640 
2641 	bdev_open_async_fini();
2642 
2643 	rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
2644 	if (rc != 0) {
2645 		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
2646 		bdev_finish_wait_for_examine_done(NULL);
2647 	}
2648 }
2649 
2650 struct spdk_bdev_io *
2651 bdev_channel_get_io(struct spdk_bdev_channel *channel)
2652 {
2653 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
2654 	struct spdk_bdev_io *bdev_io;
2655 
2656 	if (ch->per_thread_cache_count > 0) {
2657 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2658 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2659 		ch->per_thread_cache_count--;
2660 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
2661 		/*
2662 		 * Don't try to look for bdev_ios in the global pool if there are
2663 		 * waiters on bdev_ios - we don't want this caller to jump the line.
2664 		 */
2665 		bdev_io = NULL;
2666 	} else {
2667 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2668 	}
2669 
2670 	return bdev_io;
2671 }
2672 
2673 void
2674 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2675 {
2676 	struct spdk_bdev_mgmt_channel *ch;
2677 
2678 	assert(bdev_io != NULL);
2679 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
2680 
2681 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
2682 
2683 	if (bdev_io->internal.f.has_buf) {
2684 		bdev_io_put_buf(bdev_io);
2685 	}
2686 
2687 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
2688 		ch->per_thread_cache_count++;
2689 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2690 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
2691 			struct spdk_bdev_io_wait_entry *entry;
2692 
2693 			entry = TAILQ_FIRST(&ch->io_wait_queue);
2694 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
2695 			entry->cb_fn(entry->cb_arg);
2696 		}
2697 	} else {
2698 		/* We should never have a full cache with entries on the io wait queue. */
2699 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
2700 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2701 	}
2702 }
2703 
2704 static bool
2705 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2706 {
2707 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2708 
2709 	switch (limit) {
2710 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2711 		return true;
2712 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2713 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2714 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2715 		return false;
2716 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2717 	default:
2718 		return false;
2719 	}
2720 }
2721 
2722 static bool
2723 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2724 {
2725 	switch (bdev_io->type) {
2726 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2727 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2728 	case SPDK_BDEV_IO_TYPE_READ:
2729 	case SPDK_BDEV_IO_TYPE_WRITE:
2730 		return true;
2731 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2732 		if (bdev_io->u.bdev.zcopy.start) {
2733 			return true;
2734 		} else {
2735 			return false;
2736 		}
2737 	default:
2738 		return false;
2739 	}
2740 }
2741 
2742 static bool
2743 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2744 {
2745 	switch (bdev_io->type) {
2746 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2747 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2748 		/* Bit 1 (0x2) set for read operation */
2749 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2750 			return true;
2751 		} else {
2752 			return false;
2753 		}
2754 	case SPDK_BDEV_IO_TYPE_READ:
2755 		return true;
2756 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2757 		/* Populate to read from disk */
2758 		if (bdev_io->u.bdev.zcopy.populate) {
2759 			return true;
2760 		} else {
2761 			return false;
2762 		}
2763 	default:
2764 		return false;
2765 	}
2766 }
2767 
2768 static uint64_t
2769 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2770 {
2771 	uint32_t blocklen = bdev_io_get_block_size(bdev_io);
2772 
2773 	switch (bdev_io->type) {
2774 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2775 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2776 		return bdev_io->u.nvme_passthru.nbytes;
2777 	case SPDK_BDEV_IO_TYPE_READ:
2778 	case SPDK_BDEV_IO_TYPE_WRITE:
2779 		return bdev_io->u.bdev.num_blocks * blocklen;
2780 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2781 		/* Track the data in the start phase only */
2782 		if (bdev_io->u.bdev.zcopy.start) {
2783 			return bdev_io->u.bdev.num_blocks * blocklen;
2784 		} else {
2785 			return 0;
2786 		}
2787 	default:
2788 		return 0;
2789 	}
2790 }
2791 
2792 static inline bool
2793 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2794 {
2795 	int64_t remaining_this_timeslice;
2796 
2797 	if (!limit->max_per_timeslice) {
2798 		/* The QoS is disabled */
2799 		return false;
2800 	}
2801 
2802 	remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2803 				   __ATOMIC_RELAXED);
2804 	if (remaining_this_timeslice + (int64_t)delta > 0) {
2805 		/* There was still a quota for this delta -> the IO shouldn't be queued
2806 		 *
2807 		 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
2808 		 * quota can be allowed once a while. Such overrun then taken into account in
2809 		 * the QoS poller, where the next timeslice quota is calculated.
2810 		 */
2811 		return false;
2812 	}
2813 
2814 	/* There was no quota for this delta -> the IO should be queued
2815 	 * The remaining_this_timeslice must be rewinded so it reflects the real
2816 	 * amount of IOs or bytes allowed.
2817 	 */
2818 	__atomic_add_fetch(
2819 		&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2820 	return true;
2821 }
2822 
2823 static inline void
2824 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2825 {
2826 	__atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED);
2827 }
2828 
2829 static bool
2830 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2831 {
2832 	return bdev_qos_rw_queue_io(limit, io, 1);
2833 }
2834 
2835 static void
2836 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2837 {
2838 	bdev_qos_rw_rewind_io(limit, io, 1);
2839 }
2840 
2841 static bool
2842 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2843 {
2844 	return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io));
2845 }
2846 
2847 static void
2848 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2849 {
2850 	bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2851 }
2852 
2853 static bool
2854 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2855 {
2856 	if (bdev_is_read_io(io) == false) {
2857 		return false;
2858 	}
2859 
2860 	return bdev_qos_rw_bps_queue(limit, io);
2861 }
2862 
2863 static void
2864 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2865 {
2866 	if (bdev_is_read_io(io) != false) {
2867 		bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2868 	}
2869 }
2870 
2871 static bool
2872 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2873 {
2874 	if (bdev_is_read_io(io) == true) {
2875 		return false;
2876 	}
2877 
2878 	return bdev_qos_rw_bps_queue(limit, io);
2879 }
2880 
2881 static void
2882 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2883 {
2884 	if (bdev_is_read_io(io) != true) {
2885 		bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io));
2886 	}
2887 }
2888 
2889 static void
2890 bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2891 {
2892 	int i;
2893 
2894 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2895 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2896 			qos->rate_limits[i].queue_io = NULL;
2897 			continue;
2898 		}
2899 
2900 		switch (i) {
2901 		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2902 			qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue;
2903 			qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota;
2904 			break;
2905 		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2906 			qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue;
2907 			qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota;
2908 			break;
2909 		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2910 			qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue;
2911 			qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota;
2912 			break;
2913 		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2914 			qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue;
2915 			qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota;
2916 			break;
2917 		default:
2918 			break;
2919 		}
2920 	}
2921 }
2922 
2923 static void
2924 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2925 			    struct spdk_bdev_io *bdev_io,
2926 			    enum spdk_bdev_io_status status)
2927 {
2928 	bdev_io->internal.f.in_submit_request = true;
2929 	bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource);
2930 	spdk_bdev_io_complete(bdev_io, status);
2931 	bdev_io->internal.f.in_submit_request = false;
2932 }
2933 
2934 static inline void
2935 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2936 {
2937 	struct spdk_bdev *bdev = bdev_io->bdev;
2938 	struct spdk_io_channel *ch = bdev_ch->channel;
2939 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2940 
2941 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2942 		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2943 		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2944 
2945 		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2946 		    bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2947 			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
2948 						    SPDK_BDEV_IO_STATUS_SUCCESS);
2949 			return;
2950 		}
2951 	}
2952 
2953 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2954 			  bdev_io->bdev->split_on_write_unit &&
2955 			  bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2956 		SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n",
2957 			    bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2958 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2959 		return;
2960 	}
2961 
2962 	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2963 		bdev_io_increment_outstanding(bdev_ch, shared_resource);
2964 		bdev_io->internal.f.in_submit_request = true;
2965 		bdev_submit_request(bdev, ch, bdev_io);
2966 		bdev_io->internal.f.in_submit_request = false;
2967 	} else {
2968 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
2969 		if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) {
2970 			/* Special case when we have nomem IOs and no outstanding IOs which completions
2971 			 * could trigger retry of queued IOs */
2972 			bdev_shared_ch_retry_io(shared_resource);
2973 		}
2974 	}
2975 }
2976 
2977 static bool
2978 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2979 {
2980 	int i;
2981 
2982 	if (bdev_qos_io_to_limit(bdev_io) == true) {
2983 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2984 			if (!qos->rate_limits[i].queue_io) {
2985 				continue;
2986 			}
2987 
2988 			if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2989 							 bdev_io) == true) {
2990 				for (i -= 1; i >= 0 ; i--) {
2991 					if (!qos->rate_limits[i].queue_io) {
2992 						continue;
2993 					}
2994 
2995 					qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io);
2996 				}
2997 				return true;
2998 			}
2999 		}
3000 	}
3001 
3002 	return false;
3003 }
3004 
3005 static int
3006 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
3007 {
3008 	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
3009 	int				submitted_ios = 0;
3010 
3011 	TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) {
3012 		if (!bdev_qos_queue_io(qos, bdev_io)) {
3013 			TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link);
3014 			bdev_io_do_submit(ch, bdev_io);
3015 
3016 			submitted_ios++;
3017 		}
3018 	}
3019 
3020 	return submitted_ios;
3021 }
3022 
3023 static void
3024 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
3025 {
3026 	int rc;
3027 
3028 	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
3029 	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
3030 	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
3031 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
3032 				     &bdev_io->internal.waitq_entry);
3033 	if (rc != 0) {
3034 		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
3035 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3036 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3037 	}
3038 }
3039 
3040 static bool
3041 bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
3042 {
3043 	uint32_t io_boundary;
3044 	struct spdk_bdev *bdev = bdev_io->bdev;
3045 	uint32_t max_segment_size = bdev->max_segment_size;
3046 	uint32_t max_size = bdev->max_rw_size;
3047 	int max_segs = bdev->max_num_segments;
3048 
3049 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3050 		io_boundary = bdev->write_unit_size;
3051 	} else if (bdev->split_on_optimal_io_boundary) {
3052 		io_boundary = bdev->optimal_io_boundary;
3053 	} else {
3054 		io_boundary = 0;
3055 	}
3056 
3057 	if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) {
3058 		return false;
3059 	}
3060 
3061 	if (io_boundary) {
3062 		uint64_t start_stripe, end_stripe;
3063 
3064 		start_stripe = bdev_io->u.bdev.offset_blocks;
3065 		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
3066 		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
3067 		if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
3068 			start_stripe >>= spdk_u32log2(io_boundary);
3069 			end_stripe >>= spdk_u32log2(io_boundary);
3070 		} else {
3071 			start_stripe /= io_boundary;
3072 			end_stripe /= io_boundary;
3073 		}
3074 
3075 		if (start_stripe != end_stripe) {
3076 			return true;
3077 		}
3078 	}
3079 
3080 	if (max_segs) {
3081 		if (bdev_io->u.bdev.iovcnt > max_segs) {
3082 			return true;
3083 		}
3084 	}
3085 
3086 	if (max_segment_size) {
3087 		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
3088 			if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) {
3089 				return true;
3090 			}
3091 		}
3092 	}
3093 
3094 	if (max_size) {
3095 		if (bdev_io->u.bdev.num_blocks > max_size) {
3096 			return true;
3097 		}
3098 	}
3099 
3100 	return false;
3101 }
3102 
3103 static bool
3104 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
3105 {
3106 	uint32_t num_unmap_segments;
3107 
3108 	if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
3109 		return false;
3110 	}
3111 	num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
3112 	if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
3113 		return true;
3114 	}
3115 
3116 	return false;
3117 }
3118 
3119 static bool
3120 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
3121 {
3122 	if (!bdev_io->bdev->max_write_zeroes) {
3123 		return false;
3124 	}
3125 
3126 	if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
3127 		return true;
3128 	}
3129 
3130 	return false;
3131 }
3132 
3133 static bool
3134 bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
3135 {
3136 	if (bdev_io->bdev->max_copy != 0 &&
3137 	    bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
3138 		return true;
3139 	}
3140 
3141 	return false;
3142 }
3143 
3144 static bool
3145 bdev_io_should_split(struct spdk_bdev_io *bdev_io)
3146 {
3147 	switch (bdev_io->type) {
3148 	case SPDK_BDEV_IO_TYPE_READ:
3149 	case SPDK_BDEV_IO_TYPE_WRITE:
3150 		return bdev_rw_should_split(bdev_io);
3151 	case SPDK_BDEV_IO_TYPE_UNMAP:
3152 		return bdev_unmap_should_split(bdev_io);
3153 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3154 		return bdev_write_zeroes_should_split(bdev_io);
3155 	case SPDK_BDEV_IO_TYPE_COPY:
3156 		return bdev_copy_should_split(bdev_io);
3157 	default:
3158 		return false;
3159 	}
3160 }
3161 
3162 static uint32_t
3163 _to_next_boundary(uint64_t offset, uint32_t boundary)
3164 {
3165 	return (boundary - (offset % boundary));
3166 }
3167 
3168 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
3169 
3170 static void _bdev_rw_split(void *_bdev_io);
3171 
3172 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
3173 
3174 static void
3175 _bdev_unmap_split(void *_bdev_io)
3176 {
3177 	return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
3178 }
3179 
3180 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
3181 
3182 static void
3183 _bdev_write_zeroes_split(void *_bdev_io)
3184 {
3185 	return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
3186 }
3187 
3188 static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
3189 
3190 static void
3191 _bdev_copy_split(void *_bdev_io)
3192 {
3193 	return bdev_copy_split((struct spdk_bdev_io *)_bdev_io);
3194 }
3195 
3196 static int
3197 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
3198 		     uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
3199 {
3200 	int rc;
3201 	uint64_t current_offset, current_remaining, current_src_offset;
3202 	spdk_bdev_io_wait_cb io_wait_fn;
3203 
3204 	current_offset = *offset;
3205 	current_remaining = *remaining;
3206 
3207 	assert(bdev_io->internal.f.split);
3208 
3209 	bdev_io->internal.split.outstanding++;
3210 
3211 	io_wait_fn = _bdev_rw_split;
3212 	switch (bdev_io->type) {
3213 	case SPDK_BDEV_IO_TYPE_READ:
3214 		assert(bdev_io->u.bdev.accel_sequence == NULL);
3215 		rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
3216 					       spdk_io_channel_from_ctx(bdev_io->internal.ch),
3217 					       iov, iovcnt, md_buf, current_offset,
3218 					       num_blocks,
3219 					       bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3220 					       bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3221 					       NULL,
3222 					       bdev_io->u.bdev.dif_check_flags,
3223 					       bdev_io_split_done, bdev_io);
3224 		break;
3225 	case SPDK_BDEV_IO_TYPE_WRITE:
3226 		assert(bdev_io->u.bdev.accel_sequence == NULL);
3227 		rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
3228 						spdk_io_channel_from_ctx(bdev_io->internal.ch),
3229 						iov, iovcnt, md_buf, current_offset,
3230 						num_blocks,
3231 						bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
3232 						bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
3233 						NULL,
3234 						bdev_io->u.bdev.dif_check_flags,
3235 						bdev_io->u.bdev.nvme_cdw12.raw,
3236 						bdev_io->u.bdev.nvme_cdw13.raw,
3237 						bdev_io_split_done, bdev_io);
3238 		break;
3239 	case SPDK_BDEV_IO_TYPE_UNMAP:
3240 		io_wait_fn = _bdev_unmap_split;
3241 		rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
3242 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
3243 					    current_offset, num_blocks,
3244 					    bdev_io_split_done, bdev_io);
3245 		break;
3246 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3247 		io_wait_fn = _bdev_write_zeroes_split;
3248 		rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
3249 						   spdk_io_channel_from_ctx(bdev_io->internal.ch),
3250 						   current_offset, num_blocks,
3251 						   bdev_io_split_done, bdev_io);
3252 		break;
3253 	case SPDK_BDEV_IO_TYPE_COPY:
3254 		io_wait_fn = _bdev_copy_split;
3255 		current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
3256 				     (current_offset - bdev_io->u.bdev.offset_blocks);
3257 		rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
3258 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
3259 					   current_offset, current_src_offset, num_blocks,
3260 					   bdev_io_split_done, bdev_io);
3261 		break;
3262 	default:
3263 		assert(false);
3264 		rc = -EINVAL;
3265 		break;
3266 	}
3267 
3268 	if (rc == 0) {
3269 		current_offset += num_blocks;
3270 		current_remaining -= num_blocks;
3271 		bdev_io->internal.split.current_offset_blocks = current_offset;
3272 		bdev_io->internal.split.remaining_num_blocks = current_remaining;
3273 		*offset = current_offset;
3274 		*remaining = current_remaining;
3275 	} else {
3276 		bdev_io->internal.split.outstanding--;
3277 		if (rc == -ENOMEM) {
3278 			if (bdev_io->internal.split.outstanding == 0) {
3279 				/* No I/O is outstanding. Hence we should wait here. */
3280 				bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
3281 			}
3282 		} else {
3283 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3284 			if (bdev_io->internal.split.outstanding == 0) {
3285 				bdev_ch_remove_from_io_submitted(bdev_io);
3286 				spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3287 						  0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3288 						  bdev_io->internal.ch->queue_depth);
3289 				bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3290 			}
3291 		}
3292 	}
3293 
3294 	return rc;
3295 }
3296 
3297 static void
3298 _bdev_rw_split(void *_bdev_io)
3299 {
3300 	struct iovec *parent_iov, *iov;
3301 	struct spdk_bdev_io *bdev_io = _bdev_io;
3302 	struct spdk_bdev *bdev = bdev_io->bdev;
3303 	uint64_t parent_offset, current_offset, remaining;
3304 	uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
3305 	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
3306 	uint32_t iovcnt, iov_len, child_iovsize;
3307 	uint32_t blocklen;
3308 	uint32_t io_boundary;
3309 	uint32_t max_segment_size = bdev->max_segment_size;
3310 	uint32_t max_child_iovcnt = bdev->max_num_segments;
3311 	uint32_t max_size = bdev->max_rw_size;
3312 	void *md_buf = NULL;
3313 	int rc;
3314 
3315 	blocklen = bdev_io_get_block_size(bdev_io);
3316 
3317 	max_size = max_size ? max_size : UINT32_MAX;
3318 	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
3319 	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
3320 			   SPDK_BDEV_IO_NUM_CHILD_IOV;
3321 
3322 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3323 		io_boundary = bdev->write_unit_size;
3324 	} else if (bdev->split_on_optimal_io_boundary) {
3325 		io_boundary = bdev->optimal_io_boundary;
3326 	} else {
3327 		io_boundary = UINT32_MAX;
3328 	}
3329 
3330 	assert(bdev_io->internal.f.split);
3331 
3332 	remaining = bdev_io->internal.split.remaining_num_blocks;
3333 	current_offset = bdev_io->internal.split.current_offset_blocks;
3334 	parent_offset = bdev_io->u.bdev.offset_blocks;
3335 	parent_iov_offset = (current_offset - parent_offset) * blocklen;
3336 	parent_iovcnt = bdev_io->u.bdev.iovcnt;
3337 
3338 	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
3339 		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3340 		if (parent_iov_offset < parent_iov->iov_len) {
3341 			break;
3342 		}
3343 		parent_iov_offset -= parent_iov->iov_len;
3344 	}
3345 
3346 	child_iovcnt = 0;
3347 	while (remaining > 0 && parent_iovpos < parent_iovcnt &&
3348 	       child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
3349 		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
3350 		to_next_boundary = spdk_min(remaining, to_next_boundary);
3351 		to_next_boundary = spdk_min(max_size, to_next_boundary);
3352 		to_next_boundary_bytes = to_next_boundary * blocklen;
3353 
3354 		iov = &bdev_io->child_iov[child_iovcnt];
3355 		iovcnt = 0;
3356 
3357 		if (bdev_io->u.bdev.md_buf) {
3358 			md_buf = (char *)bdev_io->u.bdev.md_buf +
3359 				 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
3360 		}
3361 
3362 		child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
3363 		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
3364 		       iovcnt < child_iovsize) {
3365 			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3366 			iov_len = parent_iov->iov_len - parent_iov_offset;
3367 
3368 			iov_len = spdk_min(iov_len, max_segment_size);
3369 			iov_len = spdk_min(iov_len, to_next_boundary_bytes);
3370 			to_next_boundary_bytes -= iov_len;
3371 
3372 			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3373 			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3374 
3375 			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3376 				parent_iov_offset += iov_len;
3377 			} else {
3378 				parent_iovpos++;
3379 				parent_iov_offset = 0;
3380 			}
3381 			child_iovcnt++;
3382 			iovcnt++;
3383 		}
3384 
3385 		if (to_next_boundary_bytes > 0) {
3386 			/* We had to stop this child I/O early because we ran out of
3387 			 * child_iov space or were limited by max_num_segments.
3388 			 * Ensure the iovs to be aligned with block size and
3389 			 * then adjust to_next_boundary before starting the
3390 			 * child I/O.
3391 			 */
3392 			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3393 			       iovcnt == child_iovsize);
3394 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
3395 			if (to_last_block_bytes != 0) {
3396 				uint32_t child_iovpos = child_iovcnt - 1;
3397 				/* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV
3398 				 * so the loop will naturally end
3399 				 */
3400 
3401 				to_last_block_bytes = blocklen - to_last_block_bytes;
3402 				to_next_boundary_bytes += to_last_block_bytes;
3403 				while (to_last_block_bytes > 0 && iovcnt > 0) {
3404 					iov_len = spdk_min(to_last_block_bytes,
3405 							   bdev_io->child_iov[child_iovpos].iov_len);
3406 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3407 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3408 						child_iovpos--;
3409 						if (--iovcnt == 0) {
3410 							/* If the child IO is less than a block size just return.
3411 							 * If the first child IO of any split round is less than
3412 							 * a block size, an error exit.
3413 							 */
3414 							if (bdev_io->internal.split.outstanding == 0) {
3415 								SPDK_ERRLOG("The first child io was less than a block size\n");
3416 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3417 								bdev_ch_remove_from_io_submitted(bdev_io);
3418 								spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3419 										  0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3420 										  bdev_io->internal.ch->queue_depth);
3421 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3422 							}
3423 
3424 							return;
3425 						}
3426 					}
3427 
3428 					to_last_block_bytes -= iov_len;
3429 
3430 					if (parent_iov_offset == 0) {
3431 						parent_iovpos--;
3432 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3433 					}
3434 					parent_iov_offset -= iov_len;
3435 				}
3436 
3437 				assert(to_last_block_bytes == 0);
3438 			}
3439 			to_next_boundary -= to_next_boundary_bytes / blocklen;
3440 		}
3441 
3442 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3443 					  &current_offset, &remaining);
3444 		if (spdk_unlikely(rc)) {
3445 			return;
3446 		}
3447 	}
3448 }
3449 
3450 static void
3451 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3452 {
3453 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3454 	uint32_t num_children_reqs = 0;
3455 	int rc;
3456 
3457 	assert(bdev_io->internal.f.split);
3458 
3459 	offset = bdev_io->internal.split.current_offset_blocks;
3460 	remaining = bdev_io->internal.split.remaining_num_blocks;
3461 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3462 
3463 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3464 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3465 
3466 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3467 					  &offset, &remaining);
3468 		if (spdk_likely(rc == 0)) {
3469 			num_children_reqs++;
3470 		} else {
3471 			return;
3472 		}
3473 	}
3474 }
3475 
3476 static void
3477 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3478 {
3479 	uint64_t offset, write_zeroes_blocks, remaining;
3480 	uint32_t num_children_reqs = 0;
3481 	int rc;
3482 
3483 	assert(bdev_io->internal.f.split);
3484 
3485 	offset = bdev_io->internal.split.current_offset_blocks;
3486 	remaining = bdev_io->internal.split.remaining_num_blocks;
3487 
3488 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3489 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3490 
3491 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3492 					  &offset, &remaining);
3493 		if (spdk_likely(rc == 0)) {
3494 			num_children_reqs++;
3495 		} else {
3496 			return;
3497 		}
3498 	}
3499 }
3500 
3501 static void
3502 bdev_copy_split(struct spdk_bdev_io *bdev_io)
3503 {
3504 	uint64_t offset, copy_blocks, remaining;
3505 	uint32_t num_children_reqs = 0;
3506 	int rc;
3507 
3508 	assert(bdev_io->internal.f.split);
3509 
3510 	offset = bdev_io->internal.split.current_offset_blocks;
3511 	remaining = bdev_io->internal.split.remaining_num_blocks;
3512 
3513 	assert(bdev_io->bdev->max_copy != 0);
3514 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
3515 		copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
3516 
3517 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
3518 					  &offset, &remaining);
3519 		if (spdk_likely(rc == 0)) {
3520 			num_children_reqs++;
3521 		} else {
3522 			return;
3523 		}
3524 	}
3525 }
3526 
3527 static void
3528 parent_bdev_io_complete(void *ctx, int rc)
3529 {
3530 	struct spdk_bdev_io *parent_io = ctx;
3531 
3532 	if (rc) {
3533 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3534 	}
3535 
3536 	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
3537 			       parent_io->internal.caller_ctx);
3538 }
3539 
3540 static void
3541 bdev_io_complete_parent_sequence_cb(void *ctx, int status)
3542 {
3543 	struct spdk_bdev_io *bdev_io = ctx;
3544 
3545 	/* u.bdev.accel_sequence should have already been cleared at this point */
3546 	assert(bdev_io->u.bdev.accel_sequence == NULL);
3547 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
3548 	bdev_io->internal.f.has_accel_sequence = false;
3549 
3550 	if (spdk_unlikely(status != 0)) {
3551 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
3552 	}
3553 
3554 	parent_bdev_io_complete(bdev_io, status);
3555 }
3556 
3557 static void
3558 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3559 {
3560 	struct spdk_bdev_io *parent_io = cb_arg;
3561 
3562 	spdk_bdev_free_io(bdev_io);
3563 
3564 	assert(parent_io->internal.f.split);
3565 
3566 	if (!success) {
3567 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3568 		/* If any child I/O failed, stop further splitting process. */
3569 		parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks;
3570 		parent_io->internal.split.remaining_num_blocks = 0;
3571 	}
3572 	parent_io->internal.split.outstanding--;
3573 	if (parent_io->internal.split.outstanding != 0) {
3574 		return;
3575 	}
3576 
3577 	/*
3578 	 * Parent I/O finishes when all blocks are consumed.
3579 	 */
3580 	if (parent_io->internal.split.remaining_num_blocks == 0) {
3581 		assert(parent_io->internal.cb != bdev_io_split_done);
3582 		bdev_ch_remove_from_io_submitted(parent_io);
3583 		spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id,
3584 				  0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx,
3585 				  parent_io->internal.ch->queue_depth);
3586 
3587 		if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
3588 			if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) {
3589 				bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb);
3590 				return;
3591 			} else if (parent_io->internal.f.has_bounce_buf &&
3592 				   !bdev_io_use_accel_sequence(bdev_io)) {
3593 				/* bdev IO will be completed in the callback */
3594 				_bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
3595 				return;
3596 			}
3597 		}
3598 
3599 		parent_bdev_io_complete(parent_io, 0);
3600 		return;
3601 	}
3602 
3603 	/*
3604 	 * Continue with the splitting process.  This function will complete the parent I/O if the
3605 	 * splitting is done.
3606 	 */
3607 	switch (parent_io->type) {
3608 	case SPDK_BDEV_IO_TYPE_READ:
3609 	case SPDK_BDEV_IO_TYPE_WRITE:
3610 		_bdev_rw_split(parent_io);
3611 		break;
3612 	case SPDK_BDEV_IO_TYPE_UNMAP:
3613 		bdev_unmap_split(parent_io);
3614 		break;
3615 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3616 		bdev_write_zeroes_split(parent_io);
3617 		break;
3618 	case SPDK_BDEV_IO_TYPE_COPY:
3619 		bdev_copy_split(parent_io);
3620 		break;
3621 	default:
3622 		assert(false);
3623 		break;
3624 	}
3625 }
3626 
3627 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3628 				     bool success);
3629 
3630 static void
3631 bdev_io_split(struct spdk_bdev_io *bdev_io)
3632 {
3633 	assert(bdev_io_should_split(bdev_io));
3634 	assert(bdev_io->internal.f.split);
3635 
3636 	bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks;
3637 	bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks;
3638 	bdev_io->internal.split.outstanding = 0;
3639 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
3640 
3641 	switch (bdev_io->type) {
3642 	case SPDK_BDEV_IO_TYPE_READ:
3643 	case SPDK_BDEV_IO_TYPE_WRITE:
3644 		if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
3645 			_bdev_rw_split(bdev_io);
3646 		} else {
3647 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3648 			spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
3649 					     bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io));
3650 		}
3651 		break;
3652 	case SPDK_BDEV_IO_TYPE_UNMAP:
3653 		bdev_unmap_split(bdev_io);
3654 		break;
3655 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3656 		bdev_write_zeroes_split(bdev_io);
3657 		break;
3658 	case SPDK_BDEV_IO_TYPE_COPY:
3659 		bdev_copy_split(bdev_io);
3660 		break;
3661 	default:
3662 		assert(false);
3663 		break;
3664 	}
3665 }
3666 
3667 static void
3668 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
3669 {
3670 	if (!success) {
3671 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3672 		return;
3673 	}
3674 
3675 	_bdev_rw_split(bdev_io);
3676 }
3677 
3678 static inline void
3679 _bdev_io_submit(struct spdk_bdev_io *bdev_io)
3680 {
3681 	struct spdk_bdev *bdev = bdev_io->bdev;
3682 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3683 
3684 	if (spdk_likely(bdev_ch->flags == 0)) {
3685 		bdev_io_do_submit(bdev_ch, bdev_io);
3686 		return;
3687 	}
3688 
3689 	if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
3690 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3691 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
3692 		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
3693 		    bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) {
3694 			_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3695 		} else {
3696 			TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link);
3697 			bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3698 		}
3699 	} else {
3700 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
3701 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3702 	}
3703 }
3704 
3705 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
3706 
3707 bool
3708 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
3709 {
3710 	if (range1->length == 0 || range2->length == 0) {
3711 		return false;
3712 	}
3713 
3714 	if (range1->offset + range1->length <= range2->offset) {
3715 		return false;
3716 	}
3717 
3718 	if (range2->offset + range2->length <= range1->offset) {
3719 		return false;
3720 	}
3721 
3722 	return true;
3723 }
3724 
3725 static bool
3726 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
3727 {
3728 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3729 	struct lba_range r;
3730 
3731 	switch (bdev_io->type) {
3732 	case SPDK_BDEV_IO_TYPE_NVME_IO:
3733 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3734 		/* Don't try to decode the NVMe command - just assume worst-case and that
3735 		 * it overlaps a locked range.
3736 		 */
3737 		return true;
3738 	case SPDK_BDEV_IO_TYPE_READ:
3739 		if (!range->quiesce) {
3740 			return false;
3741 		}
3742 	/* fallthrough */
3743 	case SPDK_BDEV_IO_TYPE_WRITE:
3744 	case SPDK_BDEV_IO_TYPE_UNMAP:
3745 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3746 	case SPDK_BDEV_IO_TYPE_ZCOPY:
3747 	case SPDK_BDEV_IO_TYPE_COPY:
3748 		r.offset = bdev_io->u.bdev.offset_blocks;
3749 		r.length = bdev_io->u.bdev.num_blocks;
3750 		if (!bdev_lba_range_overlapped(range, &r)) {
3751 			/* This I/O doesn't overlap the specified LBA range. */
3752 			return false;
3753 		} else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
3754 			/* This I/O overlaps, but the I/O is on the same channel that locked this
3755 			 * range, and the caller_ctx is the same as the locked_ctx.  This means
3756 			 * that this I/O is associated with the lock, and is allowed to execute.
3757 			 */
3758 			return false;
3759 		} else {
3760 			return true;
3761 		}
3762 	default:
3763 		return false;
3764 	}
3765 }
3766 
3767 void
3768 bdev_io_submit(struct spdk_bdev_io *bdev_io)
3769 {
3770 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3771 
3772 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3773 
3774 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3775 		struct lba_range *range;
3776 
3777 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3778 			if (bdev_io_range_is_locked(bdev_io, range)) {
3779 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3780 				return;
3781 			}
3782 		}
3783 	}
3784 
3785 	bdev_ch_add_to_io_submitted(bdev_io);
3786 
3787 	bdev_io->internal.submit_tsc = spdk_get_ticks();
3788 	spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START,
3789 			      ch->trace_id, bdev_io->u.bdev.num_blocks,
3790 			      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3791 			      bdev_io->u.bdev.offset_blocks, ch->queue_depth);
3792 
3793 	if (bdev_io->internal.f.split) {
3794 		bdev_io_split(bdev_io);
3795 		return;
3796 	}
3797 
3798 	_bdev_io_submit(bdev_io);
3799 }
3800 
3801 static inline int
3802 bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io)
3803 {
3804 	struct spdk_bdev *bdev = bdev_io->bdev;
3805 	struct spdk_dif_ctx_init_ext_opts dif_opts;
3806 
3807 	memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error));
3808 
3809 	dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
3810 	dif_opts.dif_pi_format = bdev->dif_pi_format;
3811 
3812 	return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx,
3813 				 bdev->blocklen,
3814 				 bdev->md_len,
3815 				 bdev->md_interleave,
3816 				 bdev->dif_is_head_of_md,
3817 				 bdev->dif_type,
3818 				 bdev_io->u.bdev.dif_check_flags,
3819 				 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
3820 				 0xFFFF, 0, 0, 0, &dif_opts);
3821 }
3822 
3823 static void
3824 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3825 			      bool success)
3826 {
3827 	if (!success) {
3828 		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
3829 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3830 		bdev_io_complete_unsubmitted(bdev_io);
3831 		return;
3832 	}
3833 
3834 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
3835 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3836 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3837 			return;
3838 		}
3839 		/* For reads we'll execute the sequence after the data is read, so, for now, only
3840 		 * clear out accel_sequence pointer and submit the IO */
3841 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3842 		bdev_io->u.bdev.accel_sequence = NULL;
3843 	}
3844 
3845 	bdev_io_submit(bdev_io);
3846 }
3847 
3848 static inline void
3849 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3850 {
3851 	/* bdev doesn't support memory domains, thereby buffers in this IO request can't
3852 	 * be accessed directly. It is needed to allocate buffers before issuing IO operation.
3853 	 * For write operation we need to pull buffers from memory domain before submitting IO.
3854 	 * Once read operation completes, we need to use memory_domain push functionality to
3855 	 * update data in original memory domain IO buffer.
3856 	 *
3857 	 * If this I/O request is not aware of metadata, buffers in thsi IO request can't be
3858 	 * accessed directly too. It is needed to allocate buffers before issuing IO operation.
3859 	 * For write operation we need to insert metadata before submitting IO. Once read
3860 	 * operation completes, we need to strip metadata in original IO buffer.
3861 	 *
3862 	 * This IO request will go through a regular IO flow, so clear memory domains pointers */
3863 	assert(bdev_io_use_memory_domain(bdev_io) ||
3864 	       bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3865 
3866 	bdev_io->u.bdev.memory_domain = NULL;
3867 	bdev_io->u.bdev.memory_domain_ctx = NULL;
3868 	_bdev_io_get_bounce_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3869 				bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3870 }
3871 
3872 static inline void
3873 _bdev_io_ext_use_accel_buffer(struct spdk_bdev_io *bdev_io)
3874 {
3875 	assert(bdev_io_use_memory_domain(bdev_io));
3876 	assert(bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3877 
3878 	bdev_io->u.bdev.memory_domain = NULL;
3879 	bdev_io->u.bdev.memory_domain_ctx = NULL;
3880 	bdev_io_get_accel_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3881 			      bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3882 }
3883 
3884 /* We need to allocate bounce buffer
3885  * - if bdev doesn't support memory domains,
3886  * -  if it does support them, but we need to execute an accel sequence and the data buffer is
3887  *    from accel memory domain (to avoid doing a push/pull from that domain), or
3888  * - if IO is not aware of metadata.
3889  */
3890 static inline bool
3891 bdev_io_needs_bounce_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3892 {
3893 	if (bdev_io_use_memory_domain(bdev_io)) {
3894 		if (!desc->memory_domains_supported ||
3895 		    (bdev_io_needs_sequence_exec(desc, bdev_io) &&
3896 		     (bdev_io->internal.memory_domain == spdk_accel_get_memory_domain() ||
3897 		      bdev_io_needs_metadata(desc, bdev_io)))) {
3898 			return true;
3899 		}
3900 
3901 		return false;
3902 	}
3903 
3904 	if (bdev_io_needs_metadata(desc, bdev_io)) {
3905 		return true;
3906 	}
3907 
3908 	return false;
3909 }
3910 
3911 /* We need to allocate fake accel buffer if bdev supports memory domains but IO is not
3912  * aware of metadata.
3913  */
3914 static inline bool
3915 bdev_io_needs_accel_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3916 {
3917 	if (bdev_io_needs_metadata(desc, bdev_io)) {
3918 		assert(bdev_io_use_memory_domain(bdev_io));
3919 		return true;
3920 	}
3921 
3922 	return false;
3923 }
3924 
3925 static inline void
3926 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3927 {
3928 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3929 	int rc;
3930 
3931 	if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3932 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3933 		bdev_io_complete_unsubmitted(bdev_io);
3934 		return;
3935 	}
3936 
3937 	if (bdev_io_needs_metadata(desc, bdev_io)) {
3938 		rc = bdev_io_init_dif_ctx(bdev_io);
3939 		if (spdk_unlikely(rc != 0)) {
3940 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3941 			bdev_io_complete_unsubmitted(bdev_io);
3942 			return;
3943 		}
3944 	}
3945 
3946 	if (bdev_io_needs_bounce_buffer(desc, bdev_io)) {
3947 		_bdev_io_ext_use_bounce_buffer(bdev_io);
3948 		return;
3949 	}
3950 
3951 	if (bdev_io_needs_accel_buffer(desc, bdev_io)) {
3952 		_bdev_io_ext_use_accel_buffer(bdev_io);
3953 		return;
3954 	}
3955 
3956 	if (bdev_io_needs_sequence_exec(desc, bdev_io)) {
3957 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3958 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3959 			return;
3960 		}
3961 		/* For reads we'll execute the sequence after the data is read, so, for now, only
3962 		 * clear out accel_sequence pointer and submit the IO */
3963 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3964 		bdev_io->u.bdev.accel_sequence = NULL;
3965 	}
3966 
3967 	bdev_io_submit(bdev_io);
3968 }
3969 
3970 static void
3971 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
3972 {
3973 	struct spdk_bdev *bdev = bdev_io->bdev;
3974 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3975 	struct spdk_io_channel *ch = bdev_ch->channel;
3976 
3977 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3978 
3979 	bdev_io->internal.f.in_submit_request = true;
3980 	bdev_submit_request(bdev, ch, bdev_io);
3981 	bdev_io->internal.f.in_submit_request = false;
3982 }
3983 
3984 void
3985 bdev_io_init(struct spdk_bdev_io *bdev_io,
3986 	     struct spdk_bdev *bdev, void *cb_arg,
3987 	     spdk_bdev_io_completion_cb cb)
3988 {
3989 	bdev_io->bdev = bdev;
3990 	bdev_io->internal.f.raw = 0;
3991 	bdev_io->internal.caller_ctx = cb_arg;
3992 	bdev_io->internal.cb = cb;
3993 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3994 	bdev_io->internal.f.in_submit_request = false;
3995 	bdev_io->internal.error.nvme.cdw0 = 0;
3996 	bdev_io->num_retries = 0;
3997 	bdev_io->internal.get_buf_cb = NULL;
3998 	bdev_io->internal.get_aux_buf_cb = NULL;
3999 	bdev_io->internal.data_transfer_cpl = NULL;
4000 	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);
4001 }
4002 
4003 static bool
4004 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
4005 {
4006 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
4007 }
4008 
4009 bool
4010 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
4011 {
4012 	bool supported;
4013 
4014 	supported = bdev_io_type_supported(bdev, io_type);
4015 
4016 	if (!supported) {
4017 		switch (io_type) {
4018 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
4019 			/* The bdev layer will emulate write zeroes as long as write is supported. */
4020 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
4021 			break;
4022 		default:
4023 			break;
4024 		}
4025 	}
4026 
4027 	return supported;
4028 }
4029 
4030 static const char *g_io_type_strings[] = {
4031 	[SPDK_BDEV_IO_TYPE_READ] = "read",
4032 	[SPDK_BDEV_IO_TYPE_WRITE] = "write",
4033 	[SPDK_BDEV_IO_TYPE_UNMAP] = "unmap",
4034 	[SPDK_BDEV_IO_TYPE_FLUSH] = "flush",
4035 	[SPDK_BDEV_IO_TYPE_RESET] = "reset",
4036 	[SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin",
4037 	[SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io",
4038 	[SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md",
4039 	[SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes",
4040 	[SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy",
4041 	[SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info",
4042 	[SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management",
4043 	[SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append",
4044 	[SPDK_BDEV_IO_TYPE_COMPARE] = "compare",
4045 	[SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write",
4046 	[SPDK_BDEV_IO_TYPE_ABORT] = "abort",
4047 	[SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole",
4048 	[SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data",
4049 	[SPDK_BDEV_IO_TYPE_COPY] = "copy",
4050 	[SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md",
4051 };
4052 
4053 const char *
4054 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type)
4055 {
4056 	if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) {
4057 		return NULL;
4058 	}
4059 
4060 	return g_io_type_strings[io_type];
4061 }
4062 
4063 int
4064 spdk_bdev_get_io_type(const char *io_type_string)
4065 {
4066 	int i;
4067 
4068 	for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
4069 		if (!strcmp(io_type_string, g_io_type_strings[i])) {
4070 			return i;
4071 		}
4072 	}
4073 
4074 	return -1;
4075 }
4076 
4077 uint64_t
4078 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
4079 {
4080 	return bdev_io->internal.submit_tsc;
4081 }
4082 
4083 bool
4084 spdk_bdev_io_hide_metadata(struct spdk_bdev_io *bdev_io)
4085 {
4086 	return bdev_io->internal.desc->opts.hide_metadata;
4087 }
4088 
4089 int
4090 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
4091 {
4092 	if (bdev->fn_table->dump_info_json) {
4093 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
4094 	}
4095 
4096 	return 0;
4097 }
4098 
4099 static void
4100 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
4101 {
4102 	uint32_t max_per_timeslice = 0;
4103 	int i;
4104 
4105 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4106 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4107 			qos->rate_limits[i].max_per_timeslice = 0;
4108 			continue;
4109 		}
4110 
4111 		max_per_timeslice = qos->rate_limits[i].limit *
4112 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
4113 
4114 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
4115 							qos->rate_limits[i].min_per_timeslice);
4116 
4117 		__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4118 				 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
4119 	}
4120 
4121 	bdev_qos_set_ops(qos);
4122 }
4123 
4124 static void
4125 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4126 			   struct spdk_io_channel *io_ch, void *ctx)
4127 {
4128 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4129 	int status;
4130 
4131 	bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
4132 
4133 	/* if all IOs were sent then continue the iteration, otherwise - stop it */
4134 	/* TODO: channels round robing */
4135 	status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
4136 
4137 	spdk_bdev_for_each_channel_continue(i, status);
4138 }
4139 
4140 
4141 static void
4142 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
4143 {
4144 
4145 }
4146 
4147 static int
4148 bdev_channel_poll_qos(void *arg)
4149 {
4150 	struct spdk_bdev *bdev = arg;
4151 	struct spdk_bdev_qos *qos = bdev->internal.qos;
4152 	uint64_t now = spdk_get_ticks();
4153 	int i;
4154 	int64_t remaining_last_timeslice;
4155 
4156 	if (spdk_unlikely(qos->thread == NULL)) {
4157 		/* Old QoS was unbound to remove and new QoS is not enabled yet. */
4158 		return SPDK_POLLER_IDLE;
4159 	}
4160 
4161 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
4162 		/* We received our callback earlier than expected - return
4163 		 *  immediately and wait to do accounting until at least one
4164 		 *  timeslice has actually expired.  This should never happen
4165 		 *  with a well-behaved timer implementation.
4166 		 */
4167 		return SPDK_POLLER_IDLE;
4168 	}
4169 
4170 	/* Reset for next round of rate limiting */
4171 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4172 		/* We may have allowed the IOs or bytes to slightly overrun in the last
4173 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
4174 		 * here, we'll account for the overrun so that the next timeslice will
4175 		 * be appropriately reduced.
4176 		 */
4177 		remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
4178 					   0, __ATOMIC_RELAXED);
4179 		if (remaining_last_timeslice < 0) {
4180 			/* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
4181 			 * potentially use 2 atomic ops each, so they can intertwine.
4182 			 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
4183 			 */
4184 			__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4185 					 remaining_last_timeslice, __ATOMIC_RELAXED);
4186 		}
4187 	}
4188 
4189 	while (now >= (qos->last_timeslice + qos->timeslice_size)) {
4190 		qos->last_timeslice += qos->timeslice_size;
4191 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4192 			__atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice,
4193 					   qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED);
4194 		}
4195 	}
4196 
4197 	spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos,
4198 				   bdev_channel_submit_qos_io_done);
4199 
4200 	return SPDK_POLLER_BUSY;
4201 }
4202 
4203 static void
4204 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
4205 {
4206 	struct spdk_bdev_shared_resource *shared_resource;
4207 	struct lba_range *range;
4208 
4209 	bdev_free_io_stat(ch->stat);
4210 #ifdef SPDK_CONFIG_VTUNE
4211 	bdev_free_io_stat(ch->prev_stat);
4212 #endif
4213 
4214 	while (!TAILQ_EMPTY(&ch->locked_ranges)) {
4215 		range = TAILQ_FIRST(&ch->locked_ranges);
4216 		TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
4217 		free(range);
4218 	}
4219 
4220 	spdk_put_io_channel(ch->channel);
4221 	spdk_put_io_channel(ch->accel_channel);
4222 
4223 	shared_resource = ch->shared_resource;
4224 
4225 	assert(TAILQ_EMPTY(&ch->io_locked));
4226 	assert(TAILQ_EMPTY(&ch->io_submitted));
4227 	assert(TAILQ_EMPTY(&ch->io_accel_exec));
4228 	assert(TAILQ_EMPTY(&ch->io_memory_domain));
4229 	assert(ch->io_outstanding == 0);
4230 	assert(shared_resource->ref > 0);
4231 	shared_resource->ref--;
4232 	if (shared_resource->ref == 0) {
4233 		assert(shared_resource->io_outstanding == 0);
4234 		TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
4235 		spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
4236 		spdk_poller_unregister(&shared_resource->nomem_poller);
4237 		free(shared_resource);
4238 	}
4239 }
4240 
4241 static void
4242 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
4243 {
4244 	struct spdk_bdev_qos	*qos = bdev->internal.qos;
4245 	int			i;
4246 
4247 	assert(spdk_spin_held(&bdev->internal.spinlock));
4248 
4249 	/* Rate limiting on this bdev enabled */
4250 	if (qos) {
4251 		if (qos->ch == NULL) {
4252 			struct spdk_io_channel *io_ch;
4253 
4254 			SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
4255 				      bdev->name, spdk_get_thread());
4256 
4257 			/* No qos channel has been selected, so set one up */
4258 
4259 			/* Take another reference to ch */
4260 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
4261 			assert(io_ch != NULL);
4262 			qos->ch = ch;
4263 
4264 			qos->thread = spdk_io_channel_get_thread(io_ch);
4265 
4266 			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4267 				if (bdev_qos_is_iops_rate_limit(i) == true) {
4268 					qos->rate_limits[i].min_per_timeslice =
4269 						SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
4270 				} else {
4271 					qos->rate_limits[i].min_per_timeslice =
4272 						SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
4273 				}
4274 
4275 				if (qos->rate_limits[i].limit == 0) {
4276 					qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
4277 				}
4278 			}
4279 			bdev_qos_update_max_quota_per_timeslice(qos);
4280 			qos->timeslice_size =
4281 				SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
4282 			qos->last_timeslice = spdk_get_ticks();
4283 			qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
4284 							   bdev,
4285 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
4286 		}
4287 
4288 		ch->flags |= BDEV_CH_QOS_ENABLED;
4289 	}
4290 }
4291 
4292 struct poll_timeout_ctx {
4293 	struct spdk_bdev_desc	*desc;
4294 	uint64_t		timeout_in_sec;
4295 	spdk_bdev_io_timeout_cb	cb_fn;
4296 	void			*cb_arg;
4297 };
4298 
4299 static void
4300 bdev_desc_free(struct spdk_bdev_desc *desc)
4301 {
4302 	spdk_spin_destroy(&desc->spinlock);
4303 	free(desc->media_events_buffer);
4304 	free(desc);
4305 }
4306 
4307 static void
4308 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
4309 {
4310 	struct poll_timeout_ctx *ctx  = _ctx;
4311 	struct spdk_bdev_desc *desc = ctx->desc;
4312 
4313 	free(ctx);
4314 
4315 	spdk_spin_lock(&desc->spinlock);
4316 	desc->refs--;
4317 	if (desc->closed == true && desc->refs == 0) {
4318 		spdk_spin_unlock(&desc->spinlock);
4319 		bdev_desc_free(desc);
4320 		return;
4321 	}
4322 	spdk_spin_unlock(&desc->spinlock);
4323 }
4324 
4325 static void
4326 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4327 			     struct spdk_io_channel *io_ch, void *_ctx)
4328 {
4329 	struct poll_timeout_ctx *ctx  = _ctx;
4330 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4331 	struct spdk_bdev_desc *desc = ctx->desc;
4332 	struct spdk_bdev_io *bdev_io;
4333 	uint64_t now;
4334 
4335 	spdk_spin_lock(&desc->spinlock);
4336 	if (desc->closed == true) {
4337 		spdk_spin_unlock(&desc->spinlock);
4338 		spdk_bdev_for_each_channel_continue(i, -1);
4339 		return;
4340 	}
4341 	spdk_spin_unlock(&desc->spinlock);
4342 
4343 	now = spdk_get_ticks();
4344 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
4345 		/* Exclude any I/O that are generated via splitting. */
4346 		if (bdev_io->internal.cb == bdev_io_split_done) {
4347 			continue;
4348 		}
4349 
4350 		/* Once we find an I/O that has not timed out, we can immediately
4351 		 * exit the loop.
4352 		 */
4353 		if (now < (bdev_io->internal.submit_tsc +
4354 			   ctx->timeout_in_sec * spdk_get_ticks_hz())) {
4355 			goto end;
4356 		}
4357 
4358 		if (bdev_io->internal.desc == desc) {
4359 			ctx->cb_fn(ctx->cb_arg, bdev_io);
4360 		}
4361 	}
4362 
4363 end:
4364 	spdk_bdev_for_each_channel_continue(i, 0);
4365 }
4366 
4367 static int
4368 bdev_poll_timeout_io(void *arg)
4369 {
4370 	struct spdk_bdev_desc *desc = arg;
4371 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4372 	struct poll_timeout_ctx *ctx;
4373 
4374 	ctx = calloc(1, sizeof(struct poll_timeout_ctx));
4375 	if (!ctx) {
4376 		SPDK_ERRLOG("failed to allocate memory\n");
4377 		return SPDK_POLLER_BUSY;
4378 	}
4379 	ctx->desc = desc;
4380 	ctx->cb_arg = desc->cb_arg;
4381 	ctx->cb_fn = desc->cb_fn;
4382 	ctx->timeout_in_sec = desc->timeout_in_sec;
4383 
4384 	/* Take a ref on the descriptor in case it gets closed while we are checking
4385 	 * all of the channels.
4386 	 */
4387 	spdk_spin_lock(&desc->spinlock);
4388 	desc->refs++;
4389 	spdk_spin_unlock(&desc->spinlock);
4390 
4391 	spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
4392 				   bdev_channel_poll_timeout_io_done);
4393 
4394 	return SPDK_POLLER_BUSY;
4395 }
4396 
4397 int
4398 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
4399 		      spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
4400 {
4401 	assert(desc->thread == spdk_get_thread());
4402 
4403 	spdk_poller_unregister(&desc->io_timeout_poller);
4404 
4405 	if (timeout_in_sec) {
4406 		assert(cb_fn != NULL);
4407 		desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
4408 					  desc,
4409 					  SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
4410 					  1000);
4411 		if (desc->io_timeout_poller == NULL) {
4412 			SPDK_ERRLOG("can not register the desc timeout IO poller\n");
4413 			return -1;
4414 		}
4415 	}
4416 
4417 	desc->cb_fn = cb_fn;
4418 	desc->cb_arg = cb_arg;
4419 	desc->timeout_in_sec = timeout_in_sec;
4420 
4421 	return 0;
4422 }
4423 
4424 static int
4425 bdev_channel_create(void *io_device, void *ctx_buf)
4426 {
4427 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
4428 	struct spdk_bdev_channel	*ch = ctx_buf;
4429 	struct spdk_io_channel		*mgmt_io_ch;
4430 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
4431 	struct spdk_bdev_shared_resource *shared_resource;
4432 	struct lba_range		*range;
4433 
4434 	ch->bdev = bdev;
4435 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
4436 	if (!ch->channel) {
4437 		return -1;
4438 	}
4439 
4440 	ch->accel_channel = spdk_accel_get_io_channel();
4441 	if (!ch->accel_channel) {
4442 		spdk_put_io_channel(ch->channel);
4443 		return -1;
4444 	}
4445 
4446 	spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0,
4447 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4448 
4449 	assert(ch->histogram == NULL);
4450 	if (bdev->internal.histogram_enabled) {
4451 		ch->histogram = spdk_histogram_data_alloc();
4452 		if (ch->histogram == NULL) {
4453 			SPDK_ERRLOG("Could not allocate histogram\n");
4454 		}
4455 	}
4456 
4457 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
4458 	if (!mgmt_io_ch) {
4459 		spdk_put_io_channel(ch->channel);
4460 		spdk_put_io_channel(ch->accel_channel);
4461 		return -1;
4462 	}
4463 
4464 	mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
4465 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
4466 		if (shared_resource->shared_ch == ch->channel) {
4467 			spdk_put_io_channel(mgmt_io_ch);
4468 			shared_resource->ref++;
4469 			break;
4470 		}
4471 	}
4472 
4473 	if (shared_resource == NULL) {
4474 		shared_resource = calloc(1, sizeof(*shared_resource));
4475 		if (shared_resource == NULL) {
4476 			spdk_put_io_channel(ch->channel);
4477 			spdk_put_io_channel(ch->accel_channel);
4478 			spdk_put_io_channel(mgmt_io_ch);
4479 			return -1;
4480 		}
4481 
4482 		shared_resource->mgmt_ch = mgmt_ch;
4483 		shared_resource->io_outstanding = 0;
4484 		TAILQ_INIT(&shared_resource->nomem_io);
4485 		shared_resource->nomem_threshold = 0;
4486 		shared_resource->shared_ch = ch->channel;
4487 		shared_resource->ref = 1;
4488 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
4489 	}
4490 
4491 	ch->io_outstanding = 0;
4492 	TAILQ_INIT(&ch->locked_ranges);
4493 	TAILQ_INIT(&ch->qos_queued_io);
4494 	ch->flags = 0;
4495 	ch->trace_id = bdev->internal.trace_id;
4496 	ch->shared_resource = shared_resource;
4497 
4498 	TAILQ_INIT(&ch->io_submitted);
4499 	TAILQ_INIT(&ch->io_locked);
4500 	TAILQ_INIT(&ch->io_accel_exec);
4501 	TAILQ_INIT(&ch->io_memory_domain);
4502 
4503 	ch->stat = bdev_alloc_io_stat(false);
4504 	if (ch->stat == NULL) {
4505 		bdev_channel_destroy_resource(ch);
4506 		return -1;
4507 	}
4508 
4509 	ch->stat->ticks_rate = spdk_get_ticks_hz();
4510 
4511 #ifdef SPDK_CONFIG_VTUNE
4512 	{
4513 		char *name;
4514 		__itt_init_ittlib(NULL, 0);
4515 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
4516 		if (!name) {
4517 			bdev_channel_destroy_resource(ch);
4518 			return -1;
4519 		}
4520 		ch->handle = __itt_string_handle_create(name);
4521 		free(name);
4522 		ch->start_tsc = spdk_get_ticks();
4523 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
4524 		ch->prev_stat = bdev_alloc_io_stat(false);
4525 		if (ch->prev_stat == NULL) {
4526 			bdev_channel_destroy_resource(ch);
4527 			return -1;
4528 		}
4529 	}
4530 #endif
4531 
4532 	spdk_spin_lock(&bdev->internal.spinlock);
4533 	bdev_enable_qos(bdev, ch);
4534 
4535 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
4536 		struct lba_range *new_range;
4537 
4538 		new_range = calloc(1, sizeof(*new_range));
4539 		if (new_range == NULL) {
4540 			spdk_spin_unlock(&bdev->internal.spinlock);
4541 			bdev_channel_destroy_resource(ch);
4542 			return -1;
4543 		}
4544 		new_range->length = range->length;
4545 		new_range->offset = range->offset;
4546 		new_range->locked_ctx = range->locked_ctx;
4547 		TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
4548 	}
4549 
4550 	spdk_spin_unlock(&bdev->internal.spinlock);
4551 
4552 	return 0;
4553 }
4554 
4555 static int
4556 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
4557 			 void *cb_ctx)
4558 {
4559 	struct spdk_bdev_channel *bdev_ch = cb_ctx;
4560 	struct spdk_bdev_io *bdev_io;
4561 	uint64_t buf_len;
4562 
4563 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4564 	if (bdev_io->internal.ch == bdev_ch) {
4565 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4566 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4567 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4568 	}
4569 
4570 	return 0;
4571 }
4572 
4573 /*
4574  * Abort I/O that are waiting on a data buffer.
4575  */
4576 static void
4577 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
4578 {
4579 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch);
4580 }
4581 
4582 /*
4583  * Abort I/O that are queued waiting for submission.  These types of I/O are
4584  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
4585  */
4586 static void
4587 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
4588 {
4589 	struct spdk_bdev_io *bdev_io, *tmp;
4590 
4591 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
4592 		if (bdev_io->internal.ch == ch) {
4593 			TAILQ_REMOVE(queue, bdev_io, internal.link);
4594 			/*
4595 			 * spdk_bdev_io_complete() assumes that the completed I/O had
4596 			 *  been submitted to the bdev module.  Since in this case it
4597 			 *  hadn't, bump io_outstanding to account for the decrement
4598 			 *  that spdk_bdev_io_complete() will do.
4599 			 */
4600 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
4601 				bdev_io_increment_outstanding(ch, ch->shared_resource);
4602 			}
4603 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4604 		}
4605 	}
4606 }
4607 
4608 static inline void
4609 bdev_abort_all_nomem_io(struct spdk_bdev_channel *ch)
4610 {
4611 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4612 
4613 	shared_resource->nomem_abort_in_progress = true;
4614 	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
4615 	shared_resource->nomem_abort_in_progress = false;
4616 }
4617 
4618 static bool
4619 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
4620 {
4621 	struct spdk_bdev_io *bdev_io;
4622 
4623 	TAILQ_FOREACH(bdev_io, queue, internal.link) {
4624 		if (bdev_io == bio_to_abort) {
4625 			TAILQ_REMOVE(queue, bio_to_abort, internal.link);
4626 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4627 			return true;
4628 		}
4629 	}
4630 
4631 	return false;
4632 }
4633 
4634 static int
4635 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
4636 {
4637 	struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
4638 	uint64_t buf_len;
4639 
4640 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4641 	if (bdev_io == bio_to_abort) {
4642 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len);
4643 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4644 		spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4645 		return 1;
4646 	}
4647 
4648 	return 0;
4649 }
4650 
4651 static bool
4652 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
4653 {
4654 	int rc;
4655 
4656 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort);
4657 	return rc == 1;
4658 }
4659 
4660 static void
4661 bdev_qos_channel_destroy(void *cb_arg)
4662 {
4663 	struct spdk_bdev_qos *qos = cb_arg;
4664 
4665 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
4666 	spdk_poller_unregister(&qos->poller);
4667 
4668 	SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
4669 
4670 	free(qos);
4671 }
4672 
4673 static int
4674 bdev_qos_destroy(struct spdk_bdev *bdev)
4675 {
4676 	int i;
4677 
4678 	/*
4679 	 * Cleanly shutting down the QoS poller is tricky, because
4680 	 * during the asynchronous operation the user could open
4681 	 * a new descriptor and create a new channel, spawning
4682 	 * a new QoS poller.
4683 	 *
4684 	 * The strategy is to create a new QoS structure here and swap it
4685 	 * in. The shutdown path then continues to refer to the old one
4686 	 * until it completes and then releases it.
4687 	 */
4688 	struct spdk_bdev_qos *new_qos, *old_qos;
4689 
4690 	old_qos = bdev->internal.qos;
4691 
4692 	new_qos = calloc(1, sizeof(*new_qos));
4693 	if (!new_qos) {
4694 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
4695 		return -ENOMEM;
4696 	}
4697 
4698 	/* Copy the old QoS data into the newly allocated structure */
4699 	memcpy(new_qos, old_qos, sizeof(*new_qos));
4700 
4701 	/* Zero out the key parts of the QoS structure */
4702 	new_qos->ch = NULL;
4703 	new_qos->thread = NULL;
4704 	new_qos->poller = NULL;
4705 	/*
4706 	 * The limit member of spdk_bdev_qos_limit structure is not zeroed.
4707 	 * It will be used later for the new QoS structure.
4708 	 */
4709 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4710 		new_qos->rate_limits[i].remaining_this_timeslice = 0;
4711 		new_qos->rate_limits[i].min_per_timeslice = 0;
4712 		new_qos->rate_limits[i].max_per_timeslice = 0;
4713 	}
4714 
4715 	bdev->internal.qos = new_qos;
4716 
4717 	if (old_qos->thread == NULL) {
4718 		free(old_qos);
4719 	} else {
4720 		spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
4721 	}
4722 
4723 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
4724 	 * been destroyed yet. The destruction path will end up waiting for the final
4725 	 * channel to be put before it releases resources. */
4726 
4727 	return 0;
4728 }
4729 
4730 void
4731 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
4732 {
4733 	total->bytes_read += add->bytes_read;
4734 	total->num_read_ops += add->num_read_ops;
4735 	total->bytes_written += add->bytes_written;
4736 	total->num_write_ops += add->num_write_ops;
4737 	total->bytes_unmapped += add->bytes_unmapped;
4738 	total->num_unmap_ops += add->num_unmap_ops;
4739 	total->bytes_copied += add->bytes_copied;
4740 	total->num_copy_ops += add->num_copy_ops;
4741 	total->read_latency_ticks += add->read_latency_ticks;
4742 	total->write_latency_ticks += add->write_latency_ticks;
4743 	total->unmap_latency_ticks += add->unmap_latency_ticks;
4744 	total->copy_latency_ticks += add->copy_latency_ticks;
4745 	if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
4746 		total->max_read_latency_ticks = add->max_read_latency_ticks;
4747 	}
4748 	if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
4749 		total->min_read_latency_ticks = add->min_read_latency_ticks;
4750 	}
4751 	if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
4752 		total->max_write_latency_ticks = add->max_write_latency_ticks;
4753 	}
4754 	if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
4755 		total->min_write_latency_ticks = add->min_write_latency_ticks;
4756 	}
4757 	if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
4758 		total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
4759 	}
4760 	if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
4761 		total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
4762 	}
4763 	if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
4764 		total->max_copy_latency_ticks = add->max_copy_latency_ticks;
4765 	}
4766 	if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
4767 		total->min_copy_latency_ticks = add->min_copy_latency_ticks;
4768 	}
4769 }
4770 
4771 static void
4772 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
4773 {
4774 	memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error));
4775 
4776 	if (to_stat->io_error != NULL && from_stat->io_error != NULL) {
4777 		memcpy(to_stat->io_error, from_stat->io_error,
4778 		       sizeof(struct spdk_bdev_io_error_stat));
4779 	}
4780 }
4781 
4782 void
4783 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode)
4784 {
4785 	if (mode == SPDK_BDEV_RESET_STAT_NONE) {
4786 		return;
4787 	}
4788 
4789 	stat->max_read_latency_ticks = 0;
4790 	stat->min_read_latency_ticks = UINT64_MAX;
4791 	stat->max_write_latency_ticks = 0;
4792 	stat->min_write_latency_ticks = UINT64_MAX;
4793 	stat->max_unmap_latency_ticks = 0;
4794 	stat->min_unmap_latency_ticks = UINT64_MAX;
4795 	stat->max_copy_latency_ticks = 0;
4796 	stat->min_copy_latency_ticks = UINT64_MAX;
4797 
4798 	if (mode != SPDK_BDEV_RESET_STAT_ALL) {
4799 		return;
4800 	}
4801 
4802 	stat->bytes_read = 0;
4803 	stat->num_read_ops = 0;
4804 	stat->bytes_written = 0;
4805 	stat->num_write_ops = 0;
4806 	stat->bytes_unmapped = 0;
4807 	stat->num_unmap_ops = 0;
4808 	stat->bytes_copied = 0;
4809 	stat->num_copy_ops = 0;
4810 	stat->read_latency_ticks = 0;
4811 	stat->write_latency_ticks = 0;
4812 	stat->unmap_latency_ticks = 0;
4813 	stat->copy_latency_ticks = 0;
4814 
4815 	if (stat->io_error != NULL) {
4816 		memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat));
4817 	}
4818 }
4819 
4820 struct spdk_bdev_io_stat *
4821 bdev_alloc_io_stat(bool io_error_stat)
4822 {
4823 	struct spdk_bdev_io_stat *stat;
4824 
4825 	stat = malloc(sizeof(struct spdk_bdev_io_stat));
4826 	if (stat == NULL) {
4827 		return NULL;
4828 	}
4829 
4830 	if (io_error_stat) {
4831 		stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat));
4832 		if (stat->io_error == NULL) {
4833 			free(stat);
4834 			return NULL;
4835 		}
4836 	} else {
4837 		stat->io_error = NULL;
4838 	}
4839 
4840 	spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL);
4841 
4842 	return stat;
4843 }
4844 
4845 void
4846 bdev_free_io_stat(struct spdk_bdev_io_stat *stat)
4847 {
4848 	if (stat != NULL) {
4849 		free(stat->io_error);
4850 		free(stat);
4851 	}
4852 }
4853 
4854 void
4855 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
4856 {
4857 	int i;
4858 
4859 	spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
4860 	spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
4861 	spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
4862 	spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
4863 	spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
4864 	spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
4865 	spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
4866 	spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
4867 	spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
4868 	spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
4869 	spdk_json_write_named_uint64(w, "min_read_latency_ticks",
4870 				     stat->min_read_latency_ticks != UINT64_MAX ?
4871 				     stat->min_read_latency_ticks : 0);
4872 	spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
4873 	spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
4874 	spdk_json_write_named_uint64(w, "min_write_latency_ticks",
4875 				     stat->min_write_latency_ticks != UINT64_MAX ?
4876 				     stat->min_write_latency_ticks : 0);
4877 	spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
4878 	spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
4879 	spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
4880 				     stat->min_unmap_latency_ticks != UINT64_MAX ?
4881 				     stat->min_unmap_latency_ticks : 0);
4882 	spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
4883 	spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
4884 	spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
4885 				     stat->min_copy_latency_ticks != UINT64_MAX ?
4886 				     stat->min_copy_latency_ticks : 0);
4887 
4888 	if (stat->io_error != NULL) {
4889 		spdk_json_write_named_object_begin(w, "io_error");
4890 		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
4891 			if (stat->io_error->error_status[i] != 0) {
4892 				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
4893 							     stat->io_error->error_status[i]);
4894 			}
4895 		}
4896 		spdk_json_write_object_end(w);
4897 	}
4898 }
4899 
4900 static void
4901 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
4902 {
4903 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4904 	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
4905 
4906 	bdev_abort_all_nomem_io(ch);
4907 	bdev_abort_all_buf_io(mgmt_ch, ch);
4908 }
4909 
4910 static void
4911 bdev_channel_destroy(void *io_device, void *ctx_buf)
4912 {
4913 	struct spdk_bdev_channel *ch = ctx_buf;
4914 
4915 	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
4916 		      spdk_get_thread());
4917 
4918 	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0,
4919 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4920 
4921 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
4922 	spdk_spin_lock(&ch->bdev->internal.spinlock);
4923 	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
4924 	spdk_spin_unlock(&ch->bdev->internal.spinlock);
4925 
4926 	bdev_channel_abort_queued_ios(ch);
4927 
4928 	if (ch->histogram) {
4929 		spdk_histogram_data_free(ch->histogram);
4930 	}
4931 
4932 	bdev_channel_destroy_resource(ch);
4933 }
4934 
4935 /*
4936  * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
4937  * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4938  */
4939 static int
4940 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
4941 {
4942 	struct spdk_bdev_name *tmp;
4943 
4944 	bdev_name->name = strdup(name);
4945 	if (bdev_name->name == NULL) {
4946 		SPDK_ERRLOG("Unable to allocate bdev name\n");
4947 		return -ENOMEM;
4948 	}
4949 
4950 	bdev_name->bdev = bdev;
4951 
4952 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4953 	tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4954 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4955 
4956 	if (tmp != NULL) {
4957 		SPDK_ERRLOG("Bdev name %s already exists\n", name);
4958 		free(bdev_name->name);
4959 		return -EEXIST;
4960 	}
4961 
4962 	return 0;
4963 }
4964 
4965 static void
4966 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
4967 {
4968 	RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4969 	free(bdev_name->name);
4970 }
4971 
4972 static void
4973 bdev_name_del(struct spdk_bdev_name *bdev_name)
4974 {
4975 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4976 	bdev_name_del_unsafe(bdev_name);
4977 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4978 }
4979 
4980 int
4981 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
4982 {
4983 	struct spdk_bdev_alias *tmp;
4984 	int ret;
4985 
4986 	if (alias == NULL) {
4987 		SPDK_ERRLOG("Empty alias passed\n");
4988 		return -EINVAL;
4989 	}
4990 
4991 	tmp = calloc(1, sizeof(*tmp));
4992 	if (tmp == NULL) {
4993 		SPDK_ERRLOG("Unable to allocate alias\n");
4994 		return -ENOMEM;
4995 	}
4996 
4997 	ret = bdev_name_add(&tmp->alias, bdev, alias);
4998 	if (ret != 0) {
4999 		free(tmp);
5000 		return ret;
5001 	}
5002 
5003 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
5004 
5005 	return 0;
5006 }
5007 
5008 static int
5009 bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
5010 	       void (*alias_del_fn)(struct spdk_bdev_name *n))
5011 {
5012 	struct spdk_bdev_alias *tmp;
5013 
5014 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
5015 		if (strcmp(alias, tmp->alias.name) == 0) {
5016 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
5017 			alias_del_fn(&tmp->alias);
5018 			free(tmp);
5019 			return 0;
5020 		}
5021 	}
5022 
5023 	return -ENOENT;
5024 }
5025 
5026 int
5027 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
5028 {
5029 	int rc;
5030 
5031 	rc = bdev_alias_del(bdev, alias, bdev_name_del);
5032 	if (rc == -ENOENT) {
5033 		SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
5034 	}
5035 
5036 	return rc;
5037 }
5038 
5039 void
5040 spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
5041 {
5042 	struct spdk_bdev_alias *p, *tmp;
5043 
5044 	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
5045 		TAILQ_REMOVE(&bdev->aliases, p, tailq);
5046 		bdev_name_del(&p->alias);
5047 		free(p);
5048 	}
5049 }
5050 
5051 struct spdk_io_channel *
5052 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
5053 {
5054 	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
5055 }
5056 
5057 void *
5058 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
5059 {
5060 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5061 	void *ctx = NULL;
5062 
5063 	if (bdev->fn_table->get_module_ctx) {
5064 		ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
5065 	}
5066 
5067 	return ctx;
5068 }
5069 
5070 const char *
5071 spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
5072 {
5073 	return bdev->module->name;
5074 }
5075 
5076 const char *
5077 spdk_bdev_get_name(const struct spdk_bdev *bdev)
5078 {
5079 	return bdev->name;
5080 }
5081 
5082 const char *
5083 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
5084 {
5085 	return bdev->product_name;
5086 }
5087 
5088 const struct spdk_bdev_aliases_list *
5089 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
5090 {
5091 	return &bdev->aliases;
5092 }
5093 
5094 uint32_t
5095 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
5096 {
5097 	return bdev->blocklen;
5098 }
5099 
5100 uint32_t
5101 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
5102 {
5103 	return bdev->write_unit_size;
5104 }
5105 
5106 uint64_t
5107 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
5108 {
5109 	return bdev->blockcnt;
5110 }
5111 
5112 const char *
5113 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
5114 {
5115 	return qos_rpc_type[type];
5116 }
5117 
5118 void
5119 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
5120 {
5121 	int i;
5122 
5123 	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
5124 
5125 	spdk_spin_lock(&bdev->internal.spinlock);
5126 	if (bdev->internal.qos) {
5127 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
5128 			if (bdev->internal.qos->rate_limits[i].limit !=
5129 			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
5130 				limits[i] = bdev->internal.qos->rate_limits[i].limit;
5131 				if (bdev_qos_is_iops_rate_limit(i) == false) {
5132 					/* Change from Byte to Megabyte which is user visible. */
5133 					limits[i] = limits[i] / 1024 / 1024;
5134 				}
5135 			}
5136 		}
5137 	}
5138 	spdk_spin_unlock(&bdev->internal.spinlock);
5139 }
5140 
5141 size_t
5142 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
5143 {
5144 	return 1 << bdev->required_alignment;
5145 }
5146 
5147 uint32_t
5148 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
5149 {
5150 	return bdev->optimal_io_boundary;
5151 }
5152 
5153 bool
5154 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
5155 {
5156 	return bdev->write_cache;
5157 }
5158 
5159 const struct spdk_uuid *
5160 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
5161 {
5162 	return &bdev->uuid;
5163 }
5164 
5165 uint16_t
5166 spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
5167 {
5168 	return bdev->acwu;
5169 }
5170 
5171 uint32_t
5172 spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
5173 {
5174 	return bdev->md_len;
5175 }
5176 
5177 bool
5178 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
5179 {
5180 	return (bdev->md_len != 0) && bdev->md_interleave;
5181 }
5182 
5183 bool
5184 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
5185 {
5186 	return (bdev->md_len != 0) && !bdev->md_interleave;
5187 }
5188 
5189 bool
5190 spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
5191 {
5192 	return bdev->zoned;
5193 }
5194 
5195 uint32_t
5196 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
5197 {
5198 	if (spdk_bdev_is_md_interleaved(bdev)) {
5199 		return bdev->blocklen - bdev->md_len;
5200 	} else {
5201 		return bdev->blocklen;
5202 	}
5203 }
5204 
5205 uint32_t
5206 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
5207 {
5208 	return bdev->phys_blocklen;
5209 }
5210 
5211 static uint32_t
5212 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
5213 {
5214 	if (!spdk_bdev_is_md_interleaved(bdev)) {
5215 		return bdev->blocklen + bdev->md_len;
5216 	} else {
5217 		return bdev->blocklen;
5218 	}
5219 }
5220 
5221 /* We have to use the typedef in the function declaration to appease astyle. */
5222 typedef enum spdk_dif_type spdk_dif_type_t;
5223 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t;
5224 
5225 spdk_dif_type_t
5226 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
5227 {
5228 	if (bdev->md_len != 0) {
5229 		return bdev->dif_type;
5230 	} else {
5231 		return SPDK_DIF_DISABLE;
5232 	}
5233 }
5234 
5235 spdk_dif_pi_format_t
5236 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev)
5237 {
5238 	return bdev->dif_pi_format;
5239 }
5240 
5241 bool
5242 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
5243 {
5244 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
5245 		return bdev->dif_is_head_of_md;
5246 	} else {
5247 		return false;
5248 	}
5249 }
5250 
5251 bool
5252 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
5253 			       enum spdk_dif_check_type check_type)
5254 {
5255 	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
5256 		return false;
5257 	}
5258 
5259 	switch (check_type) {
5260 	case SPDK_DIF_CHECK_TYPE_REFTAG:
5261 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
5262 	case SPDK_DIF_CHECK_TYPE_APPTAG:
5263 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
5264 	case SPDK_DIF_CHECK_TYPE_GUARD:
5265 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
5266 	default:
5267 		return false;
5268 	}
5269 }
5270 
5271 static uint32_t
5272 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes)
5273 {
5274 	uint64_t aligned_length, max_write_blocks;
5275 
5276 	aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1);
5277 	max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev);
5278 	max_write_blocks -= max_write_blocks % bdev->write_unit_size;
5279 
5280 	return max_write_blocks;
5281 }
5282 
5283 uint32_t
5284 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
5285 {
5286 	return bdev->max_copy;
5287 }
5288 
5289 uint64_t
5290 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
5291 {
5292 	return bdev->internal.measured_queue_depth;
5293 }
5294 
5295 uint64_t
5296 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
5297 {
5298 	return bdev->internal.period;
5299 }
5300 
5301 uint64_t
5302 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
5303 {
5304 	return bdev->internal.weighted_io_time;
5305 }
5306 
5307 uint64_t
5308 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
5309 {
5310 	return bdev->internal.io_time;
5311 }
5312 
5313 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev)
5314 {
5315 	return bdev->ctratt;
5316 }
5317 
5318 uint32_t
5319 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev)
5320 {
5321 	return bdev->nsid;
5322 }
5323 
5324 uint32_t
5325 spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
5326 {
5327 	struct spdk_bdev *bdev = desc->bdev;
5328 
5329 	return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen;
5330 }
5331 
5332 uint32_t
5333 spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc)
5334 {
5335 	struct spdk_bdev *bdev = desc->bdev;
5336 
5337 	return desc->opts.hide_metadata ? 0 : bdev->md_len;
5338 }
5339 
5340 bool
5341 spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc)
5342 {
5343 	struct spdk_bdev *bdev = desc->bdev;
5344 
5345 	return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev);
5346 }
5347 
5348 bool
5349 spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc)
5350 {
5351 	struct spdk_bdev *bdev = desc->bdev;
5352 
5353 	return desc->opts.hide_metadata ? false : spdk_bdev_is_md_separate(bdev);
5354 }
5355 
5356 spdk_dif_type_t
5357 spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc)
5358 {
5359 	struct spdk_bdev *bdev = desc->bdev;
5360 
5361 	return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev);
5362 }
5363 
5364 spdk_dif_pi_format_t
5365 spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc)
5366 {
5367 	struct spdk_bdev *bdev = desc->bdev;
5368 
5369 	return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev);
5370 }
5371 
5372 bool
5373 spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc)
5374 {
5375 	struct spdk_bdev *bdev = desc->bdev;
5376 
5377 	return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev);
5378 }
5379 
5380 bool
5381 spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc,
5382 				    enum spdk_dif_check_type check_type)
5383 {
5384 	struct spdk_bdev *bdev = desc->bdev;
5385 
5386 	return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type);
5387 }
5388 
5389 static void bdev_update_qd_sampling_period(void *ctx);
5390 
5391 static void
5392 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
5393 {
5394 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
5395 
5396 	if (bdev->internal.measured_queue_depth) {
5397 		bdev->internal.io_time += bdev->internal.period;
5398 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
5399 	}
5400 
5401 	bdev->internal.qd_poll_in_progress = false;
5402 
5403 	bdev_update_qd_sampling_period(bdev);
5404 }
5405 
5406 static void
5407 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5408 		       struct spdk_io_channel *io_ch, void *_ctx)
5409 {
5410 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
5411 
5412 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
5413 	spdk_bdev_for_each_channel_continue(i, 0);
5414 }
5415 
5416 static int
5417 bdev_calculate_measured_queue_depth(void *ctx)
5418 {
5419 	struct spdk_bdev *bdev = ctx;
5420 
5421 	bdev->internal.qd_poll_in_progress = true;
5422 	bdev->internal.temporary_queue_depth = 0;
5423 	spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
5424 	return SPDK_POLLER_BUSY;
5425 }
5426 
5427 static void
5428 bdev_update_qd_sampling_period(void *ctx)
5429 {
5430 	struct spdk_bdev *bdev = ctx;
5431 
5432 	if (bdev->internal.period == bdev->internal.new_period) {
5433 		return;
5434 	}
5435 
5436 	if (bdev->internal.qd_poll_in_progress) {
5437 		return;
5438 	}
5439 
5440 	bdev->internal.period = bdev->internal.new_period;
5441 
5442 	spdk_poller_unregister(&bdev->internal.qd_poller);
5443 	if (bdev->internal.period != 0) {
5444 		bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5445 					   bdev, bdev->internal.period);
5446 	} else {
5447 		spdk_bdev_close(bdev->internal.qd_desc);
5448 		bdev->internal.qd_desc = NULL;
5449 	}
5450 }
5451 
5452 static void
5453 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
5454 {
5455 	SPDK_NOTICELOG("Unexpected event type: %d\n", type);
5456 }
5457 
5458 void
5459 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
5460 {
5461 	int rc;
5462 
5463 	if (bdev->internal.new_period == period) {
5464 		return;
5465 	}
5466 
5467 	bdev->internal.new_period = period;
5468 
5469 	if (bdev->internal.qd_desc != NULL) {
5470 		assert(bdev->internal.period != 0);
5471 
5472 		spdk_thread_send_msg(bdev->internal.qd_desc->thread,
5473 				     bdev_update_qd_sampling_period, bdev);
5474 		return;
5475 	}
5476 
5477 	assert(bdev->internal.period == 0);
5478 
5479 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
5480 				NULL, &bdev->internal.qd_desc);
5481 	if (rc != 0) {
5482 		return;
5483 	}
5484 
5485 	bdev->internal.period = period;
5486 	bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
5487 				   bdev, period);
5488 }
5489 
5490 struct bdev_get_current_qd_ctx {
5491 	uint64_t current_qd;
5492 	spdk_bdev_get_current_qd_cb cb_fn;
5493 	void *cb_arg;
5494 };
5495 
5496 static void
5497 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
5498 {
5499 	struct bdev_get_current_qd_ctx *ctx = _ctx;
5500 
5501 	ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
5502 
5503 	free(ctx);
5504 }
5505 
5506 static void
5507 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5508 		    struct spdk_io_channel *io_ch, void *_ctx)
5509 {
5510 	struct bdev_get_current_qd_ctx *ctx = _ctx;
5511 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
5512 
5513 	ctx->current_qd += bdev_ch->io_outstanding;
5514 
5515 	spdk_bdev_for_each_channel_continue(i, 0);
5516 }
5517 
5518 void
5519 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
5520 			 void *cb_arg)
5521 {
5522 	struct bdev_get_current_qd_ctx *ctx;
5523 
5524 	assert(cb_fn != NULL);
5525 
5526 	ctx = calloc(1, sizeof(*ctx));
5527 	if (ctx == NULL) {
5528 		cb_fn(bdev, 0, cb_arg, -ENOMEM);
5529 		return;
5530 	}
5531 
5532 	ctx->cb_fn = cb_fn;
5533 	ctx->cb_arg = cb_arg;
5534 
5535 	spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
5536 }
5537 
5538 static void
5539 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type)
5540 {
5541 	assert(desc->thread == spdk_get_thread());
5542 
5543 	spdk_spin_lock(&desc->spinlock);
5544 	desc->refs--;
5545 	if (!desc->closed) {
5546 		spdk_spin_unlock(&desc->spinlock);
5547 		desc->callback.event_fn(type,
5548 					desc->bdev,
5549 					desc->callback.ctx);
5550 		return;
5551 	} else if (desc->refs == 0) {
5552 		/* This descriptor was closed after this event_notify message was sent.
5553 		 * spdk_bdev_close() could not free the descriptor since this message was
5554 		 * in flight, so we free it now using bdev_desc_free().
5555 		 */
5556 		spdk_spin_unlock(&desc->spinlock);
5557 		bdev_desc_free(desc);
5558 		return;
5559 	}
5560 	spdk_spin_unlock(&desc->spinlock);
5561 }
5562 
5563 static void
5564 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn)
5565 {
5566 	spdk_spin_lock(&desc->spinlock);
5567 	desc->refs++;
5568 	spdk_thread_send_msg(desc->thread, event_notify_fn, desc);
5569 	spdk_spin_unlock(&desc->spinlock);
5570 }
5571 
5572 static void
5573 _resize_notify(void *ctx)
5574 {
5575 	struct spdk_bdev_desc *desc = ctx;
5576 
5577 	_event_notify(desc, SPDK_BDEV_EVENT_RESIZE);
5578 }
5579 
5580 int
5581 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
5582 {
5583 	struct spdk_bdev_desc *desc;
5584 	int ret;
5585 
5586 	if (size == bdev->blockcnt) {
5587 		return 0;
5588 	}
5589 
5590 	spdk_spin_lock(&bdev->internal.spinlock);
5591 
5592 	/* bdev has open descriptors */
5593 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
5594 	    bdev->blockcnt > size) {
5595 		ret = -EBUSY;
5596 	} else {
5597 		bdev->blockcnt = size;
5598 		TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
5599 			event_notify(desc, _resize_notify);
5600 		}
5601 		ret = 0;
5602 	}
5603 
5604 	spdk_spin_unlock(&bdev->internal.spinlock);
5605 
5606 	return ret;
5607 }
5608 
5609 /*
5610  * Convert I/O offset and length from bytes to blocks.
5611  *
5612  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
5613  */
5614 static uint64_t
5615 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes,
5616 		     uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks)
5617 {
5618 	uint32_t block_size = bdev_desc_get_block_size(desc);
5619 	uint8_t shift_cnt;
5620 
5621 	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
5622 	if (spdk_likely(spdk_u32_is_pow2(block_size))) {
5623 		shift_cnt = spdk_u32log2(block_size);
5624 		*offset_blocks = offset_bytes >> shift_cnt;
5625 		*num_blocks = num_bytes >> shift_cnt;
5626 		return (offset_bytes - (*offset_blocks << shift_cnt)) |
5627 		       (num_bytes - (*num_blocks << shift_cnt));
5628 	} else {
5629 		*offset_blocks = offset_bytes / block_size;
5630 		*num_blocks = num_bytes / block_size;
5631 		return (offset_bytes % block_size) | (num_bytes % block_size);
5632 	}
5633 }
5634 
5635 static bool
5636 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
5637 {
5638 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
5639 	 * has been an overflow and hence the offset has been wrapped around */
5640 	if (offset_blocks + num_blocks < offset_blocks) {
5641 		return false;
5642 	}
5643 
5644 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
5645 	if (offset_blocks + num_blocks > bdev->blockcnt) {
5646 		return false;
5647 	}
5648 
5649 	return true;
5650 }
5651 
5652 static void
5653 bdev_seek_complete_cb(void *ctx)
5654 {
5655 	struct spdk_bdev_io *bdev_io = ctx;
5656 
5657 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5658 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
5659 }
5660 
5661 static int
5662 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5663 	  uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
5664 	  spdk_bdev_io_completion_cb cb, void *cb_arg)
5665 {
5666 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5667 	struct spdk_bdev_io *bdev_io;
5668 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5669 
5670 	assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
5671 
5672 	/* Check if offset_blocks is valid looking at the validity of one block */
5673 	if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
5674 		return -EINVAL;
5675 	}
5676 
5677 	bdev_io = bdev_channel_get_io(channel);
5678 	if (!bdev_io) {
5679 		return -ENOMEM;
5680 	}
5681 
5682 	bdev_io->internal.ch = channel;
5683 	bdev_io->internal.desc = desc;
5684 	bdev_io->type = io_type;
5685 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5686 	bdev_io->u.bdev.memory_domain = NULL;
5687 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5688 	bdev_io->u.bdev.accel_sequence = NULL;
5689 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5690 
5691 	if (!spdk_bdev_io_type_supported(bdev, io_type)) {
5692 		/* In case bdev doesn't support seek to next data/hole offset,
5693 		 * it is assumed that only data and no holes are present */
5694 		if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
5695 			bdev_io->u.bdev.seek.offset = offset_blocks;
5696 		} else {
5697 			bdev_io->u.bdev.seek.offset = UINT64_MAX;
5698 		}
5699 
5700 		spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
5701 		return 0;
5702 	}
5703 
5704 	bdev_io_submit(bdev_io);
5705 	return 0;
5706 }
5707 
5708 int
5709 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5710 		    uint64_t offset_blocks,
5711 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5712 {
5713 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
5714 }
5715 
5716 int
5717 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5718 		    uint64_t offset_blocks,
5719 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5720 {
5721 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
5722 }
5723 
5724 uint64_t
5725 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
5726 {
5727 	return bdev_io->u.bdev.seek.offset;
5728 }
5729 
5730 static int
5731 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
5732 			 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5733 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5734 {
5735 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5736 	struct spdk_bdev_io *bdev_io;
5737 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5738 
5739 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5740 		return -EINVAL;
5741 	}
5742 
5743 	bdev_io = bdev_channel_get_io(channel);
5744 	if (!bdev_io) {
5745 		return -ENOMEM;
5746 	}
5747 
5748 	bdev_io->internal.ch = channel;
5749 	bdev_io->internal.desc = desc;
5750 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5751 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5752 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5753 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
5754 	bdev_io->u.bdev.iovcnt = 1;
5755 	bdev_io->u.bdev.md_buf = md_buf;
5756 	bdev_io->u.bdev.num_blocks = num_blocks;
5757 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5758 	bdev_io->u.bdev.memory_domain = NULL;
5759 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5760 	bdev_io->u.bdev.accel_sequence = NULL;
5761 	bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
5762 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5763 
5764 	bdev_io_submit(bdev_io);
5765 	return 0;
5766 }
5767 
5768 int
5769 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5770 	       void *buf, uint64_t offset, uint64_t nbytes,
5771 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
5772 {
5773 	uint64_t offset_blocks, num_blocks;
5774 
5775 	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
5776 		return -EINVAL;
5777 	}
5778 
5779 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5780 }
5781 
5782 int
5783 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5784 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5785 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5786 {
5787 	return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
5788 }
5789 
5790 int
5791 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5792 			      void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5793 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5794 {
5795 	struct iovec iov = {
5796 		.iov_base = buf,
5797 	};
5798 
5799 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5800 		return -EINVAL;
5801 	}
5802 
5803 	if ((md_buf || desc->opts.hide_metadata) && !_is_buf_allocated(&iov)) {
5804 		return -EINVAL;
5805 	}
5806 
5807 	return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5808 					cb, cb_arg);
5809 }
5810 
5811 int
5812 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5813 		struct iovec *iov, int iovcnt,
5814 		uint64_t offset, uint64_t nbytes,
5815 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5816 {
5817 	uint64_t offset_blocks, num_blocks;
5818 
5819 	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
5820 		return -EINVAL;
5821 	}
5822 
5823 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5824 }
5825 
5826 static int
5827 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5828 			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
5829 			  uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5830 			  struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
5831 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5832 {
5833 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5834 	struct spdk_bdev_io *bdev_io;
5835 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5836 
5837 	if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
5838 		return -EINVAL;
5839 	}
5840 
5841 	bdev_io = bdev_channel_get_io(channel);
5842 	if (spdk_unlikely(!bdev_io)) {
5843 		return -ENOMEM;
5844 	}
5845 
5846 	bdev_io->internal.ch = channel;
5847 	bdev_io->internal.desc = desc;
5848 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5849 	bdev_io->u.bdev.iovs = iov;
5850 	bdev_io->u.bdev.iovcnt = iovcnt;
5851 	bdev_io->u.bdev.md_buf = md_buf;
5852 	bdev_io->u.bdev.num_blocks = num_blocks;
5853 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5854 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5855 
5856 	if (seq != NULL) {
5857 		bdev_io->internal.f.has_accel_sequence = true;
5858 		bdev_io->internal.accel_sequence = seq;
5859 	}
5860 
5861 	if (domain != NULL) {
5862 		bdev_io->internal.f.has_memory_domain = true;
5863 		bdev_io->internal.memory_domain = domain;
5864 		bdev_io->internal.memory_domain_ctx = domain_ctx;
5865 	}
5866 
5867 	bdev_io->u.bdev.memory_domain = domain;
5868 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5869 	bdev_io->u.bdev.accel_sequence = seq;
5870 	bdev_io->u.bdev.dif_check_flags = dif_check_flags;
5871 
5872 	_bdev_io_submit_ext(desc, bdev_io);
5873 
5874 	return 0;
5875 }
5876 
5877 int
5878 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5879 		       struct iovec *iov, int iovcnt,
5880 		       uint64_t offset_blocks, uint64_t num_blocks,
5881 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5882 {
5883 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5884 
5885 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5886 					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5887 }
5888 
5889 int
5890 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5891 			       struct iovec *iov, int iovcnt, void *md_buf,
5892 			       uint64_t offset_blocks, uint64_t num_blocks,
5893 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5894 {
5895 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5896 
5897 	if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
5898 		return -EINVAL;
5899 	}
5900 
5901 	if (md_buf && !_is_buf_allocated(iov)) {
5902 		return -EINVAL;
5903 	}
5904 
5905 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5906 					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5907 }
5908 
5909 static inline bool
5910 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5911 {
5912 	/*
5913 	 * We check if opts size is at least of size when we first introduced
5914 	 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members
5915 	 * are not checked internal.
5916 	 */
5917 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5918 	       sizeof(opts->metadata) &&
5919 	       opts->size <= sizeof(*opts) &&
5920 	       /* When memory domain is used, the user must provide data buffers */
5921 	       (!opts->memory_domain || (iov && iov[0].iov_base));
5922 }
5923 
5924 int
5925 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5926 			   struct iovec *iov, int iovcnt,
5927 			   uint64_t offset_blocks, uint64_t num_blocks,
5928 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
5929 			   struct spdk_bdev_ext_io_opts *opts)
5930 {
5931 	struct spdk_memory_domain *domain = NULL;
5932 	struct spdk_accel_sequence *seq = NULL;
5933 	void *domain_ctx = NULL, *md = NULL;
5934 	uint32_t dif_check_flags = 0;
5935 	uint32_t nvme_cdw12_raw;
5936 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5937 
5938 	if (opts) {
5939 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5940 			return -EINVAL;
5941 		}
5942 
5943 		md = opts->metadata;
5944 		domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
5945 		domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
5946 		seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
5947 		nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0);
5948 		if (md) {
5949 			if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
5950 				return -EINVAL;
5951 			}
5952 
5953 			if (spdk_unlikely(!_is_buf_allocated(iov))) {
5954 				return -EINVAL;
5955 			}
5956 
5957 			if (spdk_unlikely(seq != NULL)) {
5958 				return -EINVAL;
5959 			}
5960 
5961 			if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
5962 				SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n");
5963 				return -ENOTSUP;
5964 			}
5965 		}
5966 
5967 		if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
5968 			dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT;
5969 		}
5970 	}
5971 
5972 	dif_check_flags |= bdev->dif_check_flags &
5973 			   ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
5974 
5975 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5976 					 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg);
5977 }
5978 
5979 static int
5980 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5981 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5982 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5983 {
5984 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5985 	struct spdk_bdev_io *bdev_io;
5986 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5987 
5988 	if (!desc->write) {
5989 		return -EBADF;
5990 	}
5991 
5992 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5993 		return -EINVAL;
5994 	}
5995 
5996 	bdev_io = bdev_channel_get_io(channel);
5997 	if (!bdev_io) {
5998 		return -ENOMEM;
5999 	}
6000 
6001 	bdev_io->internal.ch = channel;
6002 	bdev_io->internal.desc = desc;
6003 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
6004 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6005 	bdev_io->u.bdev.iovs[0].iov_base = buf;
6006 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
6007 	bdev_io->u.bdev.iovcnt = 1;
6008 	bdev_io->u.bdev.md_buf = md_buf;
6009 	bdev_io->u.bdev.num_blocks = num_blocks;
6010 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6011 	bdev_io->u.bdev.memory_domain = NULL;
6012 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6013 	bdev_io->u.bdev.accel_sequence = NULL;
6014 	bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags;
6015 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6016 
6017 	bdev_io_submit(bdev_io);
6018 	return 0;
6019 }
6020 
6021 int
6022 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6023 		void *buf, uint64_t offset, uint64_t nbytes,
6024 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6025 {
6026 	uint64_t offset_blocks, num_blocks;
6027 
6028 	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
6029 		return -EINVAL;
6030 	}
6031 
6032 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
6033 }
6034 
6035 int
6036 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6037 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
6038 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6039 {
6040 	return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
6041 					 cb, cb_arg);
6042 }
6043 
6044 int
6045 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6046 			       void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6047 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
6048 {
6049 	struct iovec iov = {
6050 		.iov_base = buf,
6051 	};
6052 
6053 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6054 		return -EINVAL;
6055 	}
6056 
6057 	if (md_buf && !_is_buf_allocated(&iov)) {
6058 		return -EINVAL;
6059 	}
6060 
6061 	return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
6062 					 cb, cb_arg);
6063 }
6064 
6065 static int
6066 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6067 			   struct iovec *iov, int iovcnt, void *md_buf,
6068 			   uint64_t offset_blocks, uint64_t num_blocks,
6069 			   struct spdk_memory_domain *domain, void *domain_ctx,
6070 			   struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
6071 			   uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
6072 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
6073 {
6074 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6075 	struct spdk_bdev_io *bdev_io;
6076 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6077 
6078 	if (spdk_unlikely(!desc->write)) {
6079 		return -EBADF;
6080 	}
6081 
6082 	if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
6083 		return -EINVAL;
6084 	}
6085 
6086 	bdev_io = bdev_channel_get_io(channel);
6087 	if (spdk_unlikely(!bdev_io)) {
6088 		return -ENOMEM;
6089 	}
6090 
6091 	bdev_io->internal.ch = channel;
6092 	bdev_io->internal.desc = desc;
6093 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
6094 	bdev_io->u.bdev.iovs = iov;
6095 	bdev_io->u.bdev.iovcnt = iovcnt;
6096 	bdev_io->u.bdev.md_buf = md_buf;
6097 	bdev_io->u.bdev.num_blocks = num_blocks;
6098 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6099 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6100 	if (seq != NULL) {
6101 		bdev_io->internal.f.has_accel_sequence = true;
6102 		bdev_io->internal.accel_sequence = seq;
6103 	}
6104 
6105 	if (domain != NULL) {
6106 		bdev_io->internal.f.has_memory_domain = true;
6107 		bdev_io->internal.memory_domain = domain;
6108 		bdev_io->internal.memory_domain_ctx = domain_ctx;
6109 	}
6110 
6111 	bdev_io->u.bdev.memory_domain = domain;
6112 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
6113 	bdev_io->u.bdev.accel_sequence = seq;
6114 	bdev_io->u.bdev.dif_check_flags = dif_check_flags;
6115 	bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw;
6116 	bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw;
6117 
6118 	_bdev_io_submit_ext(desc, bdev_io);
6119 
6120 	return 0;
6121 }
6122 
6123 int
6124 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6125 		 struct iovec *iov, int iovcnt,
6126 		 uint64_t offset, uint64_t len,
6127 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
6128 {
6129 	uint64_t offset_blocks, num_blocks;
6130 
6131 	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
6132 		return -EINVAL;
6133 	}
6134 
6135 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
6136 }
6137 
6138 int
6139 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6140 			struct iovec *iov, int iovcnt,
6141 			uint64_t offset_blocks, uint64_t num_blocks,
6142 			spdk_bdev_io_completion_cb cb, void *cb_arg)
6143 {
6144 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6145 
6146 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
6147 					  num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
6148 					  cb, cb_arg);
6149 }
6150 
6151 int
6152 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6153 				struct iovec *iov, int iovcnt, void *md_buf,
6154 				uint64_t offset_blocks, uint64_t num_blocks,
6155 				spdk_bdev_io_completion_cb cb, void *cb_arg)
6156 {
6157 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6158 
6159 	if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
6160 		return -EINVAL;
6161 	}
6162 
6163 	if (md_buf && !_is_buf_allocated(iov)) {
6164 		return -EINVAL;
6165 	}
6166 
6167 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
6168 					  num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0,
6169 					  cb, cb_arg);
6170 }
6171 
6172 int
6173 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6174 			    struct iovec *iov, int iovcnt,
6175 			    uint64_t offset_blocks, uint64_t num_blocks,
6176 			    spdk_bdev_io_completion_cb cb, void *cb_arg,
6177 			    struct spdk_bdev_ext_io_opts *opts)
6178 {
6179 	struct spdk_memory_domain *domain = NULL;
6180 	struct spdk_accel_sequence *seq = NULL;
6181 	void *domain_ctx = NULL, *md = NULL;
6182 	uint32_t dif_check_flags = 0;
6183 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6184 	uint32_t nvme_cdw12_raw = 0;
6185 	uint32_t nvme_cdw13_raw = 0;
6186 
6187 	if (opts) {
6188 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
6189 			return -EINVAL;
6190 		}
6191 		md = opts->metadata;
6192 		domain = bdev_get_ext_io_opt(opts, memory_domain, NULL);
6193 		domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL);
6194 		seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL);
6195 		nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0);
6196 		nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0);
6197 		if (md) {
6198 			if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) {
6199 				return -EINVAL;
6200 			}
6201 
6202 			if (spdk_unlikely(!_is_buf_allocated(iov))) {
6203 				return -EINVAL;
6204 			}
6205 
6206 			if (spdk_unlikely(seq != NULL)) {
6207 				return -EINVAL;
6208 			}
6209 
6210 			if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
6211 				SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n");
6212 				return -ENOTSUP;
6213 			}
6214 		}
6215 
6216 		if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) {
6217 			dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT;
6218 		}
6219 	}
6220 
6221 	dif_check_flags |= bdev->dif_check_flags &
6222 			   ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0));
6223 
6224 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks,
6225 					  domain, domain_ctx, seq, dif_check_flags,
6226 					  nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg);
6227 }
6228 
6229 static void
6230 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6231 {
6232 	struct spdk_bdev_io *parent_io = cb_arg;
6233 	struct spdk_bdev *bdev = parent_io->bdev;
6234 	uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
6235 	int i, rc = 0;
6236 
6237 	if (!success) {
6238 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6239 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
6240 		spdk_bdev_free_io(bdev_io);
6241 		return;
6242 	}
6243 
6244 	for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
6245 		rc = memcmp(read_buf,
6246 			    parent_io->u.bdev.iovs[i].iov_base,
6247 			    parent_io->u.bdev.iovs[i].iov_len);
6248 		if (rc) {
6249 			break;
6250 		}
6251 		read_buf += parent_io->u.bdev.iovs[i].iov_len;
6252 	}
6253 
6254 	if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
6255 		rc = memcmp(bdev_io->u.bdev.md_buf,
6256 			    parent_io->u.bdev.md_buf,
6257 			    spdk_bdev_get_md_size(bdev));
6258 	}
6259 
6260 	spdk_bdev_free_io(bdev_io);
6261 
6262 	if (rc == 0) {
6263 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6264 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
6265 	} else {
6266 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
6267 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
6268 	}
6269 }
6270 
6271 static void
6272 bdev_compare_do_read(void *_bdev_io)
6273 {
6274 	struct spdk_bdev_io *bdev_io = _bdev_io;
6275 	int rc;
6276 
6277 	rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
6278 				   spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
6279 				   bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6280 				   bdev_compare_do_read_done, bdev_io);
6281 
6282 	if (rc == -ENOMEM) {
6283 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
6284 	} else if (rc != 0) {
6285 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6286 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
6287 	}
6288 }
6289 
6290 static int
6291 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6292 			     struct iovec *iov, int iovcnt, void *md_buf,
6293 			     uint64_t offset_blocks, uint64_t num_blocks,
6294 			     spdk_bdev_io_completion_cb cb, void *cb_arg)
6295 {
6296 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6297 	struct spdk_bdev_io *bdev_io;
6298 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6299 
6300 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6301 		return -EINVAL;
6302 	}
6303 
6304 	bdev_io = bdev_channel_get_io(channel);
6305 	if (!bdev_io) {
6306 		return -ENOMEM;
6307 	}
6308 
6309 	bdev_io->internal.ch = channel;
6310 	bdev_io->internal.desc = desc;
6311 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
6312 	bdev_io->u.bdev.iovs = iov;
6313 	bdev_io->u.bdev.iovcnt = iovcnt;
6314 	bdev_io->u.bdev.md_buf = md_buf;
6315 	bdev_io->u.bdev.num_blocks = num_blocks;
6316 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6317 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6318 	bdev_io->u.bdev.memory_domain = NULL;
6319 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6320 	bdev_io->u.bdev.accel_sequence = NULL;
6321 
6322 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
6323 		bdev_io_submit(bdev_io);
6324 		return 0;
6325 	}
6326 
6327 	bdev_compare_do_read(bdev_io);
6328 
6329 	return 0;
6330 }
6331 
6332 int
6333 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6334 			  struct iovec *iov, int iovcnt,
6335 			  uint64_t offset_blocks, uint64_t num_blocks,
6336 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
6337 {
6338 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
6339 					    num_blocks, cb, cb_arg);
6340 }
6341 
6342 int
6343 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6344 				  struct iovec *iov, int iovcnt, void *md_buf,
6345 				  uint64_t offset_blocks, uint64_t num_blocks,
6346 				  spdk_bdev_io_completion_cb cb, void *cb_arg)
6347 {
6348 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6349 		return -EINVAL;
6350 	}
6351 
6352 	if (md_buf && !_is_buf_allocated(iov)) {
6353 		return -EINVAL;
6354 	}
6355 
6356 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
6357 					    num_blocks, cb, cb_arg);
6358 }
6359 
6360 static int
6361 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6362 			    void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6363 			    spdk_bdev_io_completion_cb cb, void *cb_arg)
6364 {
6365 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6366 	struct spdk_bdev_io *bdev_io;
6367 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6368 
6369 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6370 		return -EINVAL;
6371 	}
6372 
6373 	bdev_io = bdev_channel_get_io(channel);
6374 	if (!bdev_io) {
6375 		return -ENOMEM;
6376 	}
6377 
6378 	bdev_io->internal.ch = channel;
6379 	bdev_io->internal.desc = desc;
6380 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
6381 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6382 	bdev_io->u.bdev.iovs[0].iov_base = buf;
6383 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc);
6384 	bdev_io->u.bdev.iovcnt = 1;
6385 	bdev_io->u.bdev.md_buf = md_buf;
6386 	bdev_io->u.bdev.num_blocks = num_blocks;
6387 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6388 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6389 	bdev_io->u.bdev.memory_domain = NULL;
6390 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6391 	bdev_io->u.bdev.accel_sequence = NULL;
6392 
6393 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
6394 		bdev_io_submit(bdev_io);
6395 		return 0;
6396 	}
6397 
6398 	bdev_compare_do_read(bdev_io);
6399 
6400 	return 0;
6401 }
6402 
6403 int
6404 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6405 			 void *buf, uint64_t offset_blocks, uint64_t num_blocks,
6406 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
6407 {
6408 	return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
6409 					   cb, cb_arg);
6410 }
6411 
6412 int
6413 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6414 				 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
6415 				 spdk_bdev_io_completion_cb cb, void *cb_arg)
6416 {
6417 	struct iovec iov = {
6418 		.iov_base = buf,
6419 	};
6420 
6421 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
6422 		return -EINVAL;
6423 	}
6424 
6425 	if (md_buf && !_is_buf_allocated(&iov)) {
6426 		return -EINVAL;
6427 	}
6428 
6429 	return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
6430 					   cb, cb_arg);
6431 }
6432 
6433 static void
6434 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status)
6435 {
6436 	struct spdk_bdev_io *bdev_io = ctx;
6437 
6438 	if (unlock_status) {
6439 		SPDK_ERRLOG("LBA range unlock failed\n");
6440 	}
6441 
6442 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
6443 			     false, bdev_io->internal.caller_ctx);
6444 }
6445 
6446 static void
6447 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
6448 {
6449 	bdev_io->internal.status = status;
6450 
6451 	bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
6452 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6453 			      bdev_comparev_and_writev_blocks_unlocked, bdev_io);
6454 }
6455 
6456 static void
6457 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6458 {
6459 	struct spdk_bdev_io *parent_io = cb_arg;
6460 
6461 	if (!success) {
6462 		SPDK_ERRLOG("Compare and write operation failed\n");
6463 	}
6464 
6465 	spdk_bdev_free_io(bdev_io);
6466 
6467 	bdev_comparev_and_writev_blocks_unlock(parent_io,
6468 					       success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
6469 }
6470 
6471 static void
6472 bdev_compare_and_write_do_write(void *_bdev_io)
6473 {
6474 	struct spdk_bdev_io *bdev_io = _bdev_io;
6475 	int rc;
6476 
6477 	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
6478 				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
6479 				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
6480 				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6481 				     bdev_compare_and_write_do_write_done, bdev_io);
6482 
6483 
6484 	if (rc == -ENOMEM) {
6485 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
6486 	} else if (rc != 0) {
6487 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6488 	}
6489 }
6490 
6491 static void
6492 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6493 {
6494 	struct spdk_bdev_io *parent_io = cb_arg;
6495 
6496 	spdk_bdev_free_io(bdev_io);
6497 
6498 	if (!success) {
6499 		bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
6500 		return;
6501 	}
6502 
6503 	bdev_compare_and_write_do_write(parent_io);
6504 }
6505 
6506 static void
6507 bdev_compare_and_write_do_compare(void *_bdev_io)
6508 {
6509 	struct spdk_bdev_io *bdev_io = _bdev_io;
6510 	int rc;
6511 
6512 	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
6513 				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
6514 				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
6515 				       bdev_compare_and_write_do_compare_done, bdev_io);
6516 
6517 	if (rc == -ENOMEM) {
6518 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
6519 	} else if (rc != 0) {
6520 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
6521 	}
6522 }
6523 
6524 static void
6525 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status)
6526 {
6527 	struct spdk_bdev_io *bdev_io = ctx;
6528 
6529 	if (status) {
6530 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
6531 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
6532 		return;
6533 	}
6534 
6535 	bdev_compare_and_write_do_compare(bdev_io);
6536 }
6537 
6538 int
6539 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6540 				     struct iovec *compare_iov, int compare_iovcnt,
6541 				     struct iovec *write_iov, int write_iovcnt,
6542 				     uint64_t offset_blocks, uint64_t num_blocks,
6543 				     spdk_bdev_io_completion_cb cb, void *cb_arg)
6544 {
6545 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6546 	struct spdk_bdev_io *bdev_io;
6547 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6548 
6549 	if (!desc->write) {
6550 		return -EBADF;
6551 	}
6552 
6553 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6554 		return -EINVAL;
6555 	}
6556 
6557 	if (num_blocks > bdev->acwu) {
6558 		return -EINVAL;
6559 	}
6560 
6561 	bdev_io = bdev_channel_get_io(channel);
6562 	if (!bdev_io) {
6563 		return -ENOMEM;
6564 	}
6565 
6566 	bdev_io->internal.ch = channel;
6567 	bdev_io->internal.desc = desc;
6568 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
6569 	bdev_io->u.bdev.iovs = compare_iov;
6570 	bdev_io->u.bdev.iovcnt = compare_iovcnt;
6571 	bdev_io->u.bdev.fused_iovs = write_iov;
6572 	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
6573 	bdev_io->u.bdev.md_buf = NULL;
6574 	bdev_io->u.bdev.num_blocks = num_blocks;
6575 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6576 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6577 	bdev_io->u.bdev.memory_domain = NULL;
6578 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6579 	bdev_io->u.bdev.accel_sequence = NULL;
6580 
6581 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
6582 		bdev_io_submit(bdev_io);
6583 		return 0;
6584 	}
6585 
6586 	return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
6587 				   bdev_comparev_and_writev_blocks_locked, bdev_io);
6588 }
6589 
6590 int
6591 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6592 		      struct iovec *iov, int iovcnt,
6593 		      uint64_t offset_blocks, uint64_t num_blocks,
6594 		      bool populate,
6595 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
6596 {
6597 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6598 	struct spdk_bdev_io *bdev_io;
6599 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6600 
6601 	if (!desc->write) {
6602 		return -EBADF;
6603 	}
6604 
6605 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6606 		return -EINVAL;
6607 	}
6608 
6609 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
6610 		return -ENOTSUP;
6611 	}
6612 
6613 	bdev_io = bdev_channel_get_io(channel);
6614 	if (!bdev_io) {
6615 		return -ENOMEM;
6616 	}
6617 
6618 	bdev_io->internal.ch = channel;
6619 	bdev_io->internal.desc = desc;
6620 	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
6621 	bdev_io->u.bdev.num_blocks = num_blocks;
6622 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6623 	bdev_io->u.bdev.iovs = iov;
6624 	bdev_io->u.bdev.iovcnt = iovcnt;
6625 	bdev_io->u.bdev.md_buf = NULL;
6626 	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
6627 	bdev_io->u.bdev.zcopy.commit = 0;
6628 	bdev_io->u.bdev.zcopy.start = 1;
6629 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6630 	bdev_io->u.bdev.memory_domain = NULL;
6631 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6632 	bdev_io->u.bdev.accel_sequence = NULL;
6633 
6634 	bdev_io_submit(bdev_io);
6635 
6636 	return 0;
6637 }
6638 
6639 int
6640 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
6641 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
6642 {
6643 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
6644 		return -EINVAL;
6645 	}
6646 
6647 	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
6648 	bdev_io->u.bdev.zcopy.start = 0;
6649 	bdev_io->internal.caller_ctx = cb_arg;
6650 	bdev_io->internal.cb = cb;
6651 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6652 
6653 	bdev_io_submit(bdev_io);
6654 
6655 	return 0;
6656 }
6657 
6658 int
6659 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6660 		       uint64_t offset, uint64_t len,
6661 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6662 {
6663 	uint64_t offset_blocks, num_blocks;
6664 
6665 	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
6666 		return -EINVAL;
6667 	}
6668 
6669 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6670 }
6671 
6672 int
6673 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6674 			      uint64_t offset_blocks, uint64_t num_blocks,
6675 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6676 {
6677 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6678 	struct spdk_bdev_io *bdev_io;
6679 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6680 
6681 	if (!desc->write) {
6682 		return -EBADF;
6683 	}
6684 
6685 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6686 		return -EINVAL;
6687 	}
6688 
6689 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6690 	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6691 		return -ENOTSUP;
6692 	}
6693 
6694 	bdev_io = bdev_channel_get_io(channel);
6695 
6696 	if (!bdev_io) {
6697 		return -ENOMEM;
6698 	}
6699 
6700 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6701 	bdev_io->internal.ch = channel;
6702 	bdev_io->internal.desc = desc;
6703 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6704 	bdev_io->u.bdev.num_blocks = num_blocks;
6705 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6706 	bdev_io->u.bdev.memory_domain = NULL;
6707 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6708 	bdev_io->u.bdev.accel_sequence = NULL;
6709 
6710 	/* If the write_zeroes size is large and should be split, use the generic split
6711 	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not.
6712 	 *
6713 	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6714 	 * or emulate it using regular write request otherwise.
6715 	 */
6716 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) ||
6717 	    bdev_io->internal.f.split) {
6718 		bdev_io_submit(bdev_io);
6719 		return 0;
6720 	}
6721 
6722 	assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
6723 
6724 	return bdev_write_zero_buffer(bdev_io);
6725 }
6726 
6727 int
6728 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6729 		uint64_t offset, uint64_t nbytes,
6730 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6731 {
6732 	uint64_t offset_blocks, num_blocks;
6733 
6734 	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) {
6735 		return -EINVAL;
6736 	}
6737 
6738 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6739 }
6740 
6741 static void
6742 bdev_io_complete_cb(void *ctx)
6743 {
6744 	struct spdk_bdev_io *bdev_io = ctx;
6745 
6746 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6747 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
6748 }
6749 
6750 int
6751 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6752 		       uint64_t offset_blocks, uint64_t num_blocks,
6753 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6754 {
6755 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6756 	struct spdk_bdev_io *bdev_io;
6757 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6758 
6759 	if (!desc->write) {
6760 		return -EBADF;
6761 	}
6762 
6763 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6764 		return -EINVAL;
6765 	}
6766 
6767 	bdev_io = bdev_channel_get_io(channel);
6768 	if (!bdev_io) {
6769 		return -ENOMEM;
6770 	}
6771 
6772 	bdev_io->internal.ch = channel;
6773 	bdev_io->internal.desc = desc;
6774 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
6775 
6776 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6777 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
6778 	bdev_io->u.bdev.iovs[0].iov_len = 0;
6779 	bdev_io->u.bdev.iovcnt = 1;
6780 
6781 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6782 	bdev_io->u.bdev.num_blocks = num_blocks;
6783 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6784 	bdev_io->u.bdev.memory_domain = NULL;
6785 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6786 	bdev_io->u.bdev.accel_sequence = NULL;
6787 
6788 	if (num_blocks == 0) {
6789 		spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
6790 		return 0;
6791 	}
6792 
6793 	bdev_io_submit(bdev_io);
6794 	return 0;
6795 }
6796 
6797 int
6798 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6799 		uint64_t offset, uint64_t length,
6800 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6801 {
6802 	uint64_t offset_blocks, num_blocks;
6803 
6804 	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) {
6805 		return -EINVAL;
6806 	}
6807 
6808 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6809 }
6810 
6811 int
6812 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6813 		       uint64_t offset_blocks, uint64_t num_blocks,
6814 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6815 {
6816 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6817 	struct spdk_bdev_io *bdev_io;
6818 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6819 
6820 	if (!desc->write) {
6821 		return -EBADF;
6822 	}
6823 
6824 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) {
6825 		return -ENOTSUP;
6826 	}
6827 
6828 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6829 		return -EINVAL;
6830 	}
6831 
6832 	bdev_io = bdev_channel_get_io(channel);
6833 	if (!bdev_io) {
6834 		return -ENOMEM;
6835 	}
6836 
6837 	bdev_io->internal.ch = channel;
6838 	bdev_io->internal.desc = desc;
6839 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6840 	bdev_io->u.bdev.iovs = NULL;
6841 	bdev_io->u.bdev.iovcnt = 0;
6842 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6843 	bdev_io->u.bdev.num_blocks = num_blocks;
6844 	bdev_io->u.bdev.memory_domain = NULL;
6845 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6846 	bdev_io->u.bdev.accel_sequence = NULL;
6847 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6848 
6849 	bdev_io_submit(bdev_io);
6850 	return 0;
6851 }
6852 
6853 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6854 
6855 static void
6856 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6857 {
6858 	struct spdk_bdev_io *bdev_io = _ctx;
6859 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
6860 
6861 	if (status == -EBUSY) {
6862 		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6863 			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6864 							      bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6865 		} else {
6866 			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6867 				/* If outstanding IOs are still present and reset_io_drain_timeout
6868 				 * seconds passed, start the reset. */
6869 				bdev_io_submit_reset(bdev_io);
6870 			} else {
6871 				/* We still have in progress memory domain pull/push or we're
6872 				 * executing accel sequence.  Since we cannot abort either of those
6873 				 * operations, fail the reset request. */
6874 				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6875 			}
6876 		}
6877 	} else {
6878 		SPDK_DEBUGLOG(bdev,
6879 			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6880 			      ch->bdev->name);
6881 		/* Mark the completion status as a SUCCESS and complete the reset. */
6882 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6883 	}
6884 }
6885 
6886 static void
6887 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6888 				struct spdk_io_channel *io_ch, void *_ctx)
6889 {
6890 	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6891 	int status = 0;
6892 
6893 	if (cur_ch->io_outstanding > 0 ||
6894 	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6895 	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6896 		/* If a channel has outstanding IO, set status to -EBUSY code. This will stop
6897 		 * further iteration over the rest of the channels and pass non-zero status
6898 		 * to the callback function. */
6899 		status = -EBUSY;
6900 	}
6901 	spdk_bdev_for_each_channel_continue(i, status);
6902 }
6903 
6904 static int
6905 bdev_reset_poll_for_outstanding_io(void *ctx)
6906 {
6907 	struct spdk_bdev_io *bdev_io = ctx;
6908 
6909 	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
6910 	spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io,
6911 				   bdev_reset_check_outstanding_io_done);
6912 
6913 	return SPDK_POLLER_BUSY;
6914 }
6915 
6916 static void
6917 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
6918 {
6919 	struct spdk_bdev_io *bdev_io = _ctx;
6920 
6921 	if (bdev->reset_io_drain_timeout == 0) {
6922 		bdev_io_submit_reset(bdev_io);
6923 		return;
6924 	}
6925 
6926 	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
6927 			(bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
6928 
6929 	/* In case bdev->reset_io_drain_timeout is not equal to zero,
6930 	 * submit the reset to the underlying module only if outstanding I/O
6931 	 * remain after reset_io_drain_timeout seconds have passed. */
6932 	spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io,
6933 				   bdev_reset_check_outstanding_io_done);
6934 }
6935 
6936 static void
6937 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6938 			  struct spdk_io_channel *ch, void *_ctx)
6939 {
6940 	struct spdk_bdev_channel	*channel;
6941 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
6942 	struct spdk_bdev_shared_resource *shared_resource;
6943 
6944 	channel = __io_ch_to_bdev_ch(ch);
6945 	shared_resource = channel->shared_resource;
6946 	mgmt_channel = shared_resource->mgmt_ch;
6947 
6948 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
6949 
6950 	/**
6951 	 * Abort nomem I/Os first so that aborting other queued I/Os won't resubmit
6952 	 * nomem I/Os of this channel.
6953 	 */
6954 	bdev_abort_all_nomem_io(channel);
6955 	bdev_abort_all_buf_io(mgmt_channel, channel);
6956 
6957 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
6958 		bdev_abort_all_queued_io(&channel->qos_queued_io, channel);
6959 	}
6960 
6961 	spdk_bdev_for_each_channel_continue(i, 0);
6962 }
6963 
6964 static void
6965 bdev_start_reset(struct spdk_bdev_io *bdev_io)
6966 {
6967 	struct spdk_bdev *bdev = bdev_io->bdev;
6968 	bool freeze_channel = false;
6969 
6970 	bdev_ch_add_to_io_submitted(bdev_io);
6971 
6972 	/**
6973 	 * Take a channel reference for the target bdev for the life of this
6974 	 *  reset.  This guards against the channel getting destroyed before
6975 	 *  the reset is completed.  We will release the reference when this
6976 	 *  reset is completed.
6977 	 */
6978 	bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
6979 
6980 	spdk_spin_lock(&bdev->internal.spinlock);
6981 	if (bdev->internal.reset_in_progress == NULL) {
6982 		bdev->internal.reset_in_progress = bdev_io;
6983 		freeze_channel = true;
6984 	} else {
6985 		TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link);
6986 	}
6987 	spdk_spin_unlock(&bdev->internal.spinlock);
6988 
6989 	if (freeze_channel) {
6990 		spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io,
6991 					   bdev_reset_freeze_channel_done);
6992 	}
6993 }
6994 
6995 int
6996 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6997 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6998 {
6999 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7000 	struct spdk_bdev_io *bdev_io;
7001 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7002 
7003 	bdev_io = bdev_channel_get_io(channel);
7004 	if (!bdev_io) {
7005 		return -ENOMEM;
7006 	}
7007 
7008 	bdev_io->internal.ch = channel;
7009 	bdev_io->internal.desc = desc;
7010 	bdev_io->internal.submit_tsc = spdk_get_ticks();
7011 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
7012 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7013 
7014 	bdev_start_reset(bdev_io);
7015 	return 0;
7016 }
7017 
7018 void
7019 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
7020 		      struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode)
7021 {
7022 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7023 
7024 	bdev_get_io_stat(stat, channel->stat);
7025 	spdk_bdev_reset_io_stat(channel->stat, reset_mode);
7026 }
7027 
7028 static void
7029 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
7030 {
7031 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
7032 
7033 	bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
7034 			    bdev_iostat_ctx->cb_arg, 0);
7035 	free(bdev_iostat_ctx);
7036 }
7037 
7038 static void
7039 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7040 			   struct spdk_io_channel *ch, void *_ctx)
7041 {
7042 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
7043 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7044 
7045 	spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat);
7046 	spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode);
7047 	spdk_bdev_for_each_channel_continue(i, 0);
7048 }
7049 
7050 void
7051 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
7052 			  enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg)
7053 {
7054 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
7055 
7056 	assert(bdev != NULL);
7057 	assert(stat != NULL);
7058 	assert(cb != NULL);
7059 
7060 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
7061 	if (bdev_iostat_ctx == NULL) {
7062 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
7063 		cb(bdev, stat, cb_arg, -ENOMEM);
7064 		return;
7065 	}
7066 
7067 	bdev_iostat_ctx->stat = stat;
7068 	bdev_iostat_ctx->cb = cb;
7069 	bdev_iostat_ctx->cb_arg = cb_arg;
7070 	bdev_iostat_ctx->reset_mode = reset_mode;
7071 
7072 	/* Start with the statistics from previously deleted channels. */
7073 	spdk_spin_lock(&bdev->internal.spinlock);
7074 	bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat);
7075 	spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode);
7076 	spdk_spin_unlock(&bdev->internal.spinlock);
7077 
7078 	/* Then iterate and add the statistics from each existing channel. */
7079 	spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
7080 				   bdev_get_device_stat_done);
7081 }
7082 
7083 struct bdev_iostat_reset_ctx {
7084 	enum spdk_bdev_reset_stat_mode mode;
7085 	bdev_reset_device_stat_cb cb;
7086 	void *cb_arg;
7087 };
7088 
7089 static void
7090 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
7091 {
7092 	struct bdev_iostat_reset_ctx *ctx = _ctx;
7093 
7094 	ctx->cb(bdev, ctx->cb_arg, 0);
7095 
7096 	free(ctx);
7097 }
7098 
7099 static void
7100 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7101 			     struct spdk_io_channel *ch, void *_ctx)
7102 {
7103 	struct bdev_iostat_reset_ctx *ctx = _ctx;
7104 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7105 
7106 	spdk_bdev_reset_io_stat(channel->stat, ctx->mode);
7107 
7108 	spdk_bdev_for_each_channel_continue(i, 0);
7109 }
7110 
7111 void
7112 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode,
7113 		       bdev_reset_device_stat_cb cb, void *cb_arg)
7114 {
7115 	struct bdev_iostat_reset_ctx *ctx;
7116 
7117 	assert(bdev != NULL);
7118 	assert(cb != NULL);
7119 
7120 	ctx = calloc(1, sizeof(*ctx));
7121 	if (ctx == NULL) {
7122 		SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
7123 		cb(bdev, cb_arg, -ENOMEM);
7124 		return;
7125 	}
7126 
7127 	ctx->mode = mode;
7128 	ctx->cb = cb;
7129 	ctx->cb_arg = cb_arg;
7130 
7131 	spdk_spin_lock(&bdev->internal.spinlock);
7132 	spdk_bdev_reset_io_stat(bdev->internal.stat, mode);
7133 	spdk_spin_unlock(&bdev->internal.spinlock);
7134 
7135 	spdk_bdev_for_each_channel(bdev,
7136 				   bdev_reset_each_channel_stat,
7137 				   ctx,
7138 				   bdev_reset_device_stat_done);
7139 }
7140 
7141 int
7142 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7143 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
7144 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
7145 {
7146 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7147 	struct spdk_bdev_io *bdev_io;
7148 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7149 
7150 	if (!desc->write) {
7151 		return -EBADF;
7152 	}
7153 
7154 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
7155 		return -ENOTSUP;
7156 	}
7157 
7158 	bdev_io = bdev_channel_get_io(channel);
7159 	if (!bdev_io) {
7160 		return -ENOMEM;
7161 	}
7162 
7163 	bdev_io->internal.ch = channel;
7164 	bdev_io->internal.desc = desc;
7165 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
7166 	bdev_io->u.nvme_passthru.cmd = *cmd;
7167 	bdev_io->u.nvme_passthru.buf = buf;
7168 	bdev_io->u.nvme_passthru.nbytes = nbytes;
7169 	bdev_io->u.nvme_passthru.md_buf = NULL;
7170 	bdev_io->u.nvme_passthru.md_len = 0;
7171 
7172 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7173 
7174 	bdev_io_submit(bdev_io);
7175 	return 0;
7176 }
7177 
7178 int
7179 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7180 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
7181 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
7182 {
7183 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7184 	struct spdk_bdev_io *bdev_io;
7185 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7186 
7187 	if (!desc->write) {
7188 		/*
7189 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7190 		 *  to easily determine if the command is a read or write, but for now just
7191 		 *  do not allow io_passthru with a read-only descriptor.
7192 		 */
7193 		return -EBADF;
7194 	}
7195 
7196 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
7197 		return -ENOTSUP;
7198 	}
7199 
7200 	bdev_io = bdev_channel_get_io(channel);
7201 	if (!bdev_io) {
7202 		return -ENOMEM;
7203 	}
7204 
7205 	bdev_io->internal.ch = channel;
7206 	bdev_io->internal.desc = desc;
7207 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
7208 	bdev_io->u.nvme_passthru.cmd = *cmd;
7209 	bdev_io->u.nvme_passthru.buf = buf;
7210 	bdev_io->u.nvme_passthru.nbytes = nbytes;
7211 	bdev_io->u.nvme_passthru.md_buf = NULL;
7212 	bdev_io->u.nvme_passthru.md_len = 0;
7213 
7214 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7215 
7216 	bdev_io_submit(bdev_io);
7217 	return 0;
7218 }
7219 
7220 int
7221 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7222 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
7223 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
7224 {
7225 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7226 	struct spdk_bdev_io *bdev_io;
7227 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7228 
7229 	if (!desc->write) {
7230 		/*
7231 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7232 		 *  to easily determine if the command is a read or write, but for now just
7233 		 *  do not allow io_passthru with a read-only descriptor.
7234 		 */
7235 		return -EBADF;
7236 	}
7237 
7238 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
7239 		return -ENOTSUP;
7240 	}
7241 
7242 	bdev_io = bdev_channel_get_io(channel);
7243 	if (!bdev_io) {
7244 		return -ENOMEM;
7245 	}
7246 
7247 	bdev_io->internal.ch = channel;
7248 	bdev_io->internal.desc = desc;
7249 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
7250 	bdev_io->u.nvme_passthru.cmd = *cmd;
7251 	bdev_io->u.nvme_passthru.buf = buf;
7252 	bdev_io->u.nvme_passthru.nbytes = nbytes;
7253 	bdev_io->u.nvme_passthru.md_buf = md_buf;
7254 	bdev_io->u.nvme_passthru.md_len = md_len;
7255 
7256 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7257 
7258 	bdev_io_submit(bdev_io);
7259 	return 0;
7260 }
7261 
7262 int
7263 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc,
7264 			       struct spdk_io_channel *ch,
7265 			       const struct spdk_nvme_cmd *cmd,
7266 			       struct iovec *iov, int iovcnt, size_t nbytes,
7267 			       void *md_buf, size_t md_len,
7268 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
7269 {
7270 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7271 	struct spdk_bdev_io *bdev_io;
7272 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7273 
7274 	if (!desc->write) {
7275 		/*
7276 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
7277 		 * to easily determine if the command is a read or write, but for now just
7278 		 * do not allow io_passthru with a read-only descriptor.
7279 		 */
7280 		return -EBADF;
7281 	}
7282 
7283 	if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
7284 		return -ENOTSUP;
7285 	} else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
7286 		return -ENOTSUP;
7287 	}
7288 
7289 	bdev_io = bdev_channel_get_io(channel);
7290 	if (!bdev_io) {
7291 		return -ENOMEM;
7292 	}
7293 
7294 	bdev_io->internal.ch = channel;
7295 	bdev_io->internal.desc = desc;
7296 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD;
7297 	bdev_io->u.nvme_passthru.cmd = *cmd;
7298 	bdev_io->u.nvme_passthru.iovs = iov;
7299 	bdev_io->u.nvme_passthru.iovcnt = iovcnt;
7300 	bdev_io->u.nvme_passthru.nbytes = nbytes;
7301 	bdev_io->u.nvme_passthru.md_buf = md_buf;
7302 	bdev_io->u.nvme_passthru.md_len = md_len;
7303 
7304 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7305 
7306 	bdev_io_submit(bdev_io);
7307 	return 0;
7308 }
7309 
7310 static void bdev_abort_retry(void *ctx);
7311 static void bdev_abort(struct spdk_bdev_io *parent_io);
7312 
7313 static void
7314 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
7315 {
7316 	struct spdk_bdev_channel *channel = bdev_io->internal.ch;
7317 	struct spdk_bdev_io *parent_io = cb_arg;
7318 	struct spdk_bdev_io *bio_to_abort, *tmp_io;
7319 
7320 	bio_to_abort = bdev_io->u.abort.bio_to_abort;
7321 
7322 	spdk_bdev_free_io(bdev_io);
7323 
7324 	if (!success) {
7325 		/* Check if the target I/O completed in the meantime. */
7326 		TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
7327 			if (tmp_io == bio_to_abort) {
7328 				break;
7329 			}
7330 		}
7331 
7332 		/* If the target I/O still exists, set the parent to failed. */
7333 		if (tmp_io != NULL) {
7334 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7335 		}
7336 	}
7337 
7338 	assert(parent_io->internal.f.split);
7339 
7340 	parent_io->internal.split.outstanding--;
7341 	if (parent_io->internal.split.outstanding == 0) {
7342 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7343 			bdev_abort_retry(parent_io);
7344 		} else {
7345 			bdev_io_complete(parent_io);
7346 		}
7347 	}
7348 }
7349 
7350 static int
7351 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
7352 	      struct spdk_bdev_io *bio_to_abort,
7353 	      spdk_bdev_io_completion_cb cb, void *cb_arg)
7354 {
7355 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7356 	struct spdk_bdev_io *bdev_io;
7357 
7358 	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
7359 	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
7360 		/* TODO: Abort reset or abort request. */
7361 		return -ENOTSUP;
7362 	}
7363 
7364 	bdev_io = bdev_channel_get_io(channel);
7365 	if (bdev_io == NULL) {
7366 		return -ENOMEM;
7367 	}
7368 
7369 	bdev_io->internal.ch = channel;
7370 	bdev_io->internal.desc = desc;
7371 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7372 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7373 
7374 	if (bio_to_abort->internal.f.split) {
7375 		assert(bdev_io_should_split(bio_to_abort));
7376 		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
7377 
7378 		/* Parent abort request is not submitted directly, but to manage its
7379 		 * execution add it to the submitted list here.
7380 		 */
7381 		bdev_io->internal.submit_tsc = spdk_get_ticks();
7382 		bdev_ch_add_to_io_submitted(bdev_io);
7383 
7384 		bdev_abort(bdev_io);
7385 
7386 		return 0;
7387 	}
7388 
7389 	bdev_io->u.abort.bio_to_abort = bio_to_abort;
7390 
7391 	/* Submit the abort request to the underlying bdev module. */
7392 	bdev_io_submit(bdev_io);
7393 
7394 	return 0;
7395 }
7396 
7397 static bool
7398 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
7399 {
7400 	struct spdk_bdev_io *iter;
7401 
7402 	TAILQ_FOREACH(iter, tailq, internal.link) {
7403 		if (iter == bdev_io) {
7404 			return true;
7405 		}
7406 	}
7407 
7408 	return false;
7409 }
7410 
7411 static uint32_t
7412 _bdev_abort(struct spdk_bdev_io *parent_io)
7413 {
7414 	struct spdk_bdev_desc *desc = parent_io->internal.desc;
7415 	struct spdk_bdev_channel *channel = parent_io->internal.ch;
7416 	void *bio_cb_arg;
7417 	struct spdk_bdev_io *bio_to_abort;
7418 	uint32_t matched_ios;
7419 	int rc;
7420 
7421 	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
7422 
7423 	/* matched_ios is returned and will be kept by the caller.
7424 	 *
7425 	 * This function will be used for two cases, 1) the same cb_arg is used for
7426 	 * multiple I/Os, 2) a single large I/O is split into smaller ones.
7427 	 * Incrementing split_outstanding directly here may confuse readers especially
7428 	 * for the 1st case.
7429 	 *
7430 	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
7431 	 * works as expected.
7432 	 */
7433 	matched_ios = 0;
7434 	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7435 
7436 	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
7437 		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
7438 			continue;
7439 		}
7440 
7441 		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
7442 			/* Any I/O which was submitted after this abort command should be excluded. */
7443 			continue;
7444 		}
7445 
7446 		/* We can't abort a request that's being pushed/pulled or executed by accel */
7447 		if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
7448 		    bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
7449 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7450 			break;
7451 		}
7452 
7453 		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
7454 		if (rc != 0) {
7455 			if (rc == -ENOMEM) {
7456 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
7457 			} else {
7458 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7459 			}
7460 			break;
7461 		}
7462 		matched_ios++;
7463 	}
7464 
7465 	return matched_ios;
7466 }
7467 
7468 static void
7469 bdev_abort_retry(void *ctx)
7470 {
7471 	struct spdk_bdev_io *parent_io = ctx;
7472 	uint32_t matched_ios;
7473 
7474 	matched_ios = _bdev_abort(parent_io);
7475 
7476 	if (matched_ios == 0) {
7477 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7478 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7479 		} else {
7480 			/* For retry, the case that no target I/O was found is success
7481 			 * because it means target I/Os completed in the meantime.
7482 			 */
7483 			bdev_io_complete(parent_io);
7484 		}
7485 		return;
7486 	}
7487 
7488 	/* Use split_outstanding to manage the progress of aborting I/Os. */
7489 	parent_io->internal.f.split = true;
7490 	parent_io->internal.split.outstanding = matched_ios;
7491 }
7492 
7493 static void
7494 bdev_abort(struct spdk_bdev_io *parent_io)
7495 {
7496 	uint32_t matched_ios;
7497 
7498 	matched_ios = _bdev_abort(parent_io);
7499 
7500 	if (matched_ios == 0) {
7501 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7502 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7503 		} else {
7504 			/* The case the no target I/O was found is failure. */
7505 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7506 			bdev_io_complete(parent_io);
7507 		}
7508 		return;
7509 	}
7510 
7511 	/* Use split_outstanding to manage the progress of aborting I/Os. */
7512 	parent_io->internal.f.split = true;
7513 	parent_io->internal.split.outstanding = matched_ios;
7514 }
7515 
7516 int
7517 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
7518 		void *bio_cb_arg,
7519 		spdk_bdev_io_completion_cb cb, void *cb_arg)
7520 {
7521 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7522 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7523 	struct spdk_bdev_io *bdev_io;
7524 
7525 	if (bio_cb_arg == NULL) {
7526 		return -EINVAL;
7527 	}
7528 
7529 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
7530 		return -ENOTSUP;
7531 	}
7532 
7533 	bdev_io = bdev_channel_get_io(channel);
7534 	if (bdev_io == NULL) {
7535 		return -ENOMEM;
7536 	}
7537 
7538 	bdev_io->internal.ch = channel;
7539 	bdev_io->internal.desc = desc;
7540 	bdev_io->internal.submit_tsc = spdk_get_ticks();
7541 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
7542 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
7543 
7544 	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
7545 
7546 	/* Parent abort request is not submitted directly, but to manage its execution,
7547 	 * add it to the submitted list here.
7548 	 */
7549 	bdev_ch_add_to_io_submitted(bdev_io);
7550 
7551 	bdev_abort(bdev_io);
7552 
7553 	return 0;
7554 }
7555 
7556 int
7557 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
7558 			struct spdk_bdev_io_wait_entry *entry)
7559 {
7560 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
7561 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
7562 
7563 	if (bdev != entry->bdev) {
7564 		SPDK_ERRLOG("bdevs do not match\n");
7565 		return -EINVAL;
7566 	}
7567 
7568 	if (mgmt_ch->per_thread_cache_count > 0) {
7569 		SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
7570 		return -EINVAL;
7571 	}
7572 
7573 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
7574 	return 0;
7575 }
7576 
7577 static inline void
7578 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
7579 {
7580 	enum spdk_bdev_io_status io_status = bdev_io->internal.status;
7581 	struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
7582 	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
7583 	uint32_t blocklen = bdev_io->bdev->blocklen;
7584 
7585 	if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7586 		switch (bdev_io->type) {
7587 		case SPDK_BDEV_IO_TYPE_READ:
7588 			io_stat->bytes_read += num_blocks * blocklen;
7589 			io_stat->num_read_ops++;
7590 			io_stat->read_latency_ticks += tsc_diff;
7591 			if (io_stat->max_read_latency_ticks < tsc_diff) {
7592 				io_stat->max_read_latency_ticks = tsc_diff;
7593 			}
7594 			if (io_stat->min_read_latency_ticks > tsc_diff) {
7595 				io_stat->min_read_latency_ticks = tsc_diff;
7596 			}
7597 			break;
7598 		case SPDK_BDEV_IO_TYPE_WRITE:
7599 			io_stat->bytes_written += num_blocks * blocklen;
7600 			io_stat->num_write_ops++;
7601 			io_stat->write_latency_ticks += tsc_diff;
7602 			if (io_stat->max_write_latency_ticks < tsc_diff) {
7603 				io_stat->max_write_latency_ticks = tsc_diff;
7604 			}
7605 			if (io_stat->min_write_latency_ticks > tsc_diff) {
7606 				io_stat->min_write_latency_ticks = tsc_diff;
7607 			}
7608 			break;
7609 		case SPDK_BDEV_IO_TYPE_UNMAP:
7610 			io_stat->bytes_unmapped += num_blocks * blocklen;
7611 			io_stat->num_unmap_ops++;
7612 			io_stat->unmap_latency_ticks += tsc_diff;
7613 			if (io_stat->max_unmap_latency_ticks < tsc_diff) {
7614 				io_stat->max_unmap_latency_ticks = tsc_diff;
7615 			}
7616 			if (io_stat->min_unmap_latency_ticks > tsc_diff) {
7617 				io_stat->min_unmap_latency_ticks = tsc_diff;
7618 			}
7619 			break;
7620 		case SPDK_BDEV_IO_TYPE_ZCOPY:
7621 			/* Track the data in the start phase only */
7622 			if (bdev_io->u.bdev.zcopy.start) {
7623 				if (bdev_io->u.bdev.zcopy.populate) {
7624 					io_stat->bytes_read += num_blocks * blocklen;
7625 					io_stat->num_read_ops++;
7626 					io_stat->read_latency_ticks += tsc_diff;
7627 					if (io_stat->max_read_latency_ticks < tsc_diff) {
7628 						io_stat->max_read_latency_ticks = tsc_diff;
7629 					}
7630 					if (io_stat->min_read_latency_ticks > tsc_diff) {
7631 						io_stat->min_read_latency_ticks = tsc_diff;
7632 					}
7633 				} else {
7634 					io_stat->bytes_written += num_blocks * blocklen;
7635 					io_stat->num_write_ops++;
7636 					io_stat->write_latency_ticks += tsc_diff;
7637 					if (io_stat->max_write_latency_ticks < tsc_diff) {
7638 						io_stat->max_write_latency_ticks = tsc_diff;
7639 					}
7640 					if (io_stat->min_write_latency_ticks > tsc_diff) {
7641 						io_stat->min_write_latency_ticks = tsc_diff;
7642 					}
7643 				}
7644 			}
7645 			break;
7646 		case SPDK_BDEV_IO_TYPE_COPY:
7647 			io_stat->bytes_copied += num_blocks * blocklen;
7648 			io_stat->num_copy_ops++;
7649 			bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff;
7650 			if (io_stat->max_copy_latency_ticks < tsc_diff) {
7651 				io_stat->max_copy_latency_ticks = tsc_diff;
7652 			}
7653 			if (io_stat->min_copy_latency_ticks > tsc_diff) {
7654 				io_stat->min_copy_latency_ticks = tsc_diff;
7655 			}
7656 			break;
7657 		default:
7658 			break;
7659 		}
7660 	} else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) {
7661 		io_stat = bdev_io->bdev->internal.stat;
7662 		assert(io_stat->io_error != NULL);
7663 
7664 		spdk_spin_lock(&bdev_io->bdev->internal.spinlock);
7665 		io_stat->io_error->error_status[-io_status - 1]++;
7666 		spdk_spin_unlock(&bdev_io->bdev->internal.spinlock);
7667 	}
7668 
7669 #ifdef SPDK_CONFIG_VTUNE
7670 	uint64_t now_tsc = spdk_get_ticks();
7671 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
7672 		uint64_t data[5];
7673 		struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
7674 
7675 		data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
7676 		data[1] = io_stat->bytes_read - prev_stat->bytes_read;
7677 		data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
7678 		data[3] = io_stat->bytes_written - prev_stat->bytes_written;
7679 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
7680 			  bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
7681 
7682 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
7683 				   __itt_metadata_u64, 5, data);
7684 
7685 		memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
7686 		bdev_io->internal.ch->start_tsc = now_tsc;
7687 	}
7688 #endif
7689 }
7690 
7691 static inline void
7692 _bdev_io_complete(void *ctx)
7693 {
7694 	struct spdk_bdev_io *bdev_io = ctx;
7695 
7696 	if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) {
7697 		assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7698 		spdk_accel_sequence_abort(bdev_io->internal.accel_sequence);
7699 	}
7700 
7701 	assert(bdev_io->internal.cb != NULL);
7702 	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
7703 
7704 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
7705 			     bdev_io->internal.caller_ctx);
7706 }
7707 
7708 static inline void
7709 bdev_io_complete(void *ctx)
7710 {
7711 	struct spdk_bdev_io *bdev_io = ctx;
7712 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7713 	uint64_t tsc, tsc_diff;
7714 
7715 	if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) {
7716 		/*
7717 		 * Defer completion to avoid potential infinite recursion if the
7718 		 * user's completion callback issues a new I/O.
7719 		 */
7720 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7721 				     bdev_io_complete, bdev_io);
7722 		return;
7723 	}
7724 
7725 	tsc = spdk_get_ticks();
7726 	tsc_diff = tsc - bdev_io->internal.submit_tsc;
7727 
7728 	bdev_ch_remove_from_io_submitted(bdev_io);
7729 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io,
7730 			      bdev_io->internal.caller_ctx, bdev_ch->queue_depth);
7731 
7732 	if (bdev_ch->histogram) {
7733 		if (bdev_io->bdev->internal.histogram_io_type == 0 ||
7734 		    bdev_io->bdev->internal.histogram_io_type == bdev_io->type) {
7735 			/*
7736 			 * Tally all I/O types if the histogram_io_type is set to 0.
7737 			 */
7738 			spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff);
7739 		}
7740 	}
7741 
7742 	bdev_io_update_io_stat(bdev_io, tsc_diff);
7743 	_bdev_io_complete(bdev_io);
7744 }
7745 
7746 /* The difference between this function and bdev_io_complete() is that this should be called to
7747  * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the
7748  * io_submitted list and don't have submit_tsc updated.
7749  */
7750 static inline void
7751 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)
7752 {
7753 	/* Since the IO hasn't been submitted it's bound to be failed */
7754 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7755 
7756 	/* At this point we don't know if the IO is completed from submission context or not, but,
7757 	 * since this is an error path, we can always do an spdk_thread_send_msg(). */
7758 	spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7759 			     _bdev_io_complete, bdev_io);
7760 }
7761 
7762 static void bdev_destroy_cb(void *io_device);
7763 
7764 static inline void
7765 _bdev_reset_complete(void *ctx)
7766 {
7767 	struct spdk_bdev_io *bdev_io = ctx;
7768 
7769 	/* Put the channel reference we got in submission. */
7770 	assert(bdev_io->u.reset.ch_ref != NULL);
7771 	spdk_put_io_channel(bdev_io->u.reset.ch_ref);
7772 	bdev_io->u.reset.ch_ref = NULL;
7773 
7774 	bdev_io_complete(bdev_io);
7775 }
7776 
7777 static void
7778 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
7779 {
7780 	struct spdk_bdev_io *bdev_io = _ctx;
7781 	bdev_io_tailq_t queued_resets;
7782 	struct spdk_bdev_io *queued_reset;
7783 
7784 	assert(bdev_io == bdev->internal.reset_in_progress);
7785 
7786 	TAILQ_INIT(&queued_resets);
7787 
7788 	spdk_spin_lock(&bdev->internal.spinlock);
7789 	TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets,
7790 		   spdk_bdev_io, internal.link);
7791 	bdev->internal.reset_in_progress = NULL;
7792 	spdk_spin_unlock(&bdev->internal.spinlock);
7793 
7794 	while (!TAILQ_EMPTY(&queued_resets)) {
7795 		queued_reset = TAILQ_FIRST(&queued_resets);
7796 		TAILQ_REMOVE(&queued_resets, queued_reset, internal.link);
7797 		queued_reset->internal.status = bdev_io->internal.status;
7798 		spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset),
7799 				     _bdev_reset_complete, queued_reset);
7800 	}
7801 
7802 	_bdev_reset_complete(bdev_io);
7803 
7804 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
7805 	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
7806 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7807 	}
7808 }
7809 
7810 static void
7811 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7812 		      struct spdk_io_channel *_ch, void *_ctx)
7813 {
7814 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7815 
7816 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
7817 
7818 	spdk_bdev_for_each_channel_continue(i, 0);
7819 }
7820 
7821 static void
7822 bdev_io_complete_sequence_cb(void *ctx, int status)
7823 {
7824 	struct spdk_bdev_io *bdev_io = ctx;
7825 
7826 	/* u.bdev.accel_sequence should have already been cleared at this point */
7827 	assert(bdev_io->u.bdev.accel_sequence == NULL);
7828 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
7829 	bdev_io->internal.f.has_accel_sequence = false;
7830 
7831 	if (spdk_unlikely(status != 0)) {
7832 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
7833 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7834 	}
7835 
7836 	bdev_io_complete(bdev_io);
7837 }
7838 
7839 void
7840 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
7841 {
7842 	struct spdk_bdev *bdev = bdev_io->bdev;
7843 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7844 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
7845 
7846 	if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) {
7847 		SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n",
7848 			    spdk_bdev_get_module_name(bdev),
7849 			    bdev_io_status_get_string(bdev_io->internal.status));
7850 		assert(false);
7851 	}
7852 	bdev_io->internal.status = status;
7853 
7854 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
7855 		assert(bdev_io == bdev->internal.reset_in_progress);
7856 		spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
7857 					   bdev_reset_complete);
7858 		return;
7859 	} else {
7860 		bdev_io_decrement_outstanding(bdev_ch, shared_resource);
7861 		if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7862 			if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
7863 				bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
7864 				return;
7865 			} else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf &&
7866 						 !bdev_io_use_accel_sequence(bdev_io))) {
7867 				_bdev_io_push_bounce_data_buffer(bdev_io,
7868 								 _bdev_io_complete_push_bounce_done);
7869 				/* bdev IO will be completed in the callback */
7870 				return;
7871 			}
7872 		}
7873 
7874 		if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) {
7875 			return;
7876 		}
7877 	}
7878 
7879 	bdev_io_complete(bdev_io);
7880 }
7881 
7882 void
7883 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
7884 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
7885 {
7886 	enum spdk_bdev_io_status status;
7887 
7888 	if (sc == SPDK_SCSI_STATUS_GOOD) {
7889 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7890 	} else {
7891 		status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
7892 		bdev_io->internal.error.scsi.sc = sc;
7893 		bdev_io->internal.error.scsi.sk = sk;
7894 		bdev_io->internal.error.scsi.asc = asc;
7895 		bdev_io->internal.error.scsi.ascq = ascq;
7896 	}
7897 
7898 	spdk_bdev_io_complete(bdev_io, status);
7899 }
7900 
7901 void
7902 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
7903 			     int *sc, int *sk, int *asc, int *ascq)
7904 {
7905 	assert(sc != NULL);
7906 	assert(sk != NULL);
7907 	assert(asc != NULL);
7908 	assert(ascq != NULL);
7909 
7910 	switch (bdev_io->internal.status) {
7911 	case SPDK_BDEV_IO_STATUS_SUCCESS:
7912 		*sc = SPDK_SCSI_STATUS_GOOD;
7913 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
7914 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7915 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7916 		break;
7917 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7918 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
7919 		break;
7920 	case SPDK_BDEV_IO_STATUS_MISCOMPARE:
7921 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7922 		*sk = SPDK_SCSI_SENSE_MISCOMPARE;
7923 		*asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
7924 		*ascq = bdev_io->internal.error.scsi.ascq;
7925 		break;
7926 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7927 		*sc = bdev_io->internal.error.scsi.sc;
7928 		*sk = bdev_io->internal.error.scsi.sk;
7929 		*asc = bdev_io->internal.error.scsi.asc;
7930 		*ascq = bdev_io->internal.error.scsi.ascq;
7931 		break;
7932 	default:
7933 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7934 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
7935 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7936 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7937 		break;
7938 	}
7939 }
7940 
7941 void
7942 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
7943 {
7944 	enum spdk_bdev_io_status status;
7945 
7946 	if (aio_result == 0) {
7947 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7948 	} else {
7949 		status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
7950 	}
7951 
7952 	bdev_io->internal.error.aio_result = aio_result;
7953 
7954 	spdk_bdev_io_complete(bdev_io, status);
7955 }
7956 
7957 void
7958 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
7959 {
7960 	assert(aio_result != NULL);
7961 
7962 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
7963 		*aio_result = bdev_io->internal.error.aio_result;
7964 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7965 		*aio_result = 0;
7966 	} else {
7967 		*aio_result = -EIO;
7968 	}
7969 }
7970 
7971 void
7972 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
7973 {
7974 	enum spdk_bdev_io_status status;
7975 
7976 	if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) {
7977 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7978 	} else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
7979 		status = SPDK_BDEV_IO_STATUS_ABORTED;
7980 	} else {
7981 		status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
7982 	}
7983 
7984 	bdev_io->internal.error.nvme.cdw0 = cdw0;
7985 	bdev_io->internal.error.nvme.sct = sct;
7986 	bdev_io->internal.error.nvme.sc = sc;
7987 
7988 	spdk_bdev_io_complete(bdev_io, status);
7989 }
7990 
7991 void
7992 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
7993 {
7994 	assert(sct != NULL);
7995 	assert(sc != NULL);
7996 	assert(cdw0 != NULL);
7997 
7998 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
7999 		*sct = SPDK_NVME_SCT_GENERIC;
8000 		*sc = SPDK_NVME_SC_SUCCESS;
8001 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
8002 			*cdw0 = 0;
8003 		} else {
8004 			*cdw0 = 1U;
8005 		}
8006 		return;
8007 	}
8008 
8009 	if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
8010 		*sct = SPDK_NVME_SCT_GENERIC;
8011 		*sc = SPDK_NVME_SC_SUCCESS;
8012 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
8013 		*sct = bdev_io->internal.error.nvme.sct;
8014 		*sc = bdev_io->internal.error.nvme.sc;
8015 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
8016 		*sct = SPDK_NVME_SCT_GENERIC;
8017 		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8018 	} else {
8019 		*sct = SPDK_NVME_SCT_GENERIC;
8020 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8021 	}
8022 
8023 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
8024 }
8025 
8026 void
8027 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
8028 				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
8029 {
8030 	assert(first_sct != NULL);
8031 	assert(first_sc != NULL);
8032 	assert(second_sct != NULL);
8033 	assert(second_sc != NULL);
8034 	assert(cdw0 != NULL);
8035 
8036 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
8037 		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
8038 		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
8039 			*first_sct = bdev_io->internal.error.nvme.sct;
8040 			*first_sc = bdev_io->internal.error.nvme.sc;
8041 			*second_sct = SPDK_NVME_SCT_GENERIC;
8042 			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8043 		} else {
8044 			*first_sct = SPDK_NVME_SCT_GENERIC;
8045 			*first_sc = SPDK_NVME_SC_SUCCESS;
8046 			*second_sct = bdev_io->internal.error.nvme.sct;
8047 			*second_sc = bdev_io->internal.error.nvme.sc;
8048 		}
8049 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
8050 		*first_sct = SPDK_NVME_SCT_GENERIC;
8051 		*first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8052 		*second_sct = SPDK_NVME_SCT_GENERIC;
8053 		*second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
8054 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
8055 		*first_sct = SPDK_NVME_SCT_GENERIC;
8056 		*first_sc = SPDK_NVME_SC_SUCCESS;
8057 		*second_sct = SPDK_NVME_SCT_GENERIC;
8058 		*second_sc = SPDK_NVME_SC_SUCCESS;
8059 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
8060 		*first_sct = SPDK_NVME_SCT_GENERIC;
8061 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8062 		*second_sct = SPDK_NVME_SCT_GENERIC;
8063 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8064 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
8065 		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
8066 		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
8067 		*second_sct = SPDK_NVME_SCT_GENERIC;
8068 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
8069 	} else {
8070 		*first_sct = SPDK_NVME_SCT_GENERIC;
8071 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8072 		*second_sct = SPDK_NVME_SCT_GENERIC;
8073 		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
8074 	}
8075 
8076 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
8077 }
8078 
8079 void
8080 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io,
8081 				     const struct spdk_bdev_io *base_io)
8082 {
8083 	switch (base_io->internal.status) {
8084 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
8085 		spdk_bdev_io_complete_nvme_status(bdev_io,
8086 						  base_io->internal.error.nvme.cdw0,
8087 						  base_io->internal.error.nvme.sct,
8088 						  base_io->internal.error.nvme.sc);
8089 		break;
8090 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
8091 		spdk_bdev_io_complete_scsi_status(bdev_io,
8092 						  base_io->internal.error.scsi.sc,
8093 						  base_io->internal.error.scsi.sk,
8094 						  base_io->internal.error.scsi.asc,
8095 						  base_io->internal.error.scsi.ascq);
8096 		break;
8097 	case SPDK_BDEV_IO_STATUS_AIO_ERROR:
8098 		spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result);
8099 		break;
8100 	default:
8101 		spdk_bdev_io_complete(bdev_io, base_io->internal.status);
8102 		break;
8103 	}
8104 }
8105 
8106 struct spdk_thread *
8107 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
8108 {
8109 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
8110 }
8111 
8112 struct spdk_io_channel *
8113 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
8114 {
8115 	return bdev_io->internal.ch->channel;
8116 }
8117 
8118 static int
8119 bdev_register(struct spdk_bdev *bdev)
8120 {
8121 	char *bdev_name;
8122 	char uuid[SPDK_UUID_STRING_LEN];
8123 	struct spdk_iobuf_opts iobuf_opts;
8124 	int ret;
8125 
8126 	assert(bdev->module != NULL);
8127 
8128 	if (!bdev->name) {
8129 		SPDK_ERRLOG("Bdev name is NULL\n");
8130 		return -EINVAL;
8131 	}
8132 
8133 	if (!strlen(bdev->name)) {
8134 		SPDK_ERRLOG("Bdev name must not be an empty string\n");
8135 		return -EINVAL;
8136 	}
8137 
8138 	/* Users often register their own I/O devices using the bdev name. In
8139 	 * order to avoid conflicts, prepend bdev_. */
8140 	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
8141 	if (!bdev_name) {
8142 		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
8143 		return -ENOMEM;
8144 	}
8145 
8146 	bdev->internal.stat = bdev_alloc_io_stat(true);
8147 	if (!bdev->internal.stat) {
8148 		SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
8149 		free(bdev_name);
8150 		return -ENOMEM;
8151 	}
8152 
8153 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
8154 	bdev->internal.measured_queue_depth = UINT64_MAX;
8155 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8156 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
8157 	bdev->internal.qd_poller = NULL;
8158 	bdev->internal.qos = NULL;
8159 
8160 	TAILQ_INIT(&bdev->internal.open_descs);
8161 	TAILQ_INIT(&bdev->internal.locked_ranges);
8162 	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
8163 	TAILQ_INIT(&bdev->internal.queued_resets);
8164 	TAILQ_INIT(&bdev->aliases);
8165 
8166 	/* UUID may be specified by the user or defined by bdev itself.
8167 	 * Otherwise it will be generated here, so this field will never be empty. */
8168 	if (spdk_uuid_is_null(&bdev->uuid)) {
8169 		spdk_uuid_generate(&bdev->uuid);
8170 	}
8171 
8172 	/* Add the UUID alias only if it's different than the name */
8173 	spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
8174 	if (strcmp(bdev->name, uuid) != 0) {
8175 		ret = spdk_bdev_alias_add(bdev, uuid);
8176 		if (ret != 0) {
8177 			SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
8178 			bdev_free_io_stat(bdev->internal.stat);
8179 			free(bdev_name);
8180 			return ret;
8181 		}
8182 	}
8183 
8184 	spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts));
8185 	if (spdk_bdev_get_buf_align(bdev) > 1) {
8186 		bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX,
8187 					     iobuf_opts.large_bufsize / bdev->blocklen);
8188 	}
8189 
8190 	/* If the user didn't specify a write unit size, set it to one. */
8191 	if (bdev->write_unit_size == 0) {
8192 		bdev->write_unit_size = 1;
8193 	}
8194 
8195 	/* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */
8196 	if (bdev->acwu == 0) {
8197 		bdev->acwu = bdev->write_unit_size;
8198 	}
8199 
8200 	if (bdev->phys_blocklen == 0) {
8201 		bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
8202 	}
8203 
8204 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
8205 		bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
8206 	}
8207 
8208 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
8209 		bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE);
8210 	}
8211 
8212 	bdev->internal.reset_in_progress = NULL;
8213 	bdev->internal.qd_poll_in_progress = false;
8214 	bdev->internal.period = 0;
8215 	bdev->internal.new_period = 0;
8216 	bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name);
8217 
8218 	/*
8219 	 * Initialize spinlock before registering IO device because spinlock is used in
8220 	 * bdev_channel_create
8221 	 */
8222 	spdk_spin_init(&bdev->internal.spinlock);
8223 
8224 	spdk_io_device_register(__bdev_to_io_dev(bdev),
8225 				bdev_channel_create, bdev_channel_destroy,
8226 				sizeof(struct spdk_bdev_channel),
8227 				bdev_name);
8228 
8229 	/*
8230 	 * Register bdev name only after the bdev object is ready.
8231 	 * After bdev_name_add returns, it is possible for other threads to start using the bdev,
8232 	 * create IO channels...
8233 	 */
8234 	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
8235 	if (ret != 0) {
8236 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL);
8237 		bdev_free_io_stat(bdev->internal.stat);
8238 		spdk_spin_destroy(&bdev->internal.spinlock);
8239 		free(bdev_name);
8240 		return ret;
8241 	}
8242 
8243 	free(bdev_name);
8244 
8245 	SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
8246 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
8247 
8248 	return 0;
8249 }
8250 
8251 static void
8252 bdev_destroy_cb(void *io_device)
8253 {
8254 	int			rc;
8255 	struct spdk_bdev	*bdev;
8256 	spdk_bdev_unregister_cb	cb_fn;
8257 	void			*cb_arg;
8258 
8259 	bdev = __bdev_from_io_dev(io_device);
8260 
8261 	if (bdev->internal.unregister_td != spdk_get_thread()) {
8262 		spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device);
8263 		return;
8264 	}
8265 
8266 	cb_fn = bdev->internal.unregister_cb;
8267 	cb_arg = bdev->internal.unregister_ctx;
8268 
8269 	spdk_spin_destroy(&bdev->internal.spinlock);
8270 	free(bdev->internal.qos);
8271 	bdev_free_io_stat(bdev->internal.stat);
8272 	spdk_trace_unregister_owner(bdev->internal.trace_id);
8273 
8274 	rc = bdev->fn_table->destruct(bdev->ctxt);
8275 	if (rc < 0) {
8276 		SPDK_ERRLOG("destruct failed\n");
8277 	}
8278 	if (rc <= 0 && cb_fn != NULL) {
8279 		cb_fn(cb_arg, rc);
8280 	}
8281 }
8282 
8283 void
8284 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
8285 {
8286 	if (bdev->internal.unregister_cb != NULL) {
8287 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
8288 	}
8289 }
8290 
8291 static void
8292 _remove_notify(void *arg)
8293 {
8294 	struct spdk_bdev_desc *desc = arg;
8295 
8296 	_event_notify(desc, SPDK_BDEV_EVENT_REMOVE);
8297 }
8298 
8299 /* returns: 0 - bdev removed and ready to be destructed.
8300  *          -EBUSY - bdev can't be destructed yet.  */
8301 static int
8302 bdev_unregister_unsafe(struct spdk_bdev *bdev)
8303 {
8304 	struct spdk_bdev_desc	*desc, *tmp;
8305 	struct spdk_bdev_alias	*alias;
8306 	int			rc = 0;
8307 	char			uuid[SPDK_UUID_STRING_LEN];
8308 
8309 	assert(spdk_spin_held(&g_bdev_mgr.spinlock));
8310 	assert(spdk_spin_held(&bdev->internal.spinlock));
8311 
8312 	/* Notify each descriptor about hotremoval */
8313 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
8314 		rc = -EBUSY;
8315 		/*
8316 		 * Defer invocation of the event_cb to a separate message that will
8317 		 *  run later on its thread.  This ensures this context unwinds and
8318 		 *  we don't recursively unregister this bdev again if the event_cb
8319 		 *  immediately closes its descriptor.
8320 		 */
8321 		event_notify(desc, _remove_notify);
8322 	}
8323 
8324 	/* If there are no descriptors, proceed removing the bdev */
8325 	if (rc == 0) {
8326 		bdev_examine_allowlist_remove(bdev->name);
8327 		TAILQ_FOREACH(alias, &bdev->aliases, tailq) {
8328 			bdev_examine_allowlist_remove(alias->alias.name);
8329 		}
8330 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
8331 		SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
8332 
8333 		/* Delete the name and the UUID alias */
8334 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
8335 		bdev_name_del_unsafe(&bdev->internal.bdev_name);
8336 		bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
8337 
8338 		spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
8339 
8340 		if (bdev->internal.reset_in_progress != NULL) {
8341 			/* If reset is in progress, let the completion callback for reset
8342 			 * unregister the bdev.
8343 			 */
8344 			rc = -EBUSY;
8345 		}
8346 	}
8347 
8348 	return rc;
8349 }
8350 
8351 static void
8352 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8353 			      struct spdk_io_channel *io_ch, void *_ctx)
8354 {
8355 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
8356 
8357 	bdev_channel_abort_queued_ios(bdev_ch);
8358 	spdk_bdev_for_each_channel_continue(i, 0);
8359 }
8360 
8361 static void
8362 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
8363 {
8364 	int rc;
8365 
8366 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8367 	spdk_spin_lock(&bdev->internal.spinlock);
8368 	/*
8369 	 * Set the status to REMOVING after completing to abort channels. Otherwise,
8370 	 * the last spdk_bdev_close() may call spdk_io_device_unregister() while
8371 	 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister()
8372 	 * may fail.
8373 	 */
8374 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
8375 	rc = bdev_unregister_unsafe(bdev);
8376 	spdk_spin_unlock(&bdev->internal.spinlock);
8377 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8378 
8379 	if (rc == 0) {
8380 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
8381 	}
8382 }
8383 
8384 void
8385 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8386 {
8387 	struct spdk_thread	*thread;
8388 
8389 	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
8390 
8391 	thread = spdk_get_thread();
8392 	if (!thread) {
8393 		/* The user called this from a non-SPDK thread. */
8394 		if (cb_fn != NULL) {
8395 			cb_fn(cb_arg, -ENOTSUP);
8396 		}
8397 		return;
8398 	}
8399 
8400 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8401 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8402 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8403 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8404 		if (cb_fn) {
8405 			cb_fn(cb_arg, -EBUSY);
8406 		}
8407 		return;
8408 	}
8409 
8410 	spdk_spin_lock(&bdev->internal.spinlock);
8411 	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
8412 	bdev->internal.unregister_cb = cb_fn;
8413 	bdev->internal.unregister_ctx = cb_arg;
8414 	bdev->internal.unregister_td = thread;
8415 	spdk_spin_unlock(&bdev->internal.spinlock);
8416 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8417 
8418 	spdk_bdev_set_qd_sampling_period(bdev, 0);
8419 
8420 	spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
8421 				   bdev_unregister);
8422 }
8423 
8424 int
8425 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
8426 			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
8427 {
8428 	struct spdk_bdev_desc *desc;
8429 	struct spdk_bdev *bdev;
8430 	int rc;
8431 
8432 	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
8433 	if (rc != 0) {
8434 		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
8435 		return rc;
8436 	}
8437 
8438 	bdev = spdk_bdev_desc_get_bdev(desc);
8439 
8440 	if (bdev->module != module) {
8441 		spdk_bdev_close(desc);
8442 		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
8443 			    bdev_name);
8444 		return -ENODEV;
8445 	}
8446 
8447 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
8448 
8449 	spdk_bdev_close(desc);
8450 
8451 	return 0;
8452 }
8453 
8454 static int
8455 bdev_start_qos(struct spdk_bdev *bdev)
8456 {
8457 	struct set_qos_limit_ctx *ctx;
8458 
8459 	/* Enable QoS */
8460 	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
8461 		ctx = calloc(1, sizeof(*ctx));
8462 		if (ctx == NULL) {
8463 			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
8464 			return -ENOMEM;
8465 		}
8466 		ctx->bdev = bdev;
8467 		spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
8468 	}
8469 
8470 	return 0;
8471 }
8472 
8473 static void
8474 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail,
8475 		    struct spdk_bdev *bdev)
8476 {
8477 	enum spdk_bdev_claim_type type;
8478 	const char *typename, *modname;
8479 	extern struct spdk_log_flag SPDK_LOG_bdev;
8480 
8481 	assert(spdk_spin_held(&bdev->internal.spinlock));
8482 
8483 	if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) {
8484 		return;
8485 	}
8486 
8487 	type = bdev->internal.claim_type;
8488 	typename = spdk_bdev_claim_get_name(type);
8489 
8490 	if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) {
8491 		modname = bdev->internal.claim.v1.module->name;
8492 		spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8493 			 bdev->name, detail, typename, modname);
8494 		return;
8495 	}
8496 
8497 	if (claim_type_is_v2(type)) {
8498 		struct spdk_bdev_module_claim *claim;
8499 
8500 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
8501 			modname = claim->module->name;
8502 			spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
8503 				 bdev->name, detail, typename, modname);
8504 		}
8505 		return;
8506 	}
8507 
8508 	assert(false);
8509 }
8510 
8511 static int
8512 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
8513 {
8514 	struct spdk_thread *thread;
8515 	int rc = 0;
8516 
8517 	thread = spdk_get_thread();
8518 	if (!thread) {
8519 		SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
8520 		return -ENOTSUP;
8521 	}
8522 
8523 	SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
8524 		      spdk_get_thread());
8525 
8526 	desc->bdev = bdev;
8527 	desc->thread = thread;
8528 	desc->write = write;
8529 
8530 	spdk_spin_lock(&bdev->internal.spinlock);
8531 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
8532 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
8533 		spdk_spin_unlock(&bdev->internal.spinlock);
8534 		return -ENODEV;
8535 	}
8536 
8537 	if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8538 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8539 		spdk_spin_unlock(&bdev->internal.spinlock);
8540 		return -EPERM;
8541 	}
8542 
8543 	rc = bdev_start_qos(bdev);
8544 	if (rc != 0) {
8545 		SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
8546 		spdk_spin_unlock(&bdev->internal.spinlock);
8547 		return rc;
8548 	}
8549 
8550 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
8551 
8552 	spdk_spin_unlock(&bdev->internal.spinlock);
8553 
8554 	return 0;
8555 }
8556 
8557 static void
8558 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size)
8559 {
8560 	if (!opts) {
8561 		SPDK_ERRLOG("opts should not be NULL.\n");
8562 		return;
8563 	}
8564 
8565 	if (!opts_size) {
8566 		SPDK_ERRLOG("opts_size should not be zero.\n");
8567 		return;
8568 	}
8569 
8570 	memset(opts, 0, opts_size);
8571 	opts->size = opts_size;
8572 
8573 #define FIELD_OK(field) \
8574 	offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size
8575 
8576 #define SET_FIELD(field, value) \
8577 	if (FIELD_OK(field)) { \
8578 		opts->field = value; \
8579 	} \
8580 
8581 	SET_FIELD(hide_metadata, false);
8582 
8583 #undef FIELD_OK
8584 #undef SET_FIELD
8585 }
8586 
8587 static void
8588 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts,
8589 		    const struct spdk_bdev_open_opts *opts_src, size_t opts_size)
8590 {
8591 	assert(opts);
8592 	assert(opts_src);
8593 
8594 #define SET_FIELD(field) \
8595 	if (offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \
8596 		opts->field = opts_src->field; \
8597 	} \
8598 
8599 	SET_FIELD(hide_metadata);
8600 
8601 	opts->size = opts_src->size;
8602 
8603 	/* We should not remove this statement, but need to update the assert statement
8604 	 * if we add a new field, and also add a corresponding SET_FIELD statement.
8605 	 */
8606 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size");
8607 
8608 #undef SET_FIELD
8609 }
8610 
8611 void
8612 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size)
8613 {
8614 	struct spdk_bdev_open_opts opts_local;
8615 
8616 	bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local));
8617 	bdev_open_opts_copy(opts, &opts_local, opts_size);
8618 }
8619 
8620 static int
8621 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
8622 		struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc)
8623 {
8624 	struct spdk_bdev_desc *desc;
8625 	struct spdk_bdev_open_opts opts;
8626 	unsigned int i;
8627 
8628 	bdev_open_opts_get_defaults(&opts, sizeof(opts));
8629 	if (user_opts != NULL) {
8630 		bdev_open_opts_copy(&opts, user_opts, user_opts->size);
8631 	}
8632 
8633 	desc = calloc(1, sizeof(*desc));
8634 	if (desc == NULL) {
8635 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
8636 		return -ENOMEM;
8637 	}
8638 
8639 	desc->opts = opts;
8640 
8641 	TAILQ_INIT(&desc->pending_media_events);
8642 	TAILQ_INIT(&desc->free_media_events);
8643 
8644 	desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
8645 	desc->callback.event_fn = event_cb;
8646 	desc->callback.ctx = event_ctx;
8647 	spdk_spin_init(&desc->spinlock);
8648 
8649 	if (desc->opts.hide_metadata) {
8650 		if (spdk_bdev_is_md_separate(bdev)) {
8651 			SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n");
8652 			bdev_desc_free(desc);
8653 			return -EINVAL;
8654 		}
8655 	}
8656 
8657 	if (bdev->media_events) {
8658 		desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
8659 						   sizeof(*desc->media_events_buffer));
8660 		if (desc->media_events_buffer == NULL) {
8661 			SPDK_ERRLOG("Failed to initialize media event pool\n");
8662 			bdev_desc_free(desc);
8663 			return -ENOMEM;
8664 		}
8665 
8666 		for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) {
8667 			TAILQ_INSERT_TAIL(&desc->free_media_events,
8668 					  &desc->media_events_buffer[i], tailq);
8669 		}
8670 	}
8671 
8672 	if (bdev->fn_table->accel_sequence_supported != NULL) {
8673 		for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
8674 			desc->accel_sequence_supported[i] =
8675 				bdev->fn_table->accel_sequence_supported(bdev->ctxt,
8676 						(enum spdk_bdev_io_type)i);
8677 		}
8678 	}
8679 
8680 	*_desc = desc;
8681 
8682 	return 0;
8683 }
8684 
8685 static int
8686 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8687 	      void *event_ctx, struct spdk_bdev_open_opts *opts,
8688 	      struct spdk_bdev_desc **_desc)
8689 {
8690 	struct spdk_bdev_desc *desc;
8691 	struct spdk_bdev *bdev;
8692 	int rc;
8693 
8694 	bdev = bdev_get_by_name(bdev_name);
8695 
8696 	if (bdev == NULL) {
8697 		SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
8698 		return -ENODEV;
8699 	}
8700 
8701 	rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc);
8702 	if (rc != 0) {
8703 		return rc;
8704 	}
8705 
8706 	rc = bdev_open(bdev, write, desc);
8707 	if (rc != 0) {
8708 		bdev_desc_free(desc);
8709 		desc = NULL;
8710 	}
8711 
8712 	*_desc = desc;
8713 
8714 	return rc;
8715 }
8716 
8717 int
8718 spdk_bdev_open_ext_v2(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8719 		      void *event_ctx, struct spdk_bdev_open_opts *opts,
8720 		      struct spdk_bdev_desc **_desc)
8721 {
8722 	int rc;
8723 
8724 	if (event_cb == NULL) {
8725 		SPDK_ERRLOG("Missing event callback function\n");
8726 		return -EINVAL;
8727 	}
8728 
8729 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8730 	rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc);
8731 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8732 
8733 	return rc;
8734 }
8735 
8736 int
8737 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8738 		   void *event_ctx, struct spdk_bdev_desc **_desc)
8739 {
8740 	return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc);
8741 }
8742 
8743 struct spdk_bdev_open_async_ctx {
8744 	char					*bdev_name;
8745 	spdk_bdev_event_cb_t			event_cb;
8746 	void					*event_ctx;
8747 	bool					write;
8748 	int					rc;
8749 	spdk_bdev_open_async_cb_t		cb_fn;
8750 	void					*cb_arg;
8751 	struct spdk_bdev_desc			*desc;
8752 	struct spdk_bdev_open_async_opts	opts;
8753 	uint64_t				start_ticks;
8754 	struct spdk_thread			*orig_thread;
8755 	struct spdk_poller			*poller;
8756 	TAILQ_ENTRY(spdk_bdev_open_async_ctx)	tailq;
8757 };
8758 
8759 static void
8760 bdev_open_async_done(void *arg)
8761 {
8762 	struct spdk_bdev_open_async_ctx *ctx = arg;
8763 
8764 	ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg);
8765 
8766 	free(ctx->bdev_name);
8767 	free(ctx);
8768 }
8769 
8770 static void
8771 bdev_open_async_cancel(void *arg)
8772 {
8773 	struct spdk_bdev_open_async_ctx *ctx = arg;
8774 
8775 	assert(ctx->rc == -ESHUTDOWN);
8776 
8777 	spdk_poller_unregister(&ctx->poller);
8778 
8779 	bdev_open_async_done(ctx);
8780 }
8781 
8782 /* This is called when the bdev library finishes at shutdown. */
8783 static void
8784 bdev_open_async_fini(void)
8785 {
8786 	struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx;
8787 
8788 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8789 	TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) {
8790 		TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8791 		/*
8792 		 * We have to move to ctx->orig_thread to unregister ctx->poller.
8793 		 * However, there is a chance that ctx->poller is executed before
8794 		 * message is executed, which could result in bdev_open_async_done()
8795 		 * being called twice. To avoid such race condition, set ctx->rc to
8796 		 * -ESHUTDOWN.
8797 		 */
8798 		ctx->rc = -ESHUTDOWN;
8799 		spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx);
8800 	}
8801 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8802 }
8803 
8804 static int bdev_open_async(void *arg);
8805 
8806 static void
8807 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx)
8808 {
8809 	uint64_t timeout_ticks;
8810 
8811 	if (ctx->rc == -ESHUTDOWN) {
8812 		/* This context is being canceled. Do nothing. */
8813 		return;
8814 	}
8815 
8816 	ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx,
8817 				NULL, &ctx->desc);
8818 	if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) {
8819 		goto exit;
8820 	}
8821 
8822 	timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull;
8823 	if (spdk_get_ticks() >= timeout_ticks) {
8824 		SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name);
8825 		ctx->rc = -ETIMEDOUT;
8826 		goto exit;
8827 	}
8828 
8829 	return;
8830 
8831 exit:
8832 	spdk_poller_unregister(&ctx->poller);
8833 	TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8834 
8835 	/* Completion callback is processed after stack unwinding. */
8836 	spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
8837 }
8838 
8839 static int
8840 bdev_open_async(void *arg)
8841 {
8842 	struct spdk_bdev_open_async_ctx *ctx = arg;
8843 
8844 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8845 
8846 	_bdev_open_async(ctx);
8847 
8848 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8849 
8850 	return SPDK_POLLER_BUSY;
8851 }
8852 
8853 static void
8854 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
8855 			  struct spdk_bdev_open_async_opts *opts_src,
8856 			  size_t size)
8857 {
8858 	assert(opts);
8859 	assert(opts_src);
8860 
8861 	opts->size = size;
8862 
8863 #define SET_FIELD(field) \
8864 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8865 		opts->field = opts_src->field; \
8866 	} \
8867 
8868 	SET_FIELD(timeout_ms);
8869 
8870 	/* Do not remove this statement, you should always update this statement when you adding a new field,
8871 	 * and do not forget to add the SET_FIELD statement for your added field. */
8872 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size");
8873 
8874 #undef SET_FIELD
8875 }
8876 
8877 static void
8878 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size)
8879 {
8880 	assert(opts);
8881 
8882 	opts->size = size;
8883 
8884 #define SET_FIELD(field, value) \
8885 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8886 		opts->field = value; \
8887 	} \
8888 
8889 	SET_FIELD(timeout_ms, 0);
8890 
8891 #undef SET_FIELD
8892 }
8893 
8894 int
8895 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8896 		     void *event_ctx, struct spdk_bdev_open_async_opts *opts,
8897 		     spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg)
8898 {
8899 	struct spdk_bdev_open_async_ctx *ctx;
8900 
8901 	if (event_cb == NULL) {
8902 		SPDK_ERRLOG("Missing event callback function\n");
8903 		return -EINVAL;
8904 	}
8905 
8906 	if (open_cb == NULL) {
8907 		SPDK_ERRLOG("Missing open callback function\n");
8908 		return -EINVAL;
8909 	}
8910 
8911 	if (opts != NULL && opts->size == 0) {
8912 		SPDK_ERRLOG("size in the options structure should not be zero\n");
8913 		return -EINVAL;
8914 	}
8915 
8916 	ctx = calloc(1, sizeof(*ctx));
8917 	if (ctx == NULL) {
8918 		SPDK_ERRLOG("Failed to allocate open context\n");
8919 		return -ENOMEM;
8920 	}
8921 
8922 	ctx->bdev_name = strdup(bdev_name);
8923 	if (ctx->bdev_name == NULL) {
8924 		SPDK_ERRLOG("Failed to duplicate bdev_name\n");
8925 		free(ctx);
8926 		return -ENOMEM;
8927 	}
8928 
8929 	ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000);
8930 	if (ctx->poller == NULL) {
8931 		SPDK_ERRLOG("Failed to register bdev_open_async poller\n");
8932 		free(ctx->bdev_name);
8933 		free(ctx);
8934 		return -ENOMEM;
8935 	}
8936 
8937 	ctx->cb_fn = open_cb;
8938 	ctx->cb_arg = open_cb_arg;
8939 	ctx->write = write;
8940 	ctx->event_cb = event_cb;
8941 	ctx->event_ctx = event_ctx;
8942 	ctx->orig_thread = spdk_get_thread();
8943 	ctx->start_ticks = spdk_get_ticks();
8944 
8945 	bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts));
8946 	if (opts != NULL) {
8947 		bdev_open_async_opts_copy(&ctx->opts, opts, opts->size);
8948 	}
8949 
8950 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8951 
8952 	TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8953 	_bdev_open_async(ctx);
8954 
8955 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8956 
8957 	return 0;
8958 }
8959 
8960 static void
8961 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
8962 {
8963 	int rc;
8964 
8965 	spdk_spin_lock(&bdev->internal.spinlock);
8966 	spdk_spin_lock(&desc->spinlock);
8967 
8968 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
8969 
8970 	desc->closed = true;
8971 
8972 	if (desc->claim != NULL) {
8973 		bdev_desc_release_claims(desc);
8974 	}
8975 
8976 	if (0 == desc->refs) {
8977 		spdk_spin_unlock(&desc->spinlock);
8978 		bdev_desc_free(desc);
8979 	} else {
8980 		spdk_spin_unlock(&desc->spinlock);
8981 	}
8982 
8983 	/* If no more descriptors, kill QoS channel */
8984 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8985 		SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
8986 			      bdev->name, spdk_get_thread());
8987 
8988 		if (bdev_qos_destroy(bdev)) {
8989 			/* There isn't anything we can do to recover here. Just let the
8990 			 * old QoS poller keep running. The QoS handling won't change
8991 			 * cores when the user allocates a new channel, but it won't break. */
8992 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
8993 		}
8994 	}
8995 
8996 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8997 		rc = bdev_unregister_unsafe(bdev);
8998 		spdk_spin_unlock(&bdev->internal.spinlock);
8999 
9000 		if (rc == 0) {
9001 			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
9002 		}
9003 	} else {
9004 		spdk_spin_unlock(&bdev->internal.spinlock);
9005 	}
9006 }
9007 
9008 void
9009 spdk_bdev_close(struct spdk_bdev_desc *desc)
9010 {
9011 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9012 
9013 	SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
9014 		      spdk_get_thread());
9015 
9016 	assert(desc->thread == spdk_get_thread());
9017 
9018 	spdk_poller_unregister(&desc->io_timeout_poller);
9019 
9020 	spdk_spin_lock(&g_bdev_mgr.spinlock);
9021 
9022 	bdev_close(bdev, desc);
9023 
9024 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
9025 }
9026 
9027 int32_t
9028 spdk_bdev_get_numa_id(struct spdk_bdev *bdev)
9029 {
9030 	if (bdev->numa.id_valid) {
9031 		return bdev->numa.id;
9032 	} else {
9033 		return SPDK_ENV_NUMA_ID_ANY;
9034 	}
9035 }
9036 
9037 static void
9038 bdev_register_finished(void *arg)
9039 {
9040 	struct spdk_bdev_desc *desc = arg;
9041 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9042 
9043 	spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
9044 
9045 	spdk_spin_lock(&g_bdev_mgr.spinlock);
9046 
9047 	bdev_close(bdev, desc);
9048 
9049 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
9050 }
9051 
9052 int
9053 spdk_bdev_register(struct spdk_bdev *bdev)
9054 {
9055 	struct spdk_bdev_desc *desc;
9056 	struct spdk_thread *thread = spdk_get_thread();
9057 	int rc;
9058 
9059 	if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) {
9060 		SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread,
9061 			    thread ? spdk_thread_get_name(thread) : "null");
9062 		return -EINVAL;
9063 	}
9064 
9065 	rc = bdev_register(bdev);
9066 	if (rc != 0) {
9067 		return rc;
9068 	}
9069 
9070 	/* A descriptor is opened to prevent bdev deletion during examination */
9071 	rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9072 	if (rc != 0) {
9073 		spdk_bdev_unregister(bdev, NULL, NULL);
9074 		return rc;
9075 	}
9076 
9077 	rc = bdev_open(bdev, false, desc);
9078 	if (rc != 0) {
9079 		bdev_desc_free(desc);
9080 		spdk_bdev_unregister(bdev, NULL, NULL);
9081 		return rc;
9082 	}
9083 
9084 	/* Examine configuration before initializing I/O */
9085 	bdev_examine(bdev);
9086 
9087 	rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
9088 	if (rc != 0) {
9089 		bdev_close(bdev, desc);
9090 		spdk_bdev_unregister(bdev, NULL, NULL);
9091 	}
9092 
9093 	return rc;
9094 }
9095 
9096 int
9097 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
9098 			    struct spdk_bdev_module *module)
9099 {
9100 	spdk_spin_lock(&bdev->internal.spinlock);
9101 
9102 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
9103 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9104 		spdk_spin_unlock(&bdev->internal.spinlock);
9105 		return -EPERM;
9106 	}
9107 
9108 	if (desc && !desc->write) {
9109 		desc->write = true;
9110 	}
9111 
9112 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE;
9113 	bdev->internal.claim.v1.module = module;
9114 
9115 	spdk_spin_unlock(&bdev->internal.spinlock);
9116 	return 0;
9117 }
9118 
9119 void
9120 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
9121 {
9122 	spdk_spin_lock(&bdev->internal.spinlock);
9123 
9124 	assert(bdev->internal.claim.v1.module != NULL);
9125 	assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE);
9126 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
9127 	bdev->internal.claim.v1.module = NULL;
9128 
9129 	spdk_spin_unlock(&bdev->internal.spinlock);
9130 }
9131 
9132 /*
9133  * Start claims v2
9134  */
9135 
9136 const char *
9137 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type)
9138 {
9139 	switch (type) {
9140 	case SPDK_BDEV_CLAIM_NONE:
9141 		return "not_claimed";
9142 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
9143 		return "exclusive_write";
9144 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9145 		return "read_many_write_one";
9146 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9147 		return "read_many_write_none";
9148 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9149 		return "read_many_write_many";
9150 	default:
9151 		break;
9152 	}
9153 	return "invalid_claim";
9154 }
9155 
9156 static bool
9157 claim_type_is_v2(enum spdk_bdev_claim_type type)
9158 {
9159 	switch (type) {
9160 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9161 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9162 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9163 		return true;
9164 	default:
9165 		break;
9166 	}
9167 	return false;
9168 }
9169 
9170 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */
9171 static bool
9172 claim_type_promotes_to_write(enum spdk_bdev_claim_type type)
9173 {
9174 	switch (type) {
9175 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9176 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9177 		return true;
9178 	default:
9179 		break;
9180 	}
9181 	return false;
9182 }
9183 
9184 void
9185 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size)
9186 {
9187 	if (opts == NULL) {
9188 		SPDK_ERRLOG("opts should not be NULL\n");
9189 		assert(opts != NULL);
9190 		return;
9191 	}
9192 	if (size == 0) {
9193 		SPDK_ERRLOG("size should not be zero\n");
9194 		assert(size != 0);
9195 		return;
9196 	}
9197 
9198 	memset(opts, 0, size);
9199 	opts->opts_size = size;
9200 
9201 #define FIELD_OK(field) \
9202         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size
9203 
9204 #define SET_FIELD(field, value) \
9205         if (FIELD_OK(field)) { \
9206                 opts->field = value; \
9207         } \
9208 
9209 	SET_FIELD(shared_claim_key, 0);
9210 
9211 #undef FIELD_OK
9212 #undef SET_FIELD
9213 }
9214 
9215 static int
9216 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst)
9217 {
9218 	if (src->opts_size == 0) {
9219 		SPDK_ERRLOG("size should not be zero\n");
9220 		return -1;
9221 	}
9222 
9223 	memset(dst, 0, sizeof(*dst));
9224 	dst->opts_size = src->opts_size;
9225 
9226 #define FIELD_OK(field) \
9227         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size
9228 
9229 #define SET_FIELD(field) \
9230         if (FIELD_OK(field)) { \
9231                 dst->field = src->field; \
9232         } \
9233 
9234 	if (FIELD_OK(name)) {
9235 		snprintf(dst->name, sizeof(dst->name), "%s", src->name);
9236 	}
9237 
9238 	SET_FIELD(shared_claim_key);
9239 
9240 	/* You should not remove this statement, but need to update the assert statement
9241 	 * if you add a new field, and also add a corresponding SET_FIELD statement */
9242 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size");
9243 
9244 #undef FIELD_OK
9245 #undef SET_FIELD
9246 	return 0;
9247 }
9248 
9249 /* Returns 0 if a read-write-once claim can be taken. */
9250 static int
9251 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9252 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9253 {
9254 	struct spdk_bdev *bdev = desc->bdev;
9255 	struct spdk_bdev_desc *open_desc;
9256 
9257 	assert(spdk_spin_held(&bdev->internal.spinlock));
9258 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);
9259 
9260 	if (opts->shared_claim_key != 0) {
9261 		SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
9262 			    bdev->name);
9263 		return -EINVAL;
9264 	}
9265 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
9266 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9267 		return -EPERM;
9268 	}
9269 	if (desc->claim != NULL) {
9270 		SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
9271 			       bdev->name, desc->claim->module->name);
9272 		return -EPERM;
9273 	}
9274 	TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9275 		if (desc != open_desc && open_desc->write) {
9276 			SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
9277 				       "another descriptor is open for writing\n",
9278 				       bdev->name);
9279 			return -EPERM;
9280 		}
9281 	}
9282 
9283 	return 0;
9284 }
9285 
9286 /* Returns 0 if a read-only-many claim can be taken. */
9287 static int
9288 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9289 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9290 {
9291 	struct spdk_bdev *bdev = desc->bdev;
9292 	struct spdk_bdev_desc *open_desc;
9293 
9294 	assert(spdk_spin_held(&bdev->internal.spinlock));
9295 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
9296 	assert(desc->claim == NULL);
9297 
9298 	if (desc->write) {
9299 		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
9300 			    bdev->name);
9301 		return -EINVAL;
9302 	}
9303 	if (opts->shared_claim_key != 0) {
9304 		SPDK_ERRLOG("%s: key option not supported with read-only-may claims\n", bdev->name);
9305 		return -EINVAL;
9306 	}
9307 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9308 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9309 			if (open_desc->write) {
9310 				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
9311 					       "another descriptor is open for writing\n",
9312 					       bdev->name);
9313 				return -EPERM;
9314 			}
9315 		}
9316 	}
9317 
9318 	return 0;
9319 }
9320 
9321 /* Returns 0 if a read-write-many claim can be taken. */
9322 static int
9323 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9324 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9325 {
9326 	struct spdk_bdev *bdev = desc->bdev;
9327 	struct spdk_bdev_desc *open_desc;
9328 
9329 	assert(spdk_spin_held(&bdev->internal.spinlock));
9330 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
9331 	assert(desc->claim == NULL);
9332 
9333 	if (opts->shared_claim_key == 0) {
9334 		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-may claims\n",
9335 			    bdev->name);
9336 		return -EINVAL;
9337 	}
9338 	switch (bdev->internal.claim_type) {
9339 	case SPDK_BDEV_CLAIM_NONE:
9340 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
9341 			if (open_desc == desc) {
9342 				continue;
9343 			}
9344 			if (open_desc->write) {
9345 				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
9346 					       "another descriptor is open for writing without a "
9347 					       "claim\n", bdev->name);
9348 				return -EPERM;
9349 			}
9350 		}
9351 		break;
9352 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9353 		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
9354 			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
9355 			return -EPERM;
9356 		}
9357 		break;
9358 	default:
9359 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9360 		return -EBUSY;
9361 	}
9362 
9363 	return 0;
9364 }
9365 
9366 /* Updates desc and its bdev with a v2 claim. */
9367 static int
9368 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9369 	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
9370 {
9371 	struct spdk_bdev *bdev = desc->bdev;
9372 	struct spdk_bdev_module_claim *claim;
9373 
9374 	assert(spdk_spin_held(&bdev->internal.spinlock));
9375 	assert(claim_type_is_v2(type));
9376 	assert(desc->claim == NULL);
9377 
9378 	claim = calloc(1, sizeof(*desc->claim));
9379 	if (claim == NULL) {
9380 		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
9381 		return -ENOMEM;
9382 	}
9383 	claim->module = module;
9384 	claim->desc = desc;
9385 	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
9386 	memcpy(claim->name, opts->name, sizeof(claim->name));
9387 	desc->claim = claim;
9388 
9389 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
9390 		bdev->internal.claim_type = type;
9391 		TAILQ_INIT(&bdev->internal.claim.v2.claims);
9392 		bdev->internal.claim.v2.key = opts->shared_claim_key;
9393 	}
9394 	assert(type == bdev->internal.claim_type);
9395 
9396 	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);
9397 
9398 	if (!desc->write && claim_type_promotes_to_write(type)) {
9399 		desc->write = true;
9400 	}
9401 
9402 	return 0;
9403 }
9404 
9405 int
9406 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
9407 				 struct spdk_bdev_claim_opts *_opts,
9408 				 struct spdk_bdev_module *module)
9409 {
9410 	struct spdk_bdev *bdev;
9411 	struct spdk_bdev_claim_opts opts;
9412 	int rc = 0;
9413 
9414 	if (desc == NULL) {
9415 		SPDK_ERRLOG("descriptor must not be NULL\n");
9416 		return -EINVAL;
9417 	}
9418 
9419 	bdev = desc->bdev;
9420 
9421 	if (_opts == NULL) {
9422 		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
9423 	} else if (claim_opts_copy(_opts, &opts) != 0) {
9424 		return -EINVAL;
9425 	}
9426 
9427 	spdk_spin_lock(&bdev->internal.spinlock);
9428 
9429 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
9430 	    bdev->internal.claim_type != type) {
9431 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
9432 		spdk_spin_unlock(&bdev->internal.spinlock);
9433 		return -EPERM;
9434 	}
9435 
9436 	if (claim_type_is_v2(type) && desc->claim != NULL) {
9437 		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
9438 			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
9439 		spdk_spin_unlock(&bdev->internal.spinlock);
9440 		return -EPERM;
9441 	}
9442 
9443 	switch (type) {
9444 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
9445 		spdk_spin_unlock(&bdev->internal.spinlock);
9446 		return spdk_bdev_module_claim_bdev(bdev, desc, module);
9447 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
9448 		rc = claim_verify_rwo(desc, type, &opts, module);
9449 		break;
9450 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
9451 		rc = claim_verify_rom(desc, type, &opts, module);
9452 		break;
9453 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
9454 		rc = claim_verify_rwm(desc, type, &opts, module);
9455 		break;
9456 	default:
9457 		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
9458 		rc = -ENOTSUP;
9459 	}
9460 
9461 	if (rc == 0) {
9462 		rc = claim_bdev(desc, type, &opts, module);
9463 	}
9464 
9465 	spdk_spin_unlock(&bdev->internal.spinlock);
9466 	return rc;
9467 }
9468 
9469 static void
9470 claim_reset(struct spdk_bdev *bdev)
9471 {
9472 	assert(spdk_spin_held(&bdev->internal.spinlock));
9473 	assert(claim_type_is_v2(bdev->internal.claim_type));
9474 	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));
9475 
9476 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
9477 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
9478 }
9479 
9480 static void
9481 bdev_desc_release_claims(struct spdk_bdev_desc *desc)
9482 {
9483 	struct spdk_bdev *bdev = desc->bdev;
9484 
9485 	assert(spdk_spin_held(&bdev->internal.spinlock));
9486 	assert(claim_type_is_v2(bdev->internal.claim_type));
9487 
9488 	if (bdev->internal.examine_in_progress == 0) {
9489 		TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
9490 		free(desc->claim);
9491 		if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
9492 			claim_reset(bdev);
9493 		}
9494 	} else {
9495 		/* This is a dead claim that will be cleaned up when bdev_examine() is done. */
9496 		desc->claim->module = NULL;
9497 		desc->claim->desc = NULL;
9498 	}
9499 	desc->claim = NULL;
9500 }
9501 
9502 /*
9503  * End claims v2
9504  */
9505 
9506 struct spdk_bdev *
9507 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
9508 {
9509 	assert(desc != NULL);
9510 	return desc->bdev;
9511 }
9512 
9513 int
9514 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
9515 {
9516 	struct spdk_bdev *bdev, *tmp;
9517 	struct spdk_bdev_desc *desc;
9518 	int rc = 0;
9519 
9520 	assert(fn != NULL);
9521 
9522 	spdk_spin_lock(&g_bdev_mgr.spinlock);
9523 	bdev = spdk_bdev_first();
9524 	while (bdev != NULL) {
9525 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9526 		if (rc != 0) {
9527 			break;
9528 		}
9529 		rc = bdev_open(bdev, false, desc);
9530 		if (rc != 0) {
9531 			bdev_desc_free(desc);
9532 			if (rc == -ENODEV) {
9533 				/* Ignore the error and move to the next bdev. */
9534 				rc = 0;
9535 				bdev = spdk_bdev_next(bdev);
9536 				continue;
9537 			}
9538 			break;
9539 		}
9540 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
9541 
9542 		rc = fn(ctx, bdev);
9543 
9544 		spdk_spin_lock(&g_bdev_mgr.spinlock);
9545 		tmp = spdk_bdev_next(bdev);
9546 		bdev_close(bdev, desc);
9547 		if (rc != 0) {
9548 			break;
9549 		}
9550 		bdev = tmp;
9551 	}
9552 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
9553 
9554 	return rc;
9555 }
9556 
9557 int
9558 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
9559 {
9560 	struct spdk_bdev *bdev, *tmp;
9561 	struct spdk_bdev_desc *desc;
9562 	int rc = 0;
9563 
9564 	assert(fn != NULL);
9565 
9566 	spdk_spin_lock(&g_bdev_mgr.spinlock);
9567 	bdev = spdk_bdev_first_leaf();
9568 	while (bdev != NULL) {
9569 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc);
9570 		if (rc != 0) {
9571 			break;
9572 		}
9573 		rc = bdev_open(bdev, false, desc);
9574 		if (rc != 0) {
9575 			bdev_desc_free(desc);
9576 			if (rc == -ENODEV) {
9577 				/* Ignore the error and move to the next bdev. */
9578 				rc = 0;
9579 				bdev = spdk_bdev_next_leaf(bdev);
9580 				continue;
9581 			}
9582 			break;
9583 		}
9584 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
9585 
9586 		rc = fn(ctx, bdev);
9587 
9588 		spdk_spin_lock(&g_bdev_mgr.spinlock);
9589 		tmp = spdk_bdev_next_leaf(bdev);
9590 		bdev_close(bdev, desc);
9591 		if (rc != 0) {
9592 			break;
9593 		}
9594 		bdev = tmp;
9595 	}
9596 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
9597 
9598 	return rc;
9599 }
9600 
9601 void
9602 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
9603 {
9604 	struct iovec *iovs;
9605 	int iovcnt;
9606 
9607 	if (bdev_io == NULL) {
9608 		return;
9609 	}
9610 
9611 	switch (bdev_io->type) {
9612 	case SPDK_BDEV_IO_TYPE_READ:
9613 	case SPDK_BDEV_IO_TYPE_WRITE:
9614 	case SPDK_BDEV_IO_TYPE_ZCOPY:
9615 		iovs = bdev_io->u.bdev.iovs;
9616 		iovcnt = bdev_io->u.bdev.iovcnt;
9617 		break;
9618 	default:
9619 		iovs = NULL;
9620 		iovcnt = 0;
9621 		break;
9622 	}
9623 
9624 	if (iovp) {
9625 		*iovp = iovs;
9626 	}
9627 	if (iovcntp) {
9628 		*iovcntp = iovcnt;
9629 	}
9630 }
9631 
9632 void *
9633 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
9634 {
9635 	if (bdev_io == NULL) {
9636 		return NULL;
9637 	}
9638 
9639 	if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
9640 		return NULL;
9641 	}
9642 
9643 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
9644 	    bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
9645 		return bdev_io->u.bdev.md_buf;
9646 	}
9647 
9648 	return NULL;
9649 }
9650 
9651 void *
9652 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
9653 {
9654 	if (bdev_io == NULL) {
9655 		assert(false);
9656 		return NULL;
9657 	}
9658 
9659 	return bdev_io->internal.caller_ctx;
9660 }
9661 
9662 void
9663 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
9664 {
9665 
9666 	if (spdk_bdev_module_list_find(bdev_module->name)) {
9667 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
9668 		assert(false);
9669 	}
9670 
9671 	spdk_spin_init(&bdev_module->internal.spinlock);
9672 	TAILQ_INIT(&bdev_module->internal.quiesced_ranges);
9673 
9674 	/*
9675 	 * Modules with examine callbacks must be initialized first, so they are
9676 	 *  ready to handle examine callbacks from later modules that will
9677 	 *  register physical bdevs.
9678 	 */
9679 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
9680 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9681 	} else {
9682 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
9683 	}
9684 }
9685 
9686 struct spdk_bdev_module *
9687 spdk_bdev_module_list_find(const char *name)
9688 {
9689 	struct spdk_bdev_module *bdev_module;
9690 
9691 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
9692 		if (strcmp(name, bdev_module->name) == 0) {
9693 			break;
9694 		}
9695 	}
9696 
9697 	return bdev_module;
9698 }
9699 
9700 static int
9701 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io)
9702 {
9703 	uint64_t num_blocks;
9704 	void *md_buf = NULL;
9705 
9706 	num_blocks = bdev_io->u.bdev.num_blocks;
9707 
9708 	if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
9709 		md_buf = (char *)g_bdev_mgr.zero_buffer +
9710 			 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
9711 	}
9712 
9713 	return bdev_write_blocks_with_md(bdev_io->internal.desc,
9714 					 spdk_io_channel_from_ctx(bdev_io->internal.ch),
9715 					 g_bdev_mgr.zero_buffer, md_buf,
9716 					 bdev_io->u.bdev.offset_blocks, num_blocks,
9717 					 bdev_write_zero_buffer_done, bdev_io);
9718 }
9719 
9720 static void
9721 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9722 {
9723 	struct spdk_bdev_io *parent_io = cb_arg;
9724 
9725 	spdk_bdev_free_io(bdev_io);
9726 
9727 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
9728 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
9729 }
9730 
9731 static void
9732 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
9733 {
9734 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
9735 	ctx->bdev->internal.qos_mod_in_progress = false;
9736 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9737 
9738 	if (ctx->cb_fn) {
9739 		ctx->cb_fn(ctx->cb_arg, status);
9740 	}
9741 	free(ctx);
9742 }
9743 
9744 static void
9745 bdev_disable_qos_done(void *cb_arg)
9746 {
9747 	struct set_qos_limit_ctx *ctx = cb_arg;
9748 	struct spdk_bdev *bdev = ctx->bdev;
9749 	struct spdk_bdev_qos *qos;
9750 
9751 	spdk_spin_lock(&bdev->internal.spinlock);
9752 	qos = bdev->internal.qos;
9753 	bdev->internal.qos = NULL;
9754 	spdk_spin_unlock(&bdev->internal.spinlock);
9755 
9756 	if (qos->thread != NULL) {
9757 		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
9758 		spdk_poller_unregister(&qos->poller);
9759 	}
9760 
9761 	free(qos);
9762 
9763 	bdev_set_qos_limit_done(ctx, 0);
9764 }
9765 
9766 static void
9767 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
9768 {
9769 	struct set_qos_limit_ctx *ctx = _ctx;
9770 	struct spdk_thread *thread;
9771 
9772 	spdk_spin_lock(&bdev->internal.spinlock);
9773 	thread = bdev->internal.qos->thread;
9774 	spdk_spin_unlock(&bdev->internal.spinlock);
9775 
9776 	if (thread != NULL) {
9777 		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
9778 	} else {
9779 		bdev_disable_qos_done(ctx);
9780 	}
9781 }
9782 
9783 static void
9784 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9785 		     struct spdk_io_channel *ch, void *_ctx)
9786 {
9787 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9788 	struct spdk_bdev_io *bdev_io;
9789 
9790 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
9791 
9792 	while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) {
9793 		/* Re-submit the queued I/O. */
9794 		bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io);
9795 		TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link);
9796 		_bdev_io_submit(bdev_io);
9797 	}
9798 
9799 	spdk_bdev_for_each_channel_continue(i, 0);
9800 }
9801 
9802 static void
9803 bdev_update_qos_rate_limit_msg(void *cb_arg)
9804 {
9805 	struct set_qos_limit_ctx *ctx = cb_arg;
9806 	struct spdk_bdev *bdev = ctx->bdev;
9807 
9808 	spdk_spin_lock(&bdev->internal.spinlock);
9809 	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
9810 	spdk_spin_unlock(&bdev->internal.spinlock);
9811 
9812 	bdev_set_qos_limit_done(ctx, 0);
9813 }
9814 
9815 static void
9816 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9817 		    struct spdk_io_channel *ch, void *_ctx)
9818 {
9819 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9820 
9821 	spdk_spin_lock(&bdev->internal.spinlock);
9822 	bdev_enable_qos(bdev, bdev_ch);
9823 	spdk_spin_unlock(&bdev->internal.spinlock);
9824 	spdk_bdev_for_each_channel_continue(i, 0);
9825 }
9826 
9827 static void
9828 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
9829 {
9830 	struct set_qos_limit_ctx *ctx = _ctx;
9831 
9832 	bdev_set_qos_limit_done(ctx, status);
9833 }
9834 
9835 static void
9836 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
9837 {
9838 	int i;
9839 
9840 	assert(bdev->internal.qos != NULL);
9841 
9842 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9843 		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9844 			bdev->internal.qos->rate_limits[i].limit = limits[i];
9845 
9846 			if (limits[i] == 0) {
9847 				bdev->internal.qos->rate_limits[i].limit =
9848 					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
9849 			}
9850 		}
9851 	}
9852 }
9853 
9854 void
9855 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
9856 			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
9857 {
9858 	struct set_qos_limit_ctx	*ctx;
9859 	uint32_t			limit_set_complement;
9860 	uint64_t			min_limit_per_sec;
9861 	int				i;
9862 	bool				disable_rate_limit = true;
9863 
9864 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9865 		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9866 			continue;
9867 		}
9868 
9869 		if (limits[i] > 0) {
9870 			disable_rate_limit = false;
9871 		}
9872 
9873 		if (bdev_qos_is_iops_rate_limit(i) == true) {
9874 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
9875 		} else {
9876 			if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) {
9877 				SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, "
9878 					     "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC);
9879 				limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC;
9880 			}
9881 			/* Change from megabyte to byte rate limit */
9882 			limits[i] = limits[i] * 1024 * 1024;
9883 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
9884 		}
9885 
9886 		limit_set_complement = limits[i] % min_limit_per_sec;
9887 		if (limit_set_complement) {
9888 			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
9889 				    limits[i], min_limit_per_sec);
9890 			limits[i] += min_limit_per_sec - limit_set_complement;
9891 			SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
9892 		}
9893 	}
9894 
9895 	ctx = calloc(1, sizeof(*ctx));
9896 	if (ctx == NULL) {
9897 		cb_fn(cb_arg, -ENOMEM);
9898 		return;
9899 	}
9900 
9901 	ctx->cb_fn = cb_fn;
9902 	ctx->cb_arg = cb_arg;
9903 	ctx->bdev = bdev;
9904 
9905 	spdk_spin_lock(&bdev->internal.spinlock);
9906 	if (bdev->internal.qos_mod_in_progress) {
9907 		spdk_spin_unlock(&bdev->internal.spinlock);
9908 		free(ctx);
9909 		cb_fn(cb_arg, -EAGAIN);
9910 		return;
9911 	}
9912 	bdev->internal.qos_mod_in_progress = true;
9913 
9914 	if (disable_rate_limit == true && bdev->internal.qos) {
9915 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9916 			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
9917 			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
9918 			     bdev->internal.qos->rate_limits[i].limit !=
9919 			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
9920 				disable_rate_limit = false;
9921 				break;
9922 			}
9923 		}
9924 	}
9925 
9926 	if (disable_rate_limit == false) {
9927 		if (bdev->internal.qos == NULL) {
9928 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
9929 			if (!bdev->internal.qos) {
9930 				spdk_spin_unlock(&bdev->internal.spinlock);
9931 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
9932 				bdev_set_qos_limit_done(ctx, -ENOMEM);
9933 				return;
9934 			}
9935 		}
9936 
9937 		if (bdev->internal.qos->thread == NULL) {
9938 			/* Enabling */
9939 			bdev_set_qos_rate_limits(bdev, limits);
9940 
9941 			spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
9942 						   bdev_enable_qos_done);
9943 		} else {
9944 			/* Updating */
9945 			bdev_set_qos_rate_limits(bdev, limits);
9946 
9947 			spdk_thread_send_msg(bdev->internal.qos->thread,
9948 					     bdev_update_qos_rate_limit_msg, ctx);
9949 		}
9950 	} else {
9951 		if (bdev->internal.qos != NULL) {
9952 			bdev_set_qos_rate_limits(bdev, limits);
9953 
9954 			/* Disabling */
9955 			spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
9956 						   bdev_disable_qos_msg_done);
9957 		} else {
9958 			spdk_spin_unlock(&bdev->internal.spinlock);
9959 			bdev_set_qos_limit_done(ctx, 0);
9960 			return;
9961 		}
9962 	}
9963 
9964 	spdk_spin_unlock(&bdev->internal.spinlock);
9965 }
9966 
9967 struct spdk_bdev_histogram_ctx {
9968 	spdk_bdev_histogram_status_cb cb_fn;
9969 	void *cb_arg;
9970 	struct spdk_bdev *bdev;
9971 	int status;
9972 };
9973 
9974 static void
9975 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9976 {
9977 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
9978 
9979 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
9980 	ctx->bdev->internal.histogram_in_progress = false;
9981 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9982 	ctx->cb_fn(ctx->cb_arg, ctx->status);
9983 	free(ctx);
9984 }
9985 
9986 static void
9987 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9988 			       struct spdk_io_channel *_ch, void *_ctx)
9989 {
9990 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9991 
9992 	if (ch->histogram != NULL) {
9993 		spdk_histogram_data_free(ch->histogram);
9994 		ch->histogram = NULL;
9995 	}
9996 	spdk_bdev_for_each_channel_continue(i, 0);
9997 }
9998 
9999 static void
10000 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10001 {
10002 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
10003 
10004 	if (status != 0) {
10005 		ctx->status = status;
10006 		ctx->bdev->internal.histogram_enabled = false;
10007 		spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
10008 					   bdev_histogram_disable_channel_cb);
10009 	} else {
10010 		spdk_spin_lock(&ctx->bdev->internal.spinlock);
10011 		ctx->bdev->internal.histogram_in_progress = false;
10012 		spdk_spin_unlock(&ctx->bdev->internal.spinlock);
10013 		ctx->cb_fn(ctx->cb_arg, ctx->status);
10014 		free(ctx);
10015 	}
10016 }
10017 
10018 static void
10019 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10020 			      struct spdk_io_channel *_ch, void *_ctx)
10021 {
10022 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10023 	int status = 0;
10024 
10025 	if (ch->histogram == NULL) {
10026 		ch->histogram = spdk_histogram_data_alloc();
10027 		if (ch->histogram == NULL) {
10028 			status = -ENOMEM;
10029 		}
10030 	}
10031 
10032 	spdk_bdev_for_each_channel_continue(i, status);
10033 }
10034 
10035 void
10036 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
10037 			       void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts)
10038 {
10039 	struct spdk_bdev_histogram_ctx *ctx;
10040 
10041 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
10042 	if (ctx == NULL) {
10043 		cb_fn(cb_arg, -ENOMEM);
10044 		return;
10045 	}
10046 
10047 	ctx->bdev = bdev;
10048 	ctx->status = 0;
10049 	ctx->cb_fn = cb_fn;
10050 	ctx->cb_arg = cb_arg;
10051 
10052 	spdk_spin_lock(&bdev->internal.spinlock);
10053 	if (bdev->internal.histogram_in_progress) {
10054 		spdk_spin_unlock(&bdev->internal.spinlock);
10055 		free(ctx);
10056 		cb_fn(cb_arg, -EAGAIN);
10057 		return;
10058 	}
10059 
10060 	bdev->internal.histogram_in_progress = true;
10061 	spdk_spin_unlock(&bdev->internal.spinlock);
10062 
10063 	bdev->internal.histogram_enabled = enable;
10064 	bdev->internal.histogram_io_type = opts->io_type;
10065 
10066 	if (enable) {
10067 		/* Allocate histogram for each channel */
10068 		spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
10069 					   bdev_histogram_enable_channel_cb);
10070 	} else {
10071 		spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
10072 					   bdev_histogram_disable_channel_cb);
10073 	}
10074 }
10075 
10076 void
10077 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size)
10078 {
10079 	if (opts == NULL) {
10080 		SPDK_ERRLOG("opts should not be NULL\n");
10081 		assert(opts != NULL);
10082 		return;
10083 	}
10084 	if (size == 0) {
10085 		SPDK_ERRLOG("size should not be zero\n");
10086 		assert(size != 0);
10087 		return;
10088 	}
10089 
10090 	memset(opts, 0, size);
10091 	opts->size = size;
10092 
10093 #define FIELD_OK(field) \
10094         offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size
10095 
10096 #define SET_FIELD(field, value) \
10097         if (FIELD_OK(field)) { \
10098                 opts->field = value; \
10099         } \
10100 
10101 	SET_FIELD(io_type, 0);
10102 
10103 	/* You should not remove this statement, but need to update the assert statement
10104 	 * if you add a new field, and also add a corresponding SET_FIELD statement */
10105 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size");
10106 
10107 #undef FIELD_OK
10108 #undef SET_FIELD
10109 }
10110 
10111 void
10112 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
10113 			   void *cb_arg, bool enable)
10114 {
10115 	struct spdk_bdev_enable_histogram_opts opts;
10116 
10117 	spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts));
10118 	spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts);
10119 }
10120 
10121 struct spdk_bdev_histogram_data_ctx {
10122 	spdk_bdev_histogram_data_cb cb_fn;
10123 	void *cb_arg;
10124 	struct spdk_bdev *bdev;
10125 	/** merged histogram data from all channels */
10126 	struct spdk_histogram_data	*histogram;
10127 };
10128 
10129 static void
10130 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10131 {
10132 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
10133 
10134 	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
10135 	free(ctx);
10136 }
10137 
10138 static void
10139 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10140 			   struct spdk_io_channel *_ch, void *_ctx)
10141 {
10142 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10143 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
10144 	int status = 0;
10145 
10146 	if (ch->histogram == NULL) {
10147 		status = -EFAULT;
10148 	} else {
10149 		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
10150 	}
10151 
10152 	spdk_bdev_for_each_channel_continue(i, status);
10153 }
10154 
10155 void
10156 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
10157 			spdk_bdev_histogram_data_cb cb_fn,
10158 			void *cb_arg)
10159 {
10160 	struct spdk_bdev_histogram_data_ctx *ctx;
10161 
10162 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
10163 	if (ctx == NULL) {
10164 		cb_fn(cb_arg, -ENOMEM, NULL);
10165 		return;
10166 	}
10167 
10168 	ctx->bdev = bdev;
10169 	ctx->cb_fn = cb_fn;
10170 	ctx->cb_arg = cb_arg;
10171 
10172 	ctx->histogram = histogram;
10173 
10174 	spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
10175 				   bdev_histogram_get_channel_cb);
10176 }
10177 
10178 void
10179 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
10180 				void *cb_arg)
10181 {
10182 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
10183 	int status = 0;
10184 
10185 	assert(cb_fn != NULL);
10186 
10187 	if (bdev_ch->histogram == NULL) {
10188 		status = -EFAULT;
10189 	}
10190 	cb_fn(cb_arg, status, bdev_ch->histogram);
10191 }
10192 
10193 size_t
10194 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
10195 			   size_t max_events)
10196 {
10197 	struct media_event_entry *entry;
10198 	size_t num_events = 0;
10199 
10200 	for (; num_events < max_events; ++num_events) {
10201 		entry = TAILQ_FIRST(&desc->pending_media_events);
10202 		if (entry == NULL) {
10203 			break;
10204 		}
10205 
10206 		events[num_events] = entry->event;
10207 		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
10208 		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
10209 	}
10210 
10211 	return num_events;
10212 }
10213 
10214 int
10215 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
10216 			    size_t num_events)
10217 {
10218 	struct spdk_bdev_desc *desc;
10219 	struct media_event_entry *entry;
10220 	size_t event_id;
10221 	int rc = 0;
10222 
10223 	assert(bdev->media_events);
10224 
10225 	spdk_spin_lock(&bdev->internal.spinlock);
10226 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
10227 		if (desc->write) {
10228 			break;
10229 		}
10230 	}
10231 
10232 	if (desc == NULL || desc->media_events_buffer == NULL) {
10233 		rc = -ENODEV;
10234 		goto out;
10235 	}
10236 
10237 	for (event_id = 0; event_id < num_events; ++event_id) {
10238 		entry = TAILQ_FIRST(&desc->free_media_events);
10239 		if (entry == NULL) {
10240 			break;
10241 		}
10242 
10243 		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
10244 		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
10245 		entry->event = events[event_id];
10246 	}
10247 
10248 	rc = event_id;
10249 out:
10250 	spdk_spin_unlock(&bdev->internal.spinlock);
10251 	return rc;
10252 }
10253 
10254 static void
10255 _media_management_notify(void *arg)
10256 {
10257 	struct spdk_bdev_desc *desc = arg;
10258 
10259 	_event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
10260 }
10261 
10262 void
10263 spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
10264 {
10265 	struct spdk_bdev_desc *desc;
10266 
10267 	spdk_spin_lock(&bdev->internal.spinlock);
10268 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
10269 		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
10270 			event_notify(desc, _media_management_notify);
10271 		}
10272 	}
10273 	spdk_spin_unlock(&bdev->internal.spinlock);
10274 }
10275 
10276 struct locked_lba_range_ctx {
10277 	struct lba_range		range;
10278 	struct lba_range		*current_range;
10279 	struct lba_range		*owner_range;
10280 	struct spdk_poller		*poller;
10281 	lock_range_cb			cb_fn;
10282 	void				*cb_arg;
10283 };
10284 
10285 static void
10286 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10287 {
10288 	struct locked_lba_range_ctx *ctx = _ctx;
10289 
10290 	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
10291 	free(ctx);
10292 }
10293 
10294 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
10295 		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
10296 
10297 static void
10298 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10299 {
10300 	struct locked_lba_range_ctx *ctx = _ctx;
10301 
10302 	if (status == -ENOMEM) {
10303 		/* One of the channels could not allocate a range object.
10304 		 * So we have to go back and clean up any ranges that were
10305 		 * allocated successfully before we return error status to
10306 		 * the caller.  We can reuse the unlock function to do that
10307 		 * clean up.
10308 		 */
10309 		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
10310 					   bdev_lock_error_cleanup_cb);
10311 		return;
10312 	}
10313 
10314 	/* All channels have locked this range and no I/O overlapping the range
10315 	 * are outstanding!  Set the owner_ch for the range object for the
10316 	 * locking channel, so that this channel will know that it is allowed
10317 	 * to write to this range.
10318 	 */
10319 	if (ctx->owner_range != NULL) {
10320 		ctx->owner_range->owner_ch = ctx->range.owner_ch;
10321 	}
10322 
10323 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
10324 
10325 	/* Don't free the ctx here.  Its range is in the bdev's global list of
10326 	 * locked ranges still, and will be removed and freed when this range
10327 	 * is later unlocked.
10328 	 */
10329 }
10330 
10331 static int
10332 bdev_lock_lba_range_check_io(void *_i)
10333 {
10334 	struct spdk_bdev_channel_iter *i = _i;
10335 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
10336 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10337 	struct locked_lba_range_ctx *ctx = i->ctx;
10338 	struct lba_range *range = ctx->current_range;
10339 	struct spdk_bdev_io *bdev_io;
10340 
10341 	spdk_poller_unregister(&ctx->poller);
10342 
10343 	/* The range is now in the locked_ranges, so no new IO can be submitted to this
10344 	 * range.  But we need to wait until any outstanding IO overlapping with this range
10345 	 * are completed.
10346 	 */
10347 	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
10348 		if (bdev_io_range_is_locked(bdev_io, range)) {
10349 			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
10350 			return SPDK_POLLER_BUSY;
10351 		}
10352 	}
10353 
10354 	spdk_bdev_for_each_channel_continue(i, 0);
10355 	return SPDK_POLLER_BUSY;
10356 }
10357 
10358 static void
10359 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10360 				struct spdk_io_channel *_ch, void *_ctx)
10361 {
10362 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10363 	struct locked_lba_range_ctx *ctx = _ctx;
10364 	struct lba_range *range;
10365 
10366 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10367 		if (range->length == ctx->range.length &&
10368 		    range->offset == ctx->range.offset &&
10369 		    range->locked_ctx == ctx->range.locked_ctx) {
10370 			/* This range already exists on this channel, so don't add
10371 			 * it again.  This can happen when a new channel is created
10372 			 * while the for_each_channel operation is in progress.
10373 			 * Do not check for outstanding I/O in that case, since the
10374 			 * range was locked before any I/O could be submitted to the
10375 			 * new channel.
10376 			 */
10377 			spdk_bdev_for_each_channel_continue(i, 0);
10378 			return;
10379 		}
10380 	}
10381 
10382 	range = calloc(1, sizeof(*range));
10383 	if (range == NULL) {
10384 		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
10385 		return;
10386 	}
10387 
10388 	range->length = ctx->range.length;
10389 	range->offset = ctx->range.offset;
10390 	range->locked_ctx = ctx->range.locked_ctx;
10391 	range->quiesce = ctx->range.quiesce;
10392 	ctx->current_range = range;
10393 	if (ctx->range.owner_ch == ch) {
10394 		/* This is the range object for the channel that will hold
10395 		 * the lock.  Store it in the ctx object so that we can easily
10396 		 * set its owner_ch after the lock is finally acquired.
10397 		 */
10398 		ctx->owner_range = range;
10399 	}
10400 	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
10401 	bdev_lock_lba_range_check_io(i);
10402 }
10403 
10404 static void
10405 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
10406 {
10407 	assert(spdk_get_thread() == ctx->range.owner_thread);
10408 	assert(ctx->range.owner_ch == NULL ||
10409 	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);
10410 
10411 	/* We will add a copy of this range to each channel now. */
10412 	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
10413 				   bdev_lock_lba_range_cb);
10414 }
10415 
10416 static bool
10417 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
10418 {
10419 	struct lba_range *r;
10420 
10421 	TAILQ_FOREACH(r, tailq, tailq) {
10422 		if (bdev_lba_range_overlapped(range, r)) {
10423 			return true;
10424 		}
10425 	}
10426 	return false;
10427 }
10428 
10429 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status);
10430 
10431 static int
10432 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
10433 		     uint64_t offset, uint64_t length,
10434 		     lock_range_cb cb_fn, void *cb_arg)
10435 {
10436 	struct locked_lba_range_ctx *ctx;
10437 
10438 	ctx = calloc(1, sizeof(*ctx));
10439 	if (ctx == NULL) {
10440 		return -ENOMEM;
10441 	}
10442 
10443 	ctx->range.offset = offset;
10444 	ctx->range.length = length;
10445 	ctx->range.owner_thread = spdk_get_thread();
10446 	ctx->range.owner_ch = ch;
10447 	ctx->range.locked_ctx = cb_arg;
10448 	ctx->range.bdev = bdev;
10449 	ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked);
10450 	ctx->cb_fn = cb_fn;
10451 	ctx->cb_arg = cb_arg;
10452 
10453 	spdk_spin_lock(&bdev->internal.spinlock);
10454 	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
10455 		/* There is an active lock overlapping with this range.
10456 		 * Put it on the pending list until this range no
10457 		 * longer overlaps with another.
10458 		 */
10459 		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
10460 	} else {
10461 		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
10462 		bdev_lock_lba_range_ctx(bdev, ctx);
10463 	}
10464 	spdk_spin_unlock(&bdev->internal.spinlock);
10465 	return 0;
10466 }
10467 
10468 static int
10469 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10470 		    uint64_t offset, uint64_t length,
10471 		    lock_range_cb cb_fn, void *cb_arg)
10472 {
10473 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10474 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10475 
10476 	if (cb_arg == NULL) {
10477 		SPDK_ERRLOG("cb_arg must not be NULL\n");
10478 		return -EINVAL;
10479 	}
10480 
10481 	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
10482 }
10483 
10484 static void
10485 bdev_lock_lba_range_ctx_msg(void *_ctx)
10486 {
10487 	struct locked_lba_range_ctx *ctx = _ctx;
10488 
10489 	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
10490 }
10491 
10492 static void
10493 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
10494 {
10495 	struct locked_lba_range_ctx *ctx = _ctx;
10496 	struct locked_lba_range_ctx *pending_ctx;
10497 	struct lba_range *range, *tmp;
10498 
10499 	spdk_spin_lock(&bdev->internal.spinlock);
10500 	/* Check if there are any pending locked ranges that overlap with this range
10501 	 * that was just unlocked.  If there are, check that it doesn't overlap with any
10502 	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
10503 	 * the lock process.
10504 	 */
10505 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
10506 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
10507 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
10508 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
10509 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10510 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
10511 			spdk_thread_send_msg(pending_ctx->range.owner_thread,
10512 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
10513 		}
10514 	}
10515 	spdk_spin_unlock(&bdev->internal.spinlock);
10516 
10517 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
10518 	free(ctx);
10519 }
10520 
10521 static void
10522 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10523 				  struct spdk_io_channel *_ch, void *_ctx)
10524 {
10525 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10526 	struct locked_lba_range_ctx *ctx = _ctx;
10527 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
10528 	struct spdk_bdev_io *bdev_io;
10529 	struct lba_range *range;
10530 
10531 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10532 		if (ctx->range.offset == range->offset &&
10533 		    ctx->range.length == range->length &&
10534 		    ctx->range.locked_ctx == range->locked_ctx) {
10535 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
10536 			free(range);
10537 			break;
10538 		}
10539 	}
10540 
10541 	/* Note: we should almost always be able to assert that the range specified
10542 	 * was found.  But there are some very rare corner cases where a new channel
10543 	 * gets created simultaneously with a range unlock, where this function
10544 	 * would execute on that new channel and wouldn't have the range.
10545 	 * We also use this to clean up range allocations when a later allocation
10546 	 * fails in the locking path.
10547 	 * So we can't actually assert() here.
10548 	 */
10549 
10550 	/* Swap the locked IO into a temporary list, and then try to submit them again.
10551 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
10552 	 * with the range that was just unlocked, but this isn't a performance path so
10553 	 * we go for simplicity here.
10554 	 */
10555 	TAILQ_INIT(&io_locked);
10556 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
10557 	while (!TAILQ_EMPTY(&io_locked)) {
10558 		bdev_io = TAILQ_FIRST(&io_locked);
10559 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
10560 		bdev_io_submit(bdev_io);
10561 	}
10562 
10563 	spdk_bdev_for_each_channel_continue(i, 0);
10564 }
10565 
10566 static int
10567 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
10568 		       lock_range_cb cb_fn, void *cb_arg)
10569 {
10570 	struct locked_lba_range_ctx *ctx;
10571 	struct lba_range *range;
10572 
10573 	spdk_spin_lock(&bdev->internal.spinlock);
10574 	/* To start the unlock the process, we find the range in the bdev's locked_ranges
10575 	 * and remove it. This ensures new channels don't inherit the locked range.
10576 	 * Then we will send a message to each channel to remove the range from its
10577 	 * per-channel list.
10578 	 */
10579 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
10580 		if (range->offset == offset && range->length == length &&
10581 		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
10582 			break;
10583 		}
10584 	}
10585 	if (range == NULL) {
10586 		assert(false);
10587 		spdk_spin_unlock(&bdev->internal.spinlock);
10588 		return -EINVAL;
10589 	}
10590 	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
10591 	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
10592 	spdk_spin_unlock(&bdev->internal.spinlock);
10593 
10594 	ctx->cb_fn = cb_fn;
10595 	ctx->cb_arg = cb_arg;
10596 
10597 	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
10598 				   bdev_unlock_lba_range_cb);
10599 	return 0;
10600 }
10601 
10602 static int
10603 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
10604 		      uint64_t offset, uint64_t length,
10605 		      lock_range_cb cb_fn, void *cb_arg)
10606 {
10607 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10608 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
10609 	struct lba_range *range;
10610 	bool range_found = false;
10611 
10612 	/* Let's make sure the specified channel actually has a lock on
10613 	 * the specified range.  Note that the range must match exactly.
10614 	 */
10615 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
10616 		if (range->offset == offset && range->length == length &&
10617 		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
10618 			range_found = true;
10619 			break;
10620 		}
10621 	}
10622 
10623 	if (!range_found) {
10624 		return -EINVAL;
10625 	}
10626 
10627 	return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
10628 }
10629 
10630 struct bdev_quiesce_ctx {
10631 	spdk_bdev_quiesce_cb cb_fn;
10632 	void *cb_arg;
10633 };
10634 
10635 static void
10636 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
10637 {
10638 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10639 
10640 	if (quiesce_ctx->cb_fn != NULL) {
10641 		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10642 	}
10643 
10644 	free(quiesce_ctx);
10645 }
10646 
10647 static void
10648 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
10649 {
10650 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
10651 	struct spdk_bdev_module *module = range->bdev->module;
10652 
10653 	if (status != 0) {
10654 		if (quiesce_ctx->cb_fn != NULL) {
10655 			quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
10656 		}
10657 		free(quiesce_ctx);
10658 		return;
10659 	}
10660 
10661 	spdk_spin_lock(&module->internal.spinlock);
10662 	TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
10663 	spdk_spin_unlock(&module->internal.spinlock);
10664 
10665 	if (quiesce_ctx->cb_fn != NULL) {
10666 		/* copy the context in case the range is unlocked by the callback */
10667 		struct bdev_quiesce_ctx tmp = *quiesce_ctx;
10668 
10669 		quiesce_ctx->cb_fn = NULL;
10670 		quiesce_ctx->cb_arg = NULL;
10671 
10672 		tmp.cb_fn(tmp.cb_arg, status);
10673 	}
10674 	/* quiesce_ctx will be freed on unquiesce */
10675 }
10676 
10677 static int
10678 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10679 		   uint64_t offset, uint64_t length,
10680 		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
10681 		   bool unquiesce)
10682 {
10683 	struct bdev_quiesce_ctx *quiesce_ctx;
10684 	int rc;
10685 
10686 	if (module != bdev->module) {
10687 		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
10688 		return -EINVAL;
10689 	}
10690 
10691 	if (!bdev_io_valid_blocks(bdev, offset, length)) {
10692 		return -EINVAL;
10693 	}
10694 
10695 	if (unquiesce) {
10696 		struct lba_range *range;
10697 
10698 		/* Make sure the specified range is actually quiesced in the specified module and
10699 		 * then remove it from the list. Note that the range must match exactly.
10700 		 */
10701 		spdk_spin_lock(&module->internal.spinlock);
10702 		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
10703 			if (range->bdev == bdev && range->offset == offset && range->length == length) {
10704 				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
10705 				break;
10706 			}
10707 		}
10708 		spdk_spin_unlock(&module->internal.spinlock);
10709 
10710 		if (range == NULL) {
10711 			SPDK_ERRLOG("The range to unquiesce was not found.\n");
10712 			return -EINVAL;
10713 		}
10714 
10715 		quiesce_ctx = range->locked_ctx;
10716 		quiesce_ctx->cb_fn = cb_fn;
10717 		quiesce_ctx->cb_arg = cb_arg;
10718 
10719 		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
10720 	} else {
10721 		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
10722 		if (quiesce_ctx == NULL) {
10723 			return -ENOMEM;
10724 		}
10725 
10726 		quiesce_ctx->cb_fn = cb_fn;
10727 		quiesce_ctx->cb_arg = cb_arg;
10728 
10729 		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
10730 		if (rc != 0) {
10731 			free(quiesce_ctx);
10732 		}
10733 	}
10734 
10735 	return rc;
10736 }
10737 
10738 int
10739 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10740 		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10741 {
10742 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
10743 }
10744 
10745 int
10746 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10747 		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10748 {
10749 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
10750 }
10751 
10752 int
10753 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10754 			uint64_t offset, uint64_t length,
10755 			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10756 {
10757 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
10758 }
10759 
10760 int
10761 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
10762 			  uint64_t offset, uint64_t length,
10763 			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
10764 {
10765 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
10766 }
10767 
10768 int
10769 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
10770 			     int array_size)
10771 {
10772 	if (!bdev) {
10773 		return -EINVAL;
10774 	}
10775 
10776 	if (bdev->fn_table->get_memory_domains) {
10777 		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
10778 	}
10779 
10780 	return 0;
10781 }
10782 
10783 struct spdk_bdev_for_each_io_ctx {
10784 	void *ctx;
10785 	spdk_bdev_io_fn fn;
10786 	spdk_bdev_for_each_io_cb cb;
10787 };
10788 
10789 static void
10790 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
10791 			 struct spdk_io_channel *io_ch, void *_ctx)
10792 {
10793 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10794 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
10795 	struct spdk_bdev_io *bdev_io;
10796 	int rc = 0;
10797 
10798 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
10799 		rc = ctx->fn(ctx->ctx, bdev_io);
10800 		if (rc != 0) {
10801 			break;
10802 		}
10803 	}
10804 
10805 	spdk_bdev_for_each_channel_continue(i, rc);
10806 }
10807 
10808 static void
10809 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
10810 {
10811 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
10812 
10813 	ctx->cb(ctx->ctx, status);
10814 
10815 	free(ctx);
10816 }
10817 
10818 void
10819 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
10820 			   spdk_bdev_for_each_io_cb cb)
10821 {
10822 	struct spdk_bdev_for_each_io_ctx *ctx;
10823 
10824 	assert(fn != NULL && cb != NULL);
10825 
10826 	ctx = calloc(1, sizeof(*ctx));
10827 	if (ctx == NULL) {
10828 		SPDK_ERRLOG("Failed to allocate context.\n");
10829 		cb(_ctx, -ENOMEM);
10830 		return;
10831 	}
10832 
10833 	ctx->ctx = _ctx;
10834 	ctx->fn = fn;
10835 	ctx->cb = cb;
10836 
10837 	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
10838 				   bdev_for_each_io_done);
10839 }
10840 
10841 void
10842 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
10843 {
10844 	spdk_for_each_channel_continue(iter->i, status);
10845 }
10846 
10847 static struct spdk_bdev *
10848 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
10849 {
10850 	void *io_device = spdk_io_channel_iter_get_io_device(i);
10851 
10852 	return __bdev_from_io_dev(io_device);
10853 }
10854 
10855 static void
10856 bdev_each_channel_msg(struct spdk_io_channel_iter *i)
10857 {
10858 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10859 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10860 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
10861 
10862 	iter->i = i;
10863 	iter->fn(iter, bdev, ch, iter->ctx);
10864 }
10865 
10866 static void
10867 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
10868 {
10869 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
10870 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
10871 
10872 	iter->i = i;
10873 	iter->cpl(bdev, iter->ctx, status);
10874 
10875 	free(iter);
10876 }
10877 
10878 void
10879 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
10880 			   void *ctx, spdk_bdev_for_each_channel_done cpl)
10881 {
10882 	struct spdk_bdev_channel_iter *iter;
10883 
10884 	assert(bdev != NULL && fn != NULL && ctx != NULL);
10885 
10886 	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
10887 	if (iter == NULL) {
10888 		SPDK_ERRLOG("Unable to allocate iterator\n");
10889 		assert(false);
10890 		return;
10891 	}
10892 
10893 	iter->fn = fn;
10894 	iter->cpl = cpl;
10895 	iter->ctx = ctx;
10896 
10897 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
10898 			      iter, bdev_each_channel_cpl);
10899 }
10900 
10901 static void
10902 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10903 {
10904 	struct spdk_bdev_io *parent_io = cb_arg;
10905 
10906 	spdk_bdev_free_io(bdev_io);
10907 
10908 	/* Check return status of write */
10909 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
10910 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
10911 }
10912 
10913 static void
10914 bdev_copy_do_write(void *_bdev_io)
10915 {
10916 	struct spdk_bdev_io *bdev_io = _bdev_io;
10917 	int rc;
10918 
10919 	/* Write blocks */
10920 	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
10921 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
10922 					    bdev_io->u.bdev.iovs[0].iov_base,
10923 					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
10924 					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);
10925 
10926 	if (rc == -ENOMEM) {
10927 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
10928 	} else if (rc != 0) {
10929 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10930 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10931 	}
10932 }
10933 
10934 static void
10935 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10936 {
10937 	struct spdk_bdev_io *parent_io = cb_arg;
10938 
10939 	spdk_bdev_free_io(bdev_io);
10940 
10941 	/* Check return status of read */
10942 	if (!success) {
10943 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10944 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
10945 		return;
10946 	}
10947 
10948 	/* Do write */
10949 	bdev_copy_do_write(parent_io);
10950 }
10951 
10952 static void
10953 bdev_copy_do_read(void *_bdev_io)
10954 {
10955 	struct spdk_bdev_io *bdev_io = _bdev_io;
10956 	int rc;
10957 
10958 	/* Read blocks */
10959 	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
10960 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
10961 					   bdev_io->u.bdev.iovs[0].iov_base,
10962 					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
10963 					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);
10964 
10965 	if (rc == -ENOMEM) {
10966 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
10967 	} else if (rc != 0) {
10968 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10969 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10970 	}
10971 }
10972 
10973 static void
10974 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
10975 {
10976 	if (!success) {
10977 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10978 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10979 		return;
10980 	}
10981 
10982 	bdev_copy_do_read(bdev_io);
10983 }
10984 
10985 int
10986 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
10987 		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
10988 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
10989 {
10990 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10991 	struct spdk_bdev_io *bdev_io;
10992 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
10993 
10994 	if (!desc->write) {
10995 		return -EBADF;
10996 	}
10997 
10998 	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
10999 	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
11000 		SPDK_DEBUGLOG(bdev,
11001 			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
11002 			      dst_offset_blocks, src_offset_blocks, num_blocks);
11003 		return -EINVAL;
11004 	}
11005 
11006 	bdev_io = bdev_channel_get_io(channel);
11007 	if (!bdev_io) {
11008 		return -ENOMEM;
11009 	}
11010 
11011 	bdev_io->internal.ch = channel;
11012 	bdev_io->internal.desc = desc;
11013 	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
11014 
11015 	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
11016 	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
11017 	bdev_io->u.bdev.num_blocks = num_blocks;
11018 	bdev_io->u.bdev.memory_domain = NULL;
11019 	bdev_io->u.bdev.memory_domain_ctx = NULL;
11020 	bdev_io->u.bdev.iovs = NULL;
11021 	bdev_io->u.bdev.iovcnt = 0;
11022 	bdev_io->u.bdev.md_buf = NULL;
11023 	bdev_io->u.bdev.accel_sequence = NULL;
11024 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
11025 
11026 	if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) {
11027 		spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io);
11028 		return 0;
11029 	}
11030 
11031 
11032 	/* If the copy size is large and should be split, use the generic split logic
11033 	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
11034 	 *
11035 	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
11036 	 * emulate it using regular read and write requests otherwise.
11037 	 */
11038 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
11039 	    bdev_io->internal.f.split) {
11040 		bdev_io_submit(bdev_io);
11041 		return 0;
11042 	}
11043 
11044 	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));
11045 
11046 	return 0;
11047 }
11048 
11049 SPDK_LOG_REGISTER_COMPONENT(bdev)
11050 
11051 static void
11052 bdev_trace(void)
11053 {
11054 	struct spdk_trace_tpoint_opts opts[] = {
11055 		{
11056 			"BDEV_IO_START", TRACE_BDEV_IO_START,
11057 			OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1,
11058 			{
11059 				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
11060 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
11061 				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
11062 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
11063 			}
11064 		},
11065 		{
11066 			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
11067 			OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0,
11068 			{
11069 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
11070 				{ "qd", SPDK_TRACE_ARG_TYPE_INT, 4 }
11071 			}
11072 		},
11073 		{
11074 			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
11075 			OWNER_TYPE_BDEV, OBJECT_NONE, 0,
11076 			{
11077 				{ "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
11078 			}
11079 		},
11080 		{
11081 			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
11082 			OWNER_TYPE_BDEV, OBJECT_NONE, 0,
11083 			{
11084 				{ "tid", SPDK_TRACE_ARG_TYPE_INT, 8 }
11085 			}
11086 		},
11087 	};
11088 
11089 
11090 	spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b');
11091 	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
11092 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
11093 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
11094 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
11095 	spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0);
11096 	spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0);
11097 	spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0);
11098 	spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0);
11099 }
11100 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
11101