xref: /spdk/lib/bdev/bdev.c (revision 1b1967bdd61daa5ec110e2ca6c73c7b38a60bb89)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 
11 #include "spdk/accel.h"
12 #include "spdk/config.h"
13 #include "spdk/env.h"
14 #include "spdk/thread.h"
15 #include "spdk/likely.h"
16 #include "spdk/queue.h"
17 #include "spdk/nvme_spec.h"
18 #include "spdk/scsi_spec.h"
19 #include "spdk/notify.h"
20 #include "spdk/util.h"
21 #include "spdk/trace.h"
22 #include "spdk/dma.h"
23 
24 #include "spdk/bdev_module.h"
25 #include "spdk/log.h"
26 #include "spdk/string.h"
27 
28 #include "bdev_internal.h"
29 #include "spdk_internal/trace_defs.h"
30 #include "spdk_internal/assert.h"
31 
32 #ifdef SPDK_CONFIG_VTUNE
33 #include "ittnotify.h"
34 #include "ittnotify_types.h"
35 int __itt_init_ittlib(const char *, __itt_group_id);
36 #endif
37 
38 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
39 #define SPDK_BDEV_IO_CACHE_SIZE			256
40 #define SPDK_BDEV_AUTO_EXAMINE			true
41 #define BUF_SMALL_POOL_SIZE			8191
42 #define BUF_LARGE_POOL_SIZE			1023
43 #define BUF_SMALL_CACHE_SIZE			128
44 #define BUF_LARGE_CACHE_SIZE			16
45 #define NOMEM_THRESHOLD_COUNT			8
46 
47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
54 
55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command
56  * when splitting into children requests at a time.
57  */
58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
60 
61 /* The maximum number of children requests for a COPY command
62  * when splitting into children requests at a time.
63  */
64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
65 
66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
67 	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
68 #ifdef DEBUG
69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
70 	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
71 #else
72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
73 #endif
74 
75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
76 				const char *detail, struct spdk_bdev *bdev);
77 
78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "v23.09", 0);
79 
80 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
81 				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
82 				    };
83 
84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
85 
86 RB_HEAD(bdev_name_tree, spdk_bdev_name);
87 
88 static int
89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
90 {
91 	return strcmp(name1->name, name2->name);
92 }
93 
94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
95 
96 struct spdk_bdev_mgr {
97 	struct spdk_mempool *bdev_io_pool;
98 
99 	void *zero_buffer;
100 
101 	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
102 
103 	struct spdk_bdev_list bdevs;
104 	struct bdev_name_tree bdev_names;
105 
106 	bool init_complete;
107 	bool module_init_complete;
108 
109 	struct spdk_spinlock spinlock;
110 
111 #ifdef SPDK_CONFIG_VTUNE
112 	__itt_domain	*domain;
113 #endif
114 };
115 
116 static struct spdk_bdev_mgr g_bdev_mgr = {
117 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
118 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
119 	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
120 	.init_complete = false,
121 	.module_init_complete = false,
122 };
123 
124 static void
125 __attribute__((constructor))
126 _bdev_init(void)
127 {
128 	spdk_spin_init(&g_bdev_mgr.spinlock);
129 }
130 
131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);
132 
133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
134 
135 struct lba_range {
136 	struct spdk_bdev		*bdev;
137 	uint64_t			offset;
138 	uint64_t			length;
139 	void				*locked_ctx;
140 	struct spdk_thread		*owner_thread;
141 	struct spdk_bdev_channel	*owner_ch;
142 	TAILQ_ENTRY(lba_range)		tailq;
143 	TAILQ_ENTRY(lba_range)		tailq_module;
144 };
145 
146 static struct spdk_bdev_opts	g_bdev_opts = {
147 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
148 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
149 	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
150 };
151 
152 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
153 static void			*g_init_cb_arg = NULL;
154 
155 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
156 static void			*g_fini_cb_arg = NULL;
157 static struct spdk_thread	*g_fini_thread = NULL;
158 
159 struct spdk_bdev_qos_limit {
160 	/** IOs or bytes allowed per second (i.e., 1s). */
161 	uint64_t limit;
162 
163 	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
164 	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
165 	 *  some bytes are remaining, but the I/O is bigger than that amount. The
166 	 *  excess will be deducted from the next timeslice.
167 	 */
168 	int64_t remaining_this_timeslice;
169 
170 	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
171 	uint32_t min_per_timeslice;
172 
173 	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
174 	uint32_t max_per_timeslice;
175 
176 	/** Function to check whether to queue the IO. */
177 	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
178 
179 	/** Function to update for the submitted IO. */
180 	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
181 };
182 
183 struct spdk_bdev_qos {
184 	/** Types of structure of rate limits. */
185 	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
186 
187 	/** The channel that all I/O are funneled through. */
188 	struct spdk_bdev_channel *ch;
189 
190 	/** The thread on which the poller is running. */
191 	struct spdk_thread *thread;
192 
193 	/** Queue of I/O waiting to be issued. */
194 	bdev_io_tailq_t queued;
195 
196 	/** Size of a timeslice in tsc ticks. */
197 	uint64_t timeslice_size;
198 
199 	/** Timestamp of start of last timeslice. */
200 	uint64_t last_timeslice;
201 
202 	/** Poller that processes queued I/O commands each time slice. */
203 	struct spdk_poller *poller;
204 };
205 
206 struct spdk_bdev_mgmt_channel {
207 	/*
208 	 * Each thread keeps a cache of bdev_io - this allows
209 	 *  bdev threads which are *not* DPDK threads to still
210 	 *  benefit from a per-thread bdev_io cache.  Without
211 	 *  this, non-DPDK threads fetching from the mempool
212 	 *  incur a cmpxchg on get and put.
213 	 */
214 	bdev_io_stailq_t per_thread_cache;
215 	uint32_t	per_thread_cache_count;
216 	uint32_t	bdev_io_cache_size;
217 
218 	struct spdk_iobuf_channel iobuf;
219 
220 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
221 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
222 };
223 
224 /*
225  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
226  * will queue here their IO that awaits retry. It makes it possible to retry sending
227  * IO to one bdev after IO from other bdev completes.
228  */
229 struct spdk_bdev_shared_resource {
230 	/* The bdev management channel */
231 	struct spdk_bdev_mgmt_channel *mgmt_ch;
232 
233 	/*
234 	 * Count of I/O submitted to bdev module and waiting for completion.
235 	 * Incremented before submit_request() is called on an spdk_bdev_io.
236 	 */
237 	uint64_t		io_outstanding;
238 
239 	/*
240 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
241 	 *  on this channel.
242 	 */
243 	bdev_io_tailq_t		nomem_io;
244 
245 	/*
246 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
247 	 */
248 	uint64_t		nomem_threshold;
249 
250 	/* I/O channel allocated by a bdev module */
251 	struct spdk_io_channel	*shared_ch;
252 
253 	/* Refcount of bdev channels using this resource */
254 	uint32_t		ref;
255 
256 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
257 };
258 
259 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
260 #define BDEV_CH_QOS_ENABLED		(1 << 1)
261 
262 struct spdk_bdev_channel {
263 	struct spdk_bdev	*bdev;
264 
265 	/* The channel for the underlying device */
266 	struct spdk_io_channel	*channel;
267 
268 	/* Accel channel */
269 	struct spdk_io_channel	*accel_channel;
270 
271 	/* Per io_device per thread data */
272 	struct spdk_bdev_shared_resource *shared_resource;
273 
274 	struct spdk_bdev_io_stat *stat;
275 
276 	/*
277 	 * Count of I/O submitted to the underlying dev module through this channel
278 	 * and waiting for completion.
279 	 */
280 	uint64_t		io_outstanding;
281 
282 	/*
283 	 * List of all submitted I/Os including I/O that are generated via splitting.
284 	 */
285 	bdev_io_tailq_t		io_submitted;
286 
287 	/*
288 	 * List of spdk_bdev_io that are currently queued because they write to a locked
289 	 * LBA range.
290 	 */
291 	bdev_io_tailq_t		io_locked;
292 
293 	/* List of I/Os with accel sequence being currently executed */
294 	bdev_io_tailq_t		io_accel_exec;
295 
296 	/* List of I/Os doing memory domain pull/push */
297 	bdev_io_tailq_t		io_memory_domain;
298 
299 	uint32_t		flags;
300 
301 	struct spdk_histogram_data *histogram;
302 
303 #ifdef SPDK_CONFIG_VTUNE
304 	uint64_t		start_tsc;
305 	uint64_t		interval_tsc;
306 	__itt_string_handle	*handle;
307 	struct spdk_bdev_io_stat *prev_stat;
308 #endif
309 
310 	bdev_io_tailq_t		queued_resets;
311 
312 	lba_range_tailq_t	locked_ranges;
313 };
314 
315 struct media_event_entry {
316 	struct spdk_bdev_media_event	event;
317 	TAILQ_ENTRY(media_event_entry)	tailq;
318 };
319 
320 #define MEDIA_EVENT_POOL_SIZE 64
321 
322 struct spdk_bdev_desc {
323 	struct spdk_bdev		*bdev;
324 	struct spdk_thread		*thread;
325 	struct {
326 		spdk_bdev_event_cb_t event_fn;
327 		void *ctx;
328 	}				callback;
329 	bool				closed;
330 	bool				write;
331 	bool				memory_domains_supported;
332 	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
333 	struct spdk_spinlock		spinlock;
334 	uint32_t			refs;
335 	TAILQ_HEAD(, media_event_entry)	pending_media_events;
336 	TAILQ_HEAD(, media_event_entry)	free_media_events;
337 	struct media_event_entry	*media_events_buffer;
338 	TAILQ_ENTRY(spdk_bdev_desc)	link;
339 
340 	uint64_t		timeout_in_sec;
341 	spdk_bdev_io_timeout_cb	cb_fn;
342 	void			*cb_arg;
343 	struct spdk_poller	*io_timeout_poller;
344 	struct spdk_bdev_module_claim	*claim;
345 };
346 
347 struct spdk_bdev_iostat_ctx {
348 	struct spdk_bdev_io_stat *stat;
349 	spdk_bdev_get_device_stat_cb cb;
350 	void *cb_arg;
351 };
352 
353 struct set_qos_limit_ctx {
354 	void (*cb_fn)(void *cb_arg, int status);
355 	void *cb_arg;
356 	struct spdk_bdev *bdev;
357 };
358 
359 struct spdk_bdev_channel_iter {
360 	spdk_bdev_for_each_channel_msg fn;
361 	spdk_bdev_for_each_channel_done cpl;
362 	struct spdk_io_channel_iter *i;
363 	void *ctx;
364 };
365 
366 struct spdk_bdev_io_error_stat {
367 	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
368 };
369 
370 enum bdev_io_retry_state {
371 	BDEV_IO_RETRY_STATE_INVALID,
372 	BDEV_IO_RETRY_STATE_PULL,
373 	BDEV_IO_RETRY_STATE_PULL_MD,
374 	BDEV_IO_RETRY_STATE_SUBMIT,
375 	BDEV_IO_RETRY_STATE_PUSH,
376 	BDEV_IO_RETRY_STATE_PUSH_MD,
377 };
378 
379 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
380 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
381 #define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
382 #define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
383 
384 static inline void bdev_io_complete(void *ctx);
385 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
386 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
387 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
388 
389 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
390 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
391 
392 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
393 				struct spdk_io_channel *ch, void *_ctx);
394 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
395 
396 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
397 				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
398 				     uint64_t num_blocks,
399 				     struct spdk_memory_domain *domain, void *domain_ctx,
400 				     struct spdk_accel_sequence *seq,
401 				     spdk_bdev_io_completion_cb cb, void *cb_arg);
402 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
403 				      struct iovec *iov, int iovcnt, void *md_buf,
404 				      uint64_t offset_blocks, uint64_t num_blocks,
405 				      struct spdk_memory_domain *domain, void *domain_ctx,
406 				      struct spdk_accel_sequence *seq,
407 				      spdk_bdev_io_completion_cb cb, void *cb_arg);
408 
409 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
410 			       uint64_t offset, uint64_t length,
411 			       lock_range_cb cb_fn, void *cb_arg);
412 
413 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
414 				 uint64_t offset, uint64_t length,
415 				 lock_range_cb cb_fn, void *cb_arg);
416 
417 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
418 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
419 
420 static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
421 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
422 static void claim_reset(struct spdk_bdev *bdev);
423 
424 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);
425 
426 #define bdev_get_ext_io_opt(opts, field, defval) \
427 	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
428 	 sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval))
429 
430 void
431 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
432 {
433 	if (!opts) {
434 		SPDK_ERRLOG("opts should not be NULL\n");
435 		return;
436 	}
437 
438 	if (!opts_size) {
439 		SPDK_ERRLOG("opts_size should not be zero value\n");
440 		return;
441 	}
442 
443 	opts->opts_size = opts_size;
444 
445 #define SET_FIELD(field) \
446 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
447 		opts->field = g_bdev_opts.field; \
448 	} \
449 
450 	SET_FIELD(bdev_io_pool_size);
451 	SET_FIELD(bdev_io_cache_size);
452 	SET_FIELD(bdev_auto_examine);
453 
454 	/* Do not remove this statement, you should always update this statement when you adding a new field,
455 	 * and do not forget to add the SET_FIELD statement for your added field. */
456 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
457 
458 #undef SET_FIELD
459 }
460 
461 int
462 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
463 {
464 	uint32_t min_pool_size;
465 
466 	if (!opts) {
467 		SPDK_ERRLOG("opts cannot be NULL\n");
468 		return -1;
469 	}
470 
471 	if (!opts->opts_size) {
472 		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
473 		return -1;
474 	}
475 
476 	/*
477 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
478 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
479 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
480 	 */
481 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
482 	if (opts->bdev_io_pool_size < min_pool_size) {
483 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
484 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
485 			    spdk_thread_get_count());
486 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
487 		return -1;
488 	}
489 
490 #define SET_FIELD(field) \
491         if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
492                 g_bdev_opts.field = opts->field; \
493         } \
494 
495 	SET_FIELD(bdev_io_pool_size);
496 	SET_FIELD(bdev_io_cache_size);
497 	SET_FIELD(bdev_auto_examine);
498 
499 	g_bdev_opts.opts_size = opts->opts_size;
500 
501 #undef SET_FIELD
502 
503 	return 0;
504 }
505 
506 static struct spdk_bdev *
507 bdev_get_by_name(const char *bdev_name)
508 {
509 	struct spdk_bdev_name find;
510 	struct spdk_bdev_name *res;
511 
512 	find.name = (char *)bdev_name;
513 	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
514 	if (res != NULL) {
515 		return res->bdev;
516 	}
517 
518 	return NULL;
519 }
520 
521 struct spdk_bdev *
522 spdk_bdev_get_by_name(const char *bdev_name)
523 {
524 	struct spdk_bdev *bdev;
525 
526 	spdk_spin_lock(&g_bdev_mgr.spinlock);
527 	bdev = bdev_get_by_name(bdev_name);
528 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
529 
530 	return bdev;
531 }
532 
533 struct bdev_io_status_string {
534 	enum spdk_bdev_io_status status;
535 	const char *str;
536 };
537 
538 static const struct bdev_io_status_string bdev_io_status_strings[] = {
539 	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
540 	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
541 	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
542 	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
543 	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
544 	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
545 	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
546 	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
547 	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
548 	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
549 };
550 
551 static const char *
552 bdev_io_status_get_string(enum spdk_bdev_io_status status)
553 {
554 	uint32_t i;
555 
556 	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
557 		if (bdev_io_status_strings[i].status == status) {
558 			return bdev_io_status_strings[i].str;
559 		}
560 	}
561 
562 	return "reserved";
563 }
564 
565 struct spdk_bdev_wait_for_examine_ctx {
566 	struct spdk_poller              *poller;
567 	spdk_bdev_wait_for_examine_cb	cb_fn;
568 	void				*cb_arg;
569 };
570 
571 static bool bdev_module_all_actions_completed(void);
572 
573 static int
574 bdev_wait_for_examine_cb(void *arg)
575 {
576 	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
577 
578 	if (!bdev_module_all_actions_completed()) {
579 		return SPDK_POLLER_IDLE;
580 	}
581 
582 	spdk_poller_unregister(&ctx->poller);
583 	ctx->cb_fn(ctx->cb_arg);
584 	free(ctx);
585 
586 	return SPDK_POLLER_BUSY;
587 }
588 
589 int
590 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
591 {
592 	struct spdk_bdev_wait_for_examine_ctx *ctx;
593 
594 	ctx = calloc(1, sizeof(*ctx));
595 	if (ctx == NULL) {
596 		return -ENOMEM;
597 	}
598 	ctx->cb_fn = cb_fn;
599 	ctx->cb_arg = cb_arg;
600 	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
601 
602 	return 0;
603 }
604 
605 struct spdk_bdev_examine_item {
606 	char *name;
607 	TAILQ_ENTRY(spdk_bdev_examine_item) link;
608 };
609 
610 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
611 
612 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
613 			g_bdev_examine_allowlist);
614 
615 static inline bool
616 bdev_examine_allowlist_check(const char *name)
617 {
618 	struct spdk_bdev_examine_item *item;
619 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
620 		if (strcmp(name, item->name) == 0) {
621 			return true;
622 		}
623 	}
624 	return false;
625 }
626 
627 static inline void
628 bdev_examine_allowlist_free(void)
629 {
630 	struct spdk_bdev_examine_item *item;
631 	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
632 		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
633 		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
634 		free(item->name);
635 		free(item);
636 	}
637 }
638 
639 static inline bool
640 bdev_in_examine_allowlist(struct spdk_bdev *bdev)
641 {
642 	struct spdk_bdev_alias *tmp;
643 	if (bdev_examine_allowlist_check(bdev->name)) {
644 		return true;
645 	}
646 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
647 		if (bdev_examine_allowlist_check(tmp->alias.name)) {
648 			return true;
649 		}
650 	}
651 	return false;
652 }
653 
654 static inline bool
655 bdev_ok_to_examine(struct spdk_bdev *bdev)
656 {
657 	if (g_bdev_opts.bdev_auto_examine) {
658 		return true;
659 	} else {
660 		return bdev_in_examine_allowlist(bdev);
661 	}
662 }
663 
664 static void
665 bdev_examine(struct spdk_bdev *bdev)
666 {
667 	struct spdk_bdev_module *module;
668 	struct spdk_bdev_module_claim *claim, *tmpclaim;
669 	uint32_t action;
670 
671 	if (!bdev_ok_to_examine(bdev)) {
672 		return;
673 	}
674 
675 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
676 		if (module->examine_config) {
677 			spdk_spin_lock(&module->internal.spinlock);
678 			action = module->internal.action_in_progress;
679 			module->internal.action_in_progress++;
680 			spdk_spin_unlock(&module->internal.spinlock);
681 			module->examine_config(bdev);
682 			if (action != module->internal.action_in_progress) {
683 				SPDK_ERRLOG("examine_config for module %s did not call "
684 					    "spdk_bdev_module_examine_done()\n", module->name);
685 			}
686 		}
687 	}
688 
689 	spdk_spin_lock(&bdev->internal.spinlock);
690 
691 	switch (bdev->internal.claim_type) {
692 	case SPDK_BDEV_CLAIM_NONE:
693 		/* Examine by all bdev modules */
694 		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
695 			if (module->examine_disk) {
696 				spdk_spin_lock(&module->internal.spinlock);
697 				module->internal.action_in_progress++;
698 				spdk_spin_unlock(&module->internal.spinlock);
699 				spdk_spin_unlock(&bdev->internal.spinlock);
700 				module->examine_disk(bdev);
701 				spdk_spin_lock(&bdev->internal.spinlock);
702 			}
703 		}
704 		break;
705 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
706 		/* Examine by the one bdev module with a v1 claim */
707 		module = bdev->internal.claim.v1.module;
708 		if (module->examine_disk) {
709 			spdk_spin_lock(&module->internal.spinlock);
710 			module->internal.action_in_progress++;
711 			spdk_spin_unlock(&module->internal.spinlock);
712 			spdk_spin_unlock(&bdev->internal.spinlock);
713 			module->examine_disk(bdev);
714 			return;
715 		}
716 		break;
717 	default:
718 		/* Examine by all bdev modules with a v2 claim */
719 		assert(claim_type_is_v2(bdev->internal.claim_type));
720 		/*
721 		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
722 		 * list, perhaps accessing freed memory. Without protection, this could happen
723 		 * while the lock is dropped during the examine callback.
724 		 */
725 		bdev->internal.examine_in_progress++;
726 
727 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
728 			module = claim->module;
729 
730 			if (module == NULL) {
731 				/* This is a vestigial claim, held by examine_count */
732 				continue;
733 			}
734 
735 			if (module->examine_disk == NULL) {
736 				continue;
737 			}
738 
739 			spdk_spin_lock(&module->internal.spinlock);
740 			module->internal.action_in_progress++;
741 			spdk_spin_unlock(&module->internal.spinlock);
742 
743 			/* Call examine_disk without holding internal.spinlock. */
744 			spdk_spin_unlock(&bdev->internal.spinlock);
745 			module->examine_disk(bdev);
746 			spdk_spin_lock(&bdev->internal.spinlock);
747 		}
748 
749 		assert(bdev->internal.examine_in_progress > 0);
750 		bdev->internal.examine_in_progress--;
751 		if (bdev->internal.examine_in_progress == 0) {
752 			/* Remove any claims that were released during examine_disk */
753 			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
754 				if (claim->desc != NULL) {
755 					continue;
756 				}
757 
758 				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
759 				free(claim);
760 			}
761 			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
762 				claim_reset(bdev);
763 			}
764 		}
765 	}
766 
767 	spdk_spin_unlock(&bdev->internal.spinlock);
768 }
769 
770 int
771 spdk_bdev_examine(const char *name)
772 {
773 	struct spdk_bdev *bdev;
774 	struct spdk_bdev_examine_item *item;
775 	struct spdk_thread *thread = spdk_get_thread();
776 
777 	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
778 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
779 			    thread ? spdk_thread_get_name(thread) : "null");
780 		return -EINVAL;
781 	}
782 
783 	if (g_bdev_opts.bdev_auto_examine) {
784 		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled");
785 		return -EINVAL;
786 	}
787 
788 	if (bdev_examine_allowlist_check(name)) {
789 		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
790 		return -EEXIST;
791 	}
792 
793 	item = calloc(1, sizeof(*item));
794 	if (!item) {
795 		return -ENOMEM;
796 	}
797 	item->name = strdup(name);
798 	if (!item->name) {
799 		free(item);
800 		return -ENOMEM;
801 	}
802 	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
803 
804 	bdev = spdk_bdev_get_by_name(name);
805 	if (bdev) {
806 		bdev_examine(bdev);
807 	}
808 	return 0;
809 }
810 
811 static inline void
812 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
813 {
814 	struct spdk_bdev_examine_item *item;
815 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
816 		spdk_json_write_object_begin(w);
817 		spdk_json_write_named_string(w, "method", "bdev_examine");
818 		spdk_json_write_named_object_begin(w, "params");
819 		spdk_json_write_named_string(w, "name", item->name);
820 		spdk_json_write_object_end(w);
821 		spdk_json_write_object_end(w);
822 	}
823 }
824 
825 struct spdk_bdev *
826 spdk_bdev_first(void)
827 {
828 	struct spdk_bdev *bdev;
829 
830 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
831 	if (bdev) {
832 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
833 	}
834 
835 	return bdev;
836 }
837 
838 struct spdk_bdev *
839 spdk_bdev_next(struct spdk_bdev *prev)
840 {
841 	struct spdk_bdev *bdev;
842 
843 	bdev = TAILQ_NEXT(prev, internal.link);
844 	if (bdev) {
845 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
846 	}
847 
848 	return bdev;
849 }
850 
851 static struct spdk_bdev *
852 _bdev_next_leaf(struct spdk_bdev *bdev)
853 {
854 	while (bdev != NULL) {
855 		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
856 			return bdev;
857 		} else {
858 			bdev = TAILQ_NEXT(bdev, internal.link);
859 		}
860 	}
861 
862 	return bdev;
863 }
864 
865 struct spdk_bdev *
866 spdk_bdev_first_leaf(void)
867 {
868 	struct spdk_bdev *bdev;
869 
870 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
871 
872 	if (bdev) {
873 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
874 	}
875 
876 	return bdev;
877 }
878 
879 struct spdk_bdev *
880 spdk_bdev_next_leaf(struct spdk_bdev *prev)
881 {
882 	struct spdk_bdev *bdev;
883 
884 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
885 
886 	if (bdev) {
887 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
888 	}
889 
890 	return bdev;
891 }
892 
893 static inline bool
894 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
895 {
896 	return bdev_io->internal.memory_domain;
897 }
898 
899 static inline bool
900 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
901 {
902 	return bdev_io->internal.accel_sequence;
903 }
904 
905 static inline void
906 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
907 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
908 {
909 	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
910 	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
911 	 * channels we will instead wait for half to complete.
912 	 */
913 	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
914 					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
915 
916 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
917 	bdev_io->internal.retry_state = state;
918 	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
919 }
920 
921 static inline void
922 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
923 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
924 {
925 	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
926 	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
927 	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));
928 
929 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
930 	bdev_io->internal.retry_state = state;
931 	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
932 }
933 
934 void
935 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
936 {
937 	struct iovec *iovs;
938 
939 	if (bdev_io->u.bdev.iovs == NULL) {
940 		bdev_io->u.bdev.iovs = &bdev_io->iov;
941 		bdev_io->u.bdev.iovcnt = 1;
942 	}
943 
944 	iovs = bdev_io->u.bdev.iovs;
945 
946 	assert(iovs != NULL);
947 	assert(bdev_io->u.bdev.iovcnt >= 1);
948 
949 	iovs[0].iov_base = buf;
950 	iovs[0].iov_len = len;
951 }
952 
953 void
954 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
955 {
956 	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
957 	bdev_io->u.bdev.md_buf = md_buf;
958 }
959 
960 static bool
961 _is_buf_allocated(const struct iovec *iovs)
962 {
963 	if (iovs == NULL) {
964 		return false;
965 	}
966 
967 	return iovs[0].iov_base != NULL;
968 }
969 
970 static bool
971 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
972 {
973 	int i;
974 	uintptr_t iov_base;
975 
976 	if (spdk_likely(alignment == 1)) {
977 		return true;
978 	}
979 
980 	for (i = 0; i < iovcnt; i++) {
981 		iov_base = (uintptr_t)iovs[i].iov_base;
982 		if ((iov_base & (alignment - 1)) != 0) {
983 			return false;
984 		}
985 	}
986 
987 	return true;
988 }
989 
990 static inline bool
991 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
992 {
993 	if (!bdev_io_use_accel_sequence(bdev_io)) {
994 		return false;
995 	}
996 
997 	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
998 	 * bdev module didn't support accel sequences */
999 	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
1000 }
1001 
1002 static inline void
1003 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
1004 			      struct spdk_bdev_shared_resource *shared_resource)
1005 {
1006 	bdev_ch->io_outstanding++;
1007 	shared_resource->io_outstanding++;
1008 }
1009 
1010 static inline void
1011 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1012 			      struct spdk_bdev_shared_resource *shared_resource)
1013 {
1014 	assert(bdev_ch->io_outstanding > 0);
1015 	assert(shared_resource->io_outstanding > 0);
1016 	bdev_ch->io_outstanding--;
1017 	shared_resource->io_outstanding--;
1018 }
1019 
1020 static void
1021 bdev_io_submit_sequence_cb(void *ctx, int status)
1022 {
1023 	struct spdk_bdev_io *bdev_io = ctx;
1024 
1025 	bdev_io->u.bdev.accel_sequence = NULL;
1026 	bdev_io->internal.accel_sequence = NULL;
1027 
1028 	if (spdk_unlikely(status != 0)) {
1029 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
1030 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1031 		bdev_io_complete_unsubmitted(bdev_io);
1032 		return;
1033 	}
1034 
1035 	bdev_io_submit(bdev_io);
1036 }
1037 
1038 static void
1039 bdev_io_exec_sequence_cb(void *ctx, int status)
1040 {
1041 	struct spdk_bdev_io *bdev_io = ctx;
1042 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1043 
1044 	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1045 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1046 
1047 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1048 		bdev_ch_retry_io(ch);
1049 	}
1050 
1051 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1052 }
1053 
1054 static void
1055 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
1056 {
1057 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1058 
1059 	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1060 	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1061 
1062 	/* Since the operations are appended during submission, they're in the opposite order than
1063 	 * how we want to execute them for reads (i.e. we need to execute the most recently added
1064 	 * operation first), so reverse the sequence before executing it.
1065 	 */
1066 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1067 		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
1068 	}
1069 
1070 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1071 	bdev_io_increment_outstanding(ch, ch->shared_resource);
1072 	bdev_io->internal.data_transfer_cpl = cb_fn;
1073 
1074 	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
1075 				   bdev_io_exec_sequence_cb, bdev_io);
1076 }
1077 
1078 static void
1079 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
1080 {
1081 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
1082 	void *buf;
1083 
1084 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1085 		buf = bdev_io->internal.buf;
1086 		bdev_io->internal.buf = NULL;
1087 		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
1088 		bdev_io->internal.get_aux_buf_cb = NULL;
1089 	} else {
1090 		assert(bdev_io->internal.get_buf_cb != NULL);
1091 		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
1092 		bdev_io->internal.get_buf_cb = NULL;
1093 	}
1094 }
1095 
1096 static void
1097 _bdev_io_pull_buffer_cpl(void *ctx, int rc)
1098 {
1099 	struct spdk_bdev_io *bdev_io = ctx;
1100 
1101 	if (rc) {
1102 		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
1103 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1104 	}
1105 	bdev_io_get_buf_complete(bdev_io, !rc);
1106 }
1107 
1108 static void
1109 bdev_io_pull_md_buf_done(void *ctx, int status)
1110 {
1111 	struct spdk_bdev_io *bdev_io = ctx;
1112 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1113 
1114 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1115 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1116 
1117 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1118 		bdev_ch_retry_io(ch);
1119 	}
1120 
1121 	assert(bdev_io->internal.data_transfer_cpl);
1122 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1123 }
1124 
1125 static void
1126 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
1127 {
1128 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1129 	int rc = 0;
1130 
1131 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1132 		if (bdev_io_use_memory_domain(bdev_io)) {
1133 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1134 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1135 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1136 							  bdev_io->internal.memory_domain_ctx,
1137 							  &bdev_io->internal.orig_md_iov, 1,
1138 							  &bdev_io->internal.bounce_md_iov, 1,
1139 							  bdev_io_pull_md_buf_done, bdev_io);
1140 			if (rc == 0) {
1141 				/* Continue to submit IO in completion callback */
1142 				return;
1143 			}
1144 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1145 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1146 			if (rc != -ENOMEM) {
1147 				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
1148 					    spdk_memory_domain_get_dma_device_id(
1149 						    bdev_io->internal.memory_domain), rc);
1150 			}
1151 		} else {
1152 			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
1153 			       bdev_io->internal.orig_md_iov.iov_base,
1154 			       bdev_io->internal.orig_md_iov.iov_len);
1155 		}
1156 	}
1157 
1158 	if (spdk_unlikely(rc == -ENOMEM)) {
1159 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
1160 	} else {
1161 		assert(bdev_io->internal.data_transfer_cpl);
1162 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1163 	}
1164 }
1165 
1166 static void
1167 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1168 {
1169 	/* save original md_buf */
1170 	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
1171 	bdev_io->internal.orig_md_iov.iov_len = len;
1172 	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
1173 	bdev_io->internal.bounce_md_iov.iov_len = len;
1174 	/* set bounce md_buf */
1175 	bdev_io->u.bdev.md_buf = md_buf;
1176 
1177 	bdev_io_pull_md_buf(bdev_io);
1178 }
1179 
1180 static void
1181 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
1182 {
1183 	struct spdk_bdev *bdev = bdev_io->bdev;
1184 	uint64_t md_len;
1185 	void *buf;
1186 
1187 	if (spdk_bdev_is_md_separate(bdev)) {
1188 		assert(!bdev_io_use_accel_sequence(bdev_io));
1189 
1190 		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
1191 		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
1192 
1193 		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
1194 
1195 		if (bdev_io->u.bdev.md_buf != NULL) {
1196 			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
1197 			return;
1198 		} else {
1199 			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
1200 		}
1201 	}
1202 
1203 	bdev_io_get_buf_complete(bdev_io, true);
1204 }
1205 
1206 static inline void
1207 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
1208 {
1209 	if (rc) {
1210 		SPDK_ERRLOG("Failed to get data buffer\n");
1211 		assert(bdev_io->internal.data_transfer_cpl);
1212 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1213 		return;
1214 	}
1215 
1216 	_bdev_io_set_md_buf(bdev_io);
1217 }
1218 
1219 static void
1220 bdev_io_pull_data_done_and_track(void *ctx, int status)
1221 {
1222 	struct spdk_bdev_io *bdev_io = ctx;
1223 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1224 
1225 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1226 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1227 
1228 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1229 		bdev_ch_retry_io(ch);
1230 	}
1231 
1232 	bdev_io_pull_data_done(bdev_io, status);
1233 }
1234 
1235 static void
1236 bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
1237 {
1238 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1239 	int rc = 0;
1240 
1241 	/* If we need to exec an accel sequence, append a copy operation making accel change the
1242 	 * src/dst buffers of the previous operation */
1243 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
1244 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1245 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1246 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1247 						    NULL, NULL,
1248 						    bdev_io->internal.orig_iovs,
1249 						    bdev_io->internal.orig_iovcnt,
1250 						    bdev_io->internal.memory_domain,
1251 						    bdev_io->internal.memory_domain_ctx,
1252 						    0, NULL, NULL);
1253 		} else {
1254 			/* We need to reverse the src/dst for reads */
1255 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1256 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1257 						    bdev_io->internal.orig_iovs,
1258 						    bdev_io->internal.orig_iovcnt,
1259 						    bdev_io->internal.memory_domain,
1260 						    bdev_io->internal.memory_domain_ctx,
1261 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1262 						    NULL, NULL, 0, NULL, NULL);
1263 		}
1264 
1265 		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
1266 			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
1267 				    bdev_io->internal.accel_sequence);
1268 		}
1269 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1270 		/* if this is write path, copy data from original buffer to bounce buffer */
1271 		if (bdev_io_use_memory_domain(bdev_io)) {
1272 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1273 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1274 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1275 							  bdev_io->internal.memory_domain_ctx,
1276 							  bdev_io->internal.orig_iovs,
1277 							  (uint32_t) bdev_io->internal.orig_iovcnt,
1278 							  bdev_io->u.bdev.iovs, 1,
1279 							  bdev_io_pull_data_done_and_track,
1280 							  bdev_io);
1281 			if (rc == 0) {
1282 				/* Continue to submit IO in completion callback */
1283 				return;
1284 			}
1285 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1286 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1287 			if (rc != -ENOMEM) {
1288 				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
1289 					    spdk_memory_domain_get_dma_device_id(
1290 						    bdev_io->internal.memory_domain));
1291 			}
1292 		} else {
1293 			assert(bdev_io->u.bdev.iovcnt == 1);
1294 			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
1295 					      bdev_io->u.bdev.iovs[0].iov_len,
1296 					      bdev_io->internal.orig_iovs,
1297 					      bdev_io->internal.orig_iovcnt);
1298 		}
1299 	}
1300 
1301 	if (spdk_unlikely(rc == -ENOMEM)) {
1302 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1303 	} else {
1304 		bdev_io_pull_data_done(bdev_io, rc);
1305 	}
1306 }
1307 
1308 static void
1309 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
1310 			      bdev_copy_bounce_buffer_cpl cpl_cb)
1311 {
1312 	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;
1313 
1314 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1315 	/* save original iovec */
1316 	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
1317 	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
1318 	/* set bounce iov */
1319 	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
1320 	bdev_io->u.bdev.iovcnt = 1;
1321 	/* set bounce buffer for this operation */
1322 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1323 	bdev_io->u.bdev.iovs[0].iov_len = len;
1324 
1325 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1326 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1327 	} else {
1328 		bdev_io_pull_data(bdev_io);
1329 	}
1330 }
1331 
1332 static void
1333 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
1334 {
1335 	struct spdk_bdev *bdev = bdev_io->bdev;
1336 	bool buf_allocated;
1337 	uint64_t alignment;
1338 	void *aligned_buf;
1339 
1340 	bdev_io->internal.buf = buf;
1341 
1342 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1343 		bdev_io_get_buf_complete(bdev_io, true);
1344 		return;
1345 	}
1346 
1347 	alignment = spdk_bdev_get_buf_align(bdev);
1348 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
1349 	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
1350 
1351 	if (buf_allocated) {
1352 		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
1353 		/* Continue in completion callback */
1354 		return;
1355 	} else {
1356 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
1357 	}
1358 
1359 	_bdev_io_set_md_buf(bdev_io);
1360 }
1361 
1362 static inline uint64_t
1363 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
1364 {
1365 	struct spdk_bdev *bdev = bdev_io->bdev;
1366 	uint64_t md_len, alignment;
1367 
1368 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1369 
1370 	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
1371 	alignment = spdk_bdev_get_buf_align(bdev) - 1;
1372 
1373 	return len + alignment + md_len;
1374 }
1375 
1376 static void
1377 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1378 {
1379 	struct spdk_bdev_mgmt_channel *ch;
1380 
1381 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1382 	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1383 }
1384 
1385 static void
1386 bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1387 {
1388 	assert(bdev_io->internal.buf != NULL);
1389 	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
1390 	bdev_io->internal.buf = NULL;
1391 }
1392 
1393 void
1394 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1395 {
1396 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1397 
1398 	assert(buf != NULL);
1399 	_bdev_io_put_buf(bdev_io, buf, len);
1400 }
1401 
1402 static inline void
1403 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
1404 		    struct spdk_bdev_io *bdev_io)
1405 {
1406 	/* After a request is submitted to a bdev module, the ownership of an accel sequence
1407 	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
1408 	 * sequence pointer to make sure we won't touch it anymore. */
1409 	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
1410 	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
1411 		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1412 		bdev_io->internal.accel_sequence = NULL;
1413 	}
1414 
1415 	bdev->fn_table->submit_request(ioch, bdev_io);
1416 }
1417 
1418 static inline void
1419 bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
1420 {
1421 	struct spdk_bdev *bdev = bdev_ch->bdev;
1422 
1423 	bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource);
1424 	bdev_io->internal.error.nvme.cdw0 = 0;
1425 	bdev_io->num_retries++;
1426 	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1427 }
1428 
1429 static void
1430 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1431 {
1432 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1433 	struct spdk_bdev_io *bdev_io;
1434 
1435 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1436 		/*
1437 		 * Allow some more I/O to complete before retrying the nomem_io queue.
1438 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
1439 		 *  the context of a completion, because the resources for the I/O are
1440 		 *  not released until control returns to the bdev poller.  Also, we
1441 		 *  may require several small I/O to complete before a larger I/O
1442 		 *  (that requires splitting) can be submitted.
1443 		 */
1444 		return;
1445 	}
1446 
1447 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1448 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1449 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1450 
1451 		switch (bdev_io->internal.retry_state) {
1452 		case BDEV_IO_RETRY_STATE_SUBMIT:
1453 			bdev_ch_resubmit_io(bdev_ch, bdev_io);
1454 			break;
1455 		case BDEV_IO_RETRY_STATE_PULL:
1456 			bdev_io_pull_data(bdev_io);
1457 			break;
1458 		case BDEV_IO_RETRY_STATE_PULL_MD:
1459 			bdev_io_pull_md_buf(bdev_io);
1460 			break;
1461 		case BDEV_IO_RETRY_STATE_PUSH:
1462 			bdev_io_push_bounce_data(bdev_io);
1463 			break;
1464 		case BDEV_IO_RETRY_STATE_PUSH_MD:
1465 			bdev_io_push_bounce_md_buf(bdev_io);
1466 			break;
1467 		default:
1468 			assert(0 && "invalid retry state");
1469 			break;
1470 		}
1471 
1472 		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
1473 			/* This IO completed again with NOMEM status, so break the loop and
1474 			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
1475 			 * always gets requeued at the front of the list, to maintain
1476 			 * ordering.
1477 			 */
1478 			break;
1479 		}
1480 	}
1481 }
1482 
1483 static inline bool
1484 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1485 {
1486 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1487 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1488 
1489 	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1490 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1491 		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);
1492 
1493 		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
1494 		 * ownership of that sequence is transferred back to the bdev layer, so we need to
1495 		 * restore internal.accel_sequence to make sure that the sequence is handled
1496 		 * correctly in case the I/O is later aborted. */
1497 		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
1498 		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
1499 			assert(bdev_io->internal.accel_sequence == NULL);
1500 			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
1501 		}
1502 
1503 		return true;
1504 	}
1505 
1506 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1507 		bdev_ch_retry_io(bdev_ch);
1508 	}
1509 
1510 	return false;
1511 }
1512 
1513 static void
1514 _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1515 {
1516 	struct spdk_bdev_io *bdev_io = ctx;
1517 
1518 	if (rc) {
1519 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1520 	}
1521 	/* We want to free the bounce buffer here since we know we're done with it (as opposed
1522 	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
1523 	 */
1524 	bdev_io_put_buf(bdev_io);
1525 
1526 	/* Continue with IO completion flow */
1527 	bdev_io_complete(bdev_io);
1528 }
1529 
1530 static void
1531 bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
1532 {
1533 	struct spdk_bdev_io *bdev_io = ctx;
1534 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1535 
1536 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1537 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1538 
1539 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1540 		bdev_ch_retry_io(ch);
1541 	}
1542 
1543 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1544 }
1545 
1546 static inline void
1547 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
1548 {
1549 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1550 	int rc = 0;
1551 
1552 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1553 	/* do the same for metadata buffer */
1554 	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
1555 		assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1556 
1557 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1558 			if (bdev_io_use_memory_domain(bdev_io)) {
1559 				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1560 				bdev_io_increment_outstanding(ch, ch->shared_resource);
1561 				/* If memory domain is used then we need to call async push function */
1562 				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1563 								  bdev_io->internal.memory_domain_ctx,
1564 								  &bdev_io->internal.orig_md_iov,
1565 								  (uint32_t)bdev_io->internal.orig_iovcnt,
1566 								  &bdev_io->internal.bounce_md_iov, 1,
1567 								  bdev_io_push_bounce_md_buf_done,
1568 								  bdev_io);
1569 				if (rc == 0) {
1570 					/* Continue IO completion in async callback */
1571 					return;
1572 				}
1573 				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1574 				bdev_io_decrement_outstanding(ch, ch->shared_resource);
1575 				if (rc != -ENOMEM) {
1576 					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1577 						    spdk_memory_domain_get_dma_device_id(
1578 							    bdev_io->internal.memory_domain));
1579 				}
1580 			} else {
1581 				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1582 				       bdev_io->internal.orig_md_iov.iov_len);
1583 			}
1584 		}
1585 	}
1586 
1587 	if (spdk_unlikely(rc == -ENOMEM)) {
1588 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
1589 	} else {
1590 		assert(bdev_io->internal.data_transfer_cpl);
1591 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1592 	}
1593 }
1594 
1595 static inline void
1596 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
1597 {
1598 	assert(bdev_io->internal.data_transfer_cpl);
1599 	if (rc) {
1600 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1601 		return;
1602 	}
1603 
1604 	/* set original buffer for this io */
1605 	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
1606 	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
1607 	/* disable bouncing buffer for this io */
1608 	bdev_io->internal.orig_iovcnt = 0;
1609 	bdev_io->internal.orig_iovs = NULL;
1610 
1611 	bdev_io_push_bounce_md_buf(bdev_io);
1612 }
1613 
1614 static void
1615 bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
1616 {
1617 	struct spdk_bdev_io *bdev_io = ctx;
1618 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1619 
1620 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1621 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1622 
1623 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1624 		bdev_ch_retry_io(ch);
1625 	}
1626 
1627 	bdev_io_push_bounce_data_done(bdev_io, status);
1628 }
1629 
1630 static inline void
1631 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
1632 {
1633 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1634 	int rc = 0;
1635 
1636 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1637 	/* if this is read path, copy data from bounce buffer to original buffer */
1638 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1639 		if (bdev_io_use_memory_domain(bdev_io)) {
1640 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1641 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1642 			/* If memory domain is used then we need to call async push function */
1643 			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1644 							  bdev_io->internal.memory_domain_ctx,
1645 							  bdev_io->internal.orig_iovs,
1646 							  (uint32_t)bdev_io->internal.orig_iovcnt,
1647 							  &bdev_io->internal.bounce_iov, 1,
1648 							  bdev_io_push_bounce_data_done_and_track,
1649 							  bdev_io);
1650 			if (rc == 0) {
1651 				/* Continue IO completion in async callback */
1652 				return;
1653 			}
1654 
1655 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1656 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1657 			if (rc != -ENOMEM) {
1658 				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1659 					    spdk_memory_domain_get_dma_device_id(
1660 						    bdev_io->internal.memory_domain));
1661 			}
1662 		} else {
1663 			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
1664 					      bdev_io->internal.orig_iovcnt,
1665 					      bdev_io->internal.bounce_iov.iov_base,
1666 					      bdev_io->internal.bounce_iov.iov_len);
1667 		}
1668 	}
1669 
1670 	if (spdk_unlikely(rc == -ENOMEM)) {
1671 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
1672 	} else {
1673 		bdev_io_push_bounce_data_done(bdev_io, rc);
1674 	}
1675 }
1676 
1677 static inline void
1678 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1679 {
1680 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1681 	bdev_io_push_bounce_data(bdev_io);
1682 }
1683 
1684 static void
1685 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1686 {
1687 	struct spdk_bdev_io *bdev_io;
1688 
1689 	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1690 	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
1691 }
1692 
1693 static void
1694 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1695 {
1696 	struct spdk_bdev_mgmt_channel *mgmt_ch;
1697 	uint64_t max_len;
1698 	void *buf;
1699 
1700 	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1701 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1702 	max_len = bdev_io_get_max_buf_len(bdev_io, len);
1703 
1704 	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
1705 		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1706 		bdev_io_get_buf_complete(bdev_io, false);
1707 		return;
1708 	}
1709 
1710 	bdev_io->internal.buf_len = len;
1711 	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1712 			     bdev_io_get_iobuf_cb);
1713 	if (buf != NULL) {
1714 		_bdev_io_set_buf(bdev_io, buf, len);
1715 	}
1716 }
1717 
1718 void
1719 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1720 {
1721 	struct spdk_bdev *bdev = bdev_io->bdev;
1722 	uint64_t alignment;
1723 
1724 	assert(cb != NULL);
1725 	bdev_io->internal.get_buf_cb = cb;
1726 
1727 	alignment = spdk_bdev_get_buf_align(bdev);
1728 
1729 	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1730 	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1731 		/* Buffer already present and aligned */
1732 		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1733 		return;
1734 	}
1735 
1736 	bdev_io_get_buf(bdev_io, len);
1737 }
1738 
1739 static void
1740 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1741 			      bool success)
1742 {
1743 	if (!success) {
1744 		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
1745 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1746 		bdev_io_complete_unsubmitted(bdev_io);
1747 		return;
1748 	}
1749 
1750 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
1751 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1752 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
1753 			return;
1754 		}
1755 		/* For reads we'll execute the sequence after the data is read, so, for now, only
1756 		 * clear out accel_sequence pointer and submit the IO */
1757 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1758 		bdev_io->u.bdev.accel_sequence = NULL;
1759 	}
1760 
1761 	bdev_io_submit(bdev_io);
1762 }
1763 
1764 static void
1765 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1766 			       uint64_t len)
1767 {
1768 	assert(cb != NULL);
1769 	bdev_io->internal.get_buf_cb = cb;
1770 
1771 	bdev_io_get_buf(bdev_io, len);
1772 }
1773 
1774 void
1775 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
1776 {
1777 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1778 
1779 	assert(cb != NULL);
1780 	assert(bdev_io->internal.get_aux_buf_cb == NULL);
1781 	bdev_io->internal.get_aux_buf_cb = cb;
1782 	bdev_io_get_buf(bdev_io, len);
1783 }
1784 
1785 static int
1786 bdev_module_get_max_ctx_size(void)
1787 {
1788 	struct spdk_bdev_module *bdev_module;
1789 	int max_bdev_module_size = 0;
1790 
1791 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1792 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
1793 			max_bdev_module_size = bdev_module->get_ctx_size();
1794 		}
1795 	}
1796 
1797 	return max_bdev_module_size;
1798 }
1799 
1800 static void
1801 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1802 {
1803 	int i;
1804 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1805 	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
1806 
1807 	if (!qos) {
1808 		return;
1809 	}
1810 
1811 	spdk_bdev_get_qos_rate_limits(bdev, limits);
1812 
1813 	spdk_json_write_object_begin(w);
1814 	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
1815 
1816 	spdk_json_write_named_object_begin(w, "params");
1817 	spdk_json_write_named_string(w, "name", bdev->name);
1818 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
1819 		if (limits[i] > 0) {
1820 			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
1821 		}
1822 	}
1823 	spdk_json_write_object_end(w);
1824 
1825 	spdk_json_write_object_end(w);
1826 }
1827 
1828 void
1829 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
1830 {
1831 	struct spdk_bdev_module *bdev_module;
1832 	struct spdk_bdev *bdev;
1833 
1834 	assert(w != NULL);
1835 
1836 	spdk_json_write_array_begin(w);
1837 
1838 	spdk_json_write_object_begin(w);
1839 	spdk_json_write_named_string(w, "method", "bdev_set_options");
1840 	spdk_json_write_named_object_begin(w, "params");
1841 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
1842 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
1843 	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
1844 	spdk_json_write_object_end(w);
1845 	spdk_json_write_object_end(w);
1846 
1847 	bdev_examine_allowlist_config_json(w);
1848 
1849 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1850 		if (bdev_module->config_json) {
1851 			bdev_module->config_json(w);
1852 		}
1853 	}
1854 
1855 	spdk_spin_lock(&g_bdev_mgr.spinlock);
1856 
1857 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
1858 		if (bdev->fn_table->write_config_json) {
1859 			bdev->fn_table->write_config_json(bdev, w);
1860 		}
1861 
1862 		bdev_qos_config_json(bdev, w);
1863 	}
1864 
1865 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
1866 
1867 	/* This has to be last RPC in array to make sure all bdevs finished examine */
1868 	spdk_json_write_object_begin(w);
1869 	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
1870 	spdk_json_write_object_end(w);
1871 
1872 	spdk_json_write_array_end(w);
1873 }
1874 
1875 static void
1876 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
1877 {
1878 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1879 	struct spdk_bdev_io *bdev_io;
1880 
1881 	spdk_iobuf_channel_fini(&ch->iobuf);
1882 
1883 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
1884 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
1885 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
1886 		ch->per_thread_cache_count--;
1887 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1888 	}
1889 
1890 	assert(ch->per_thread_cache_count == 0);
1891 }
1892 
1893 static int
1894 bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
1895 {
1896 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1897 	struct spdk_bdev_io *bdev_io;
1898 	uint32_t i;
1899 	int rc;
1900 
1901 	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
1902 	if (rc != 0) {
1903 		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
1904 		return -1;
1905 	}
1906 
1907 	STAILQ_INIT(&ch->per_thread_cache);
1908 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
1909 
1910 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
1911 	ch->per_thread_cache_count = 0;
1912 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
1913 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
1914 		if (bdev_io == NULL) {
1915 			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
1916 			assert(false);
1917 			bdev_mgmt_channel_destroy(io_device, ctx_buf);
1918 			return -1;
1919 		}
1920 		ch->per_thread_cache_count++;
1921 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
1922 	}
1923 
1924 	TAILQ_INIT(&ch->shared_resources);
1925 	TAILQ_INIT(&ch->io_wait_queue);
1926 
1927 	return 0;
1928 }
1929 
1930 static void
1931 bdev_init_complete(int rc)
1932 {
1933 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
1934 	void *cb_arg = g_init_cb_arg;
1935 	struct spdk_bdev_module *m;
1936 
1937 	g_bdev_mgr.init_complete = true;
1938 	g_init_cb_fn = NULL;
1939 	g_init_cb_arg = NULL;
1940 
1941 	/*
1942 	 * For modules that need to know when subsystem init is complete,
1943 	 * inform them now.
1944 	 */
1945 	if (rc == 0) {
1946 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1947 			if (m->init_complete) {
1948 				m->init_complete();
1949 			}
1950 		}
1951 	}
1952 
1953 	cb_fn(cb_arg, rc);
1954 }
1955 
1956 static bool
1957 bdev_module_all_actions_completed(void)
1958 {
1959 	struct spdk_bdev_module *m;
1960 
1961 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1962 		if (m->internal.action_in_progress > 0) {
1963 			return false;
1964 		}
1965 	}
1966 	return true;
1967 }
1968 
1969 static void
1970 bdev_module_action_complete(void)
1971 {
1972 	/*
1973 	 * Don't finish bdev subsystem initialization if
1974 	 * module pre-initialization is still in progress, or
1975 	 * the subsystem been already initialized.
1976 	 */
1977 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
1978 		return;
1979 	}
1980 
1981 	/*
1982 	 * Check all bdev modules for inits/examinations in progress. If any
1983 	 * exist, return immediately since we cannot finish bdev subsystem
1984 	 * initialization until all are completed.
1985 	 */
1986 	if (!bdev_module_all_actions_completed()) {
1987 		return;
1988 	}
1989 
1990 	/*
1991 	 * Modules already finished initialization - now that all
1992 	 * the bdev modules have finished their asynchronous I/O
1993 	 * processing, the entire bdev layer can be marked as complete.
1994 	 */
1995 	bdev_init_complete(0);
1996 }
1997 
1998 static void
1999 bdev_module_action_done(struct spdk_bdev_module *module)
2000 {
2001 	spdk_spin_lock(&module->internal.spinlock);
2002 	assert(module->internal.action_in_progress > 0);
2003 	module->internal.action_in_progress--;
2004 	spdk_spin_unlock(&module->internal.spinlock);
2005 	bdev_module_action_complete();
2006 }
2007 
2008 void
2009 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
2010 {
2011 	assert(module->async_init);
2012 	bdev_module_action_done(module);
2013 }
2014 
2015 void
2016 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
2017 {
2018 	bdev_module_action_done(module);
2019 }
2020 
2021 /** The last initialized bdev module */
2022 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
2023 
2024 static void
2025 bdev_init_failed(void *cb_arg)
2026 {
2027 	struct spdk_bdev_module *module = cb_arg;
2028 
2029 	spdk_spin_lock(&module->internal.spinlock);
2030 	assert(module->internal.action_in_progress > 0);
2031 	module->internal.action_in_progress--;
2032 	spdk_spin_unlock(&module->internal.spinlock);
2033 	bdev_init_complete(-1);
2034 }
2035 
2036 static int
2037 bdev_modules_init(void)
2038 {
2039 	struct spdk_bdev_module *module;
2040 	int rc = 0;
2041 
2042 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2043 		g_resume_bdev_module = module;
2044 		if (module->async_init) {
2045 			spdk_spin_lock(&module->internal.spinlock);
2046 			module->internal.action_in_progress = 1;
2047 			spdk_spin_unlock(&module->internal.spinlock);
2048 		}
2049 		rc = module->module_init();
2050 		if (rc != 0) {
2051 			/* Bump action_in_progress to prevent other modules from completion of modules_init
2052 			 * Send message to defer application shutdown until resources are cleaned up */
2053 			spdk_spin_lock(&module->internal.spinlock);
2054 			module->internal.action_in_progress = 1;
2055 			spdk_spin_unlock(&module->internal.spinlock);
2056 			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
2057 			return rc;
2058 		}
2059 	}
2060 
2061 	g_resume_bdev_module = NULL;
2062 	return 0;
2063 }
2064 
2065 void
2066 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
2067 {
2068 	int rc = 0;
2069 	char mempool_name[32];
2070 
2071 	assert(cb_fn != NULL);
2072 
2073 	g_init_cb_fn = cb_fn;
2074 	g_init_cb_arg = cb_arg;
2075 
2076 	spdk_notify_type_register("bdev_register");
2077 	spdk_notify_type_register("bdev_unregister");
2078 
2079 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
2080 
2081 	rc = spdk_iobuf_register_module("bdev");
2082 	if (rc != 0) {
2083 		SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
2084 		bdev_init_complete(-1);
2085 		return;
2086 	}
2087 
2088 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
2089 				  g_bdev_opts.bdev_io_pool_size,
2090 				  sizeof(struct spdk_bdev_io) +
2091 				  bdev_module_get_max_ctx_size(),
2092 				  0,
2093 				  SPDK_ENV_SOCKET_ID_ANY);
2094 
2095 	if (g_bdev_mgr.bdev_io_pool == NULL) {
2096 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
2097 		bdev_init_complete(-1);
2098 		return;
2099 	}
2100 
2101 	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
2102 					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
2103 	if (!g_bdev_mgr.zero_buffer) {
2104 		SPDK_ERRLOG("create bdev zero buffer failed\n");
2105 		bdev_init_complete(-1);
2106 		return;
2107 	}
2108 
2109 #ifdef SPDK_CONFIG_VTUNE
2110 	SPDK_LOG_DEPRECATED(vtune_support);
2111 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
2112 #endif
2113 
2114 	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
2115 				bdev_mgmt_channel_destroy,
2116 				sizeof(struct spdk_bdev_mgmt_channel),
2117 				"bdev_mgr");
2118 
2119 	rc = bdev_modules_init();
2120 	g_bdev_mgr.module_init_complete = true;
2121 	if (rc != 0) {
2122 		SPDK_ERRLOG("bdev modules init failed\n");
2123 		return;
2124 	}
2125 
2126 	bdev_module_action_complete();
2127 }
2128 
2129 static void
2130 bdev_mgr_unregister_cb(void *io_device)
2131 {
2132 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
2133 
2134 	if (g_bdev_mgr.bdev_io_pool) {
2135 		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
2136 			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
2137 				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
2138 				    g_bdev_opts.bdev_io_pool_size);
2139 		}
2140 
2141 		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
2142 	}
2143 
2144 	spdk_free(g_bdev_mgr.zero_buffer);
2145 
2146 	bdev_examine_allowlist_free();
2147 
2148 	cb_fn(g_fini_cb_arg);
2149 	g_fini_cb_fn = NULL;
2150 	g_fini_cb_arg = NULL;
2151 	g_bdev_mgr.init_complete = false;
2152 	g_bdev_mgr.module_init_complete = false;
2153 }
2154 
2155 static void
2156 bdev_module_fini_iter(void *arg)
2157 {
2158 	struct spdk_bdev_module *bdev_module;
2159 
2160 	/* FIXME: Handling initialization failures is broken now,
2161 	 * so we won't even try cleaning up after successfully
2162 	 * initialized modules. if module_init_complete is false,
2163 	 * just call spdk_bdev_mgr_unregister_cb
2164 	 */
2165 	if (!g_bdev_mgr.module_init_complete) {
2166 		bdev_mgr_unregister_cb(NULL);
2167 		return;
2168 	}
2169 
2170 	/* Start iterating from the last touched module */
2171 	if (!g_resume_bdev_module) {
2172 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2173 	} else {
2174 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
2175 					 internal.tailq);
2176 	}
2177 
2178 	while (bdev_module) {
2179 		if (bdev_module->async_fini) {
2180 			/* Save our place so we can resume later. We must
2181 			 * save the variable here, before calling module_fini()
2182 			 * below, because in some cases the module may immediately
2183 			 * call spdk_bdev_module_fini_done() and re-enter
2184 			 * this function to continue iterating. */
2185 			g_resume_bdev_module = bdev_module;
2186 		}
2187 
2188 		if (bdev_module->module_fini) {
2189 			bdev_module->module_fini();
2190 		}
2191 
2192 		if (bdev_module->async_fini) {
2193 			return;
2194 		}
2195 
2196 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
2197 					 internal.tailq);
2198 	}
2199 
2200 	g_resume_bdev_module = NULL;
2201 	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
2202 }
2203 
2204 void
2205 spdk_bdev_module_fini_done(void)
2206 {
2207 	if (spdk_get_thread() != g_fini_thread) {
2208 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
2209 	} else {
2210 		bdev_module_fini_iter(NULL);
2211 	}
2212 }
2213 
2214 static void
2215 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
2216 {
2217 	struct spdk_bdev *bdev = cb_arg;
2218 
2219 	if (bdeverrno && bdev) {
2220 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
2221 			     bdev->name);
2222 
2223 		/*
2224 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
2225 		 *  bdev; try to continue by manually removing this bdev from the list and continue
2226 		 *  with the next bdev in the list.
2227 		 */
2228 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2229 	}
2230 
2231 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2232 		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2233 		/*
2234 		 * Bdev module finish need to be deferred as we might be in the middle of some context
2235 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2236 		 * after returning.
2237 		 */
2238 		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2239 		return;
2240 	}
2241 
2242 	/*
2243 	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
2244 	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2245 	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
2246 	 * base bdevs.
2247 	 *
2248 	 * Also, walk the list in the reverse order.
2249 	 */
2250 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2251 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2252 		spdk_spin_lock(&bdev->internal.spinlock);
2253 		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2254 			LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2255 			spdk_spin_unlock(&bdev->internal.spinlock);
2256 			continue;
2257 		}
2258 		spdk_spin_unlock(&bdev->internal.spinlock);
2259 
2260 		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2261 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2262 		return;
2263 	}
2264 
2265 	/*
2266 	 * If any bdev fails to unclaim underlying bdev properly, we may face the
2267 	 * case of bdev list consisting of claimed bdevs only (if claims are managed
2268 	 * correctly, this would mean there's a loop in the claims graph which is
2269 	 * clearly impossible). Warn and unregister last bdev on the list then.
2270 	 */
2271 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2272 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2273 		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2274 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2275 		return;
2276 	}
2277 }
2278 
2279 static void
2280 bdev_module_fini_start_iter(void *arg)
2281 {
2282 	struct spdk_bdev_module *bdev_module;
2283 
2284 	if (!g_resume_bdev_module) {
2285 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2286 	} else {
2287 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2288 	}
2289 
2290 	while (bdev_module) {
2291 		if (bdev_module->async_fini_start) {
2292 			/* Save our place so we can resume later. We must
2293 			 * save the variable here, before calling fini_start()
2294 			 * below, because in some cases the module may immediately
2295 			 * call spdk_bdev_module_fini_start_done() and re-enter
2296 			 * this function to continue iterating. */
2297 			g_resume_bdev_module = bdev_module;
2298 		}
2299 
2300 		if (bdev_module->fini_start) {
2301 			bdev_module->fini_start();
2302 		}
2303 
2304 		if (bdev_module->async_fini_start) {
2305 			return;
2306 		}
2307 
2308 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
2309 	}
2310 
2311 	g_resume_bdev_module = NULL;
2312 
2313 	bdev_finish_unregister_bdevs_iter(NULL, 0);
2314 }
2315 
2316 void
2317 spdk_bdev_module_fini_start_done(void)
2318 {
2319 	if (spdk_get_thread() != g_fini_thread) {
2320 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
2321 	} else {
2322 		bdev_module_fini_start_iter(NULL);
2323 	}
2324 }
2325 
2326 static void
2327 bdev_finish_wait_for_examine_done(void *cb_arg)
2328 {
2329 	bdev_module_fini_start_iter(NULL);
2330 }
2331 
2332 void
2333 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
2334 {
2335 	int rc;
2336 
2337 	assert(cb_fn != NULL);
2338 
2339 	g_fini_thread = spdk_get_thread();
2340 
2341 	g_fini_cb_fn = cb_fn;
2342 	g_fini_cb_arg = cb_arg;
2343 
2344 	rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
2345 	if (rc != 0) {
2346 		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
2347 		bdev_finish_wait_for_examine_done(NULL);
2348 	}
2349 }
2350 
2351 struct spdk_bdev_io *
2352 bdev_channel_get_io(struct spdk_bdev_channel *channel)
2353 {
2354 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
2355 	struct spdk_bdev_io *bdev_io;
2356 
2357 	if (ch->per_thread_cache_count > 0) {
2358 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2359 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2360 		ch->per_thread_cache_count--;
2361 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
2362 		/*
2363 		 * Don't try to look for bdev_ios in the global pool if there are
2364 		 * waiters on bdev_ios - we don't want this caller to jump the line.
2365 		 */
2366 		bdev_io = NULL;
2367 	} else {
2368 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2369 	}
2370 
2371 	return bdev_io;
2372 }
2373 
2374 void
2375 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2376 {
2377 	struct spdk_bdev_mgmt_channel *ch;
2378 
2379 	assert(bdev_io != NULL);
2380 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
2381 
2382 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
2383 
2384 	if (bdev_io->internal.buf != NULL) {
2385 		bdev_io_put_buf(bdev_io);
2386 	}
2387 
2388 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
2389 		ch->per_thread_cache_count++;
2390 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2391 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
2392 			struct spdk_bdev_io_wait_entry *entry;
2393 
2394 			entry = TAILQ_FIRST(&ch->io_wait_queue);
2395 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
2396 			entry->cb_fn(entry->cb_arg);
2397 		}
2398 	} else {
2399 		/* We should never have a full cache with entries on the io wait queue. */
2400 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
2401 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2402 	}
2403 }
2404 
2405 static bool
2406 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2407 {
2408 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2409 
2410 	switch (limit) {
2411 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2412 		return true;
2413 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2414 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2415 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2416 		return false;
2417 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2418 	default:
2419 		return false;
2420 	}
2421 }
2422 
2423 static bool
2424 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2425 {
2426 	switch (bdev_io->type) {
2427 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2428 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2429 	case SPDK_BDEV_IO_TYPE_READ:
2430 	case SPDK_BDEV_IO_TYPE_WRITE:
2431 		return true;
2432 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2433 		if (bdev_io->u.bdev.zcopy.start) {
2434 			return true;
2435 		} else {
2436 			return false;
2437 		}
2438 	default:
2439 		return false;
2440 	}
2441 }
2442 
2443 static bool
2444 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2445 {
2446 	switch (bdev_io->type) {
2447 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2448 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2449 		/* Bit 1 (0x2) set for read operation */
2450 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2451 			return true;
2452 		} else {
2453 			return false;
2454 		}
2455 	case SPDK_BDEV_IO_TYPE_READ:
2456 		return true;
2457 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2458 		/* Populate to read from disk */
2459 		if (bdev_io->u.bdev.zcopy.populate) {
2460 			return true;
2461 		} else {
2462 			return false;
2463 		}
2464 	default:
2465 		return false;
2466 	}
2467 }
2468 
2469 static uint64_t
2470 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2471 {
2472 	struct spdk_bdev	*bdev = bdev_io->bdev;
2473 
2474 	switch (bdev_io->type) {
2475 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2476 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2477 		return bdev_io->u.nvme_passthru.nbytes;
2478 	case SPDK_BDEV_IO_TYPE_READ:
2479 	case SPDK_BDEV_IO_TYPE_WRITE:
2480 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2481 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2482 		/* Track the data in the start phase only */
2483 		if (bdev_io->u.bdev.zcopy.start) {
2484 			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2485 		} else {
2486 			return 0;
2487 		}
2488 	default:
2489 		return 0;
2490 	}
2491 }
2492 
2493 static bool
2494 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2495 {
2496 	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
2497 		return true;
2498 	} else {
2499 		return false;
2500 	}
2501 }
2502 
2503 static bool
2504 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2505 {
2506 	if (bdev_is_read_io(io) == false) {
2507 		return false;
2508 	}
2509 
2510 	return bdev_qos_rw_queue_io(limit, io);
2511 }
2512 
2513 static bool
2514 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2515 {
2516 	if (bdev_is_read_io(io) == true) {
2517 		return false;
2518 	}
2519 
2520 	return bdev_qos_rw_queue_io(limit, io);
2521 }
2522 
2523 static void
2524 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2525 {
2526 	limit->remaining_this_timeslice--;
2527 }
2528 
2529 static void
2530 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2531 {
2532 	limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
2533 }
2534 
2535 static void
2536 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2537 {
2538 	if (bdev_is_read_io(io) == false) {
2539 		return;
2540 	}
2541 
2542 	return bdev_qos_rw_bps_update_quota(limit, io);
2543 }
2544 
2545 static void
2546 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2547 {
2548 	if (bdev_is_read_io(io) == true) {
2549 		return;
2550 	}
2551 
2552 	return bdev_qos_rw_bps_update_quota(limit, io);
2553 }
2554 
2555 static void
2556 bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2557 {
2558 	int i;
2559 
2560 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2561 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2562 			qos->rate_limits[i].queue_io = NULL;
2563 			qos->rate_limits[i].update_quota = NULL;
2564 			continue;
2565 		}
2566 
2567 		switch (i) {
2568 		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2569 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2570 			qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
2571 			break;
2572 		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2573 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2574 			qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
2575 			break;
2576 		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2577 			qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
2578 			qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
2579 			break;
2580 		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2581 			qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
2582 			qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
2583 			break;
2584 		default:
2585 			break;
2586 		}
2587 	}
2588 }
2589 
2590 static void
2591 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2592 			    struct spdk_bdev_io *bdev_io,
2593 			    enum spdk_bdev_io_status status)
2594 {
2595 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2596 
2597 	bdev_io->internal.in_submit_request = true;
2598 	bdev_ch->io_outstanding++;
2599 	shared_resource->io_outstanding++;
2600 	spdk_bdev_io_complete(bdev_io, status);
2601 	bdev_io->internal.in_submit_request = false;
2602 }
2603 
2604 static inline void
2605 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2606 {
2607 	struct spdk_bdev *bdev = bdev_io->bdev;
2608 	struct spdk_io_channel *ch = bdev_ch->channel;
2609 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2610 
2611 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2612 		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2613 		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2614 
2615 		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2616 		    bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2617 			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
2618 						    SPDK_BDEV_IO_STATUS_SUCCESS);
2619 			return;
2620 		}
2621 	}
2622 
2623 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2624 			  bdev_io->bdev->split_on_write_unit &&
2625 			  bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2626 		SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n",
2627 			    bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2628 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2629 		return;
2630 	}
2631 
2632 	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2633 		bdev_ch->io_outstanding++;
2634 		shared_resource->io_outstanding++;
2635 		bdev_io->internal.in_submit_request = true;
2636 		bdev_submit_request(bdev, ch, bdev_io);
2637 		bdev_io->internal.in_submit_request = false;
2638 	} else {
2639 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
2640 	}
2641 }
2642 
2643 static bool
2644 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2645 {
2646 	int i;
2647 
2648 	if (bdev_qos_io_to_limit(bdev_io) == true) {
2649 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2650 			if (!qos->rate_limits[i].queue_io) {
2651 				continue;
2652 			}
2653 
2654 			if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2655 							 bdev_io) == true) {
2656 				return true;
2657 			}
2658 		}
2659 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2660 			if (!qos->rate_limits[i].update_quota) {
2661 				continue;
2662 			}
2663 
2664 			qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
2665 		}
2666 	}
2667 
2668 	return false;
2669 }
2670 
2671 static inline void
2672 _bdev_io_do_submit(void *ctx)
2673 {
2674 	struct spdk_bdev_io *bdev_io = ctx;
2675 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2676 
2677 	bdev_io_do_submit(ch, bdev_io);
2678 }
2679 
2680 static int
2681 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
2682 {
2683 	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
2684 	int				submitted_ios = 0;
2685 
2686 	TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
2687 		if (!bdev_qos_queue_io(qos, bdev_io)) {
2688 			TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
2689 
2690 			if (bdev_io->internal.io_submit_ch) {
2691 				/* Send back the IO to the original thread for the actual processing. */
2692 				bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
2693 				bdev_io->internal.io_submit_ch = NULL;
2694 				spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
2695 						     _bdev_io_do_submit, bdev_io);
2696 			} else {
2697 				bdev_io_do_submit(ch, bdev_io);
2698 			}
2699 
2700 			submitted_ios++;
2701 		}
2702 	}
2703 
2704 	return submitted_ios;
2705 }
2706 
2707 static void
2708 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
2709 {
2710 	int rc;
2711 
2712 	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
2713 	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
2714 	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
2715 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
2716 				     &bdev_io->internal.waitq_entry);
2717 	if (rc != 0) {
2718 		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
2719 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2720 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2721 	}
2722 }
2723 
2724 static bool
2725 bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
2726 {
2727 	uint32_t io_boundary;
2728 	struct spdk_bdev *bdev = bdev_io->bdev;
2729 	uint32_t max_size = bdev->max_segment_size;
2730 	int max_segs = bdev->max_num_segments;
2731 
2732 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
2733 		io_boundary = bdev->write_unit_size;
2734 	} else if (bdev->split_on_optimal_io_boundary) {
2735 		io_boundary = bdev->optimal_io_boundary;
2736 	} else {
2737 		io_boundary = 0;
2738 	}
2739 
2740 	if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
2741 		return false;
2742 	}
2743 
2744 	if (io_boundary) {
2745 		uint64_t start_stripe, end_stripe;
2746 
2747 		start_stripe = bdev_io->u.bdev.offset_blocks;
2748 		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
2749 		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
2750 		if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
2751 			start_stripe >>= spdk_u32log2(io_boundary);
2752 			end_stripe >>= spdk_u32log2(io_boundary);
2753 		} else {
2754 			start_stripe /= io_boundary;
2755 			end_stripe /= io_boundary;
2756 		}
2757 
2758 		if (start_stripe != end_stripe) {
2759 			return true;
2760 		}
2761 	}
2762 
2763 	if (max_segs) {
2764 		if (bdev_io->u.bdev.iovcnt > max_segs) {
2765 			return true;
2766 		}
2767 	}
2768 
2769 	if (max_size) {
2770 		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
2771 			if (bdev_io->u.bdev.iovs[i].iov_len > max_size) {
2772 				return true;
2773 			}
2774 		}
2775 	}
2776 
2777 	return false;
2778 }
2779 
2780 static bool
2781 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
2782 {
2783 	uint32_t num_unmap_segments;
2784 
2785 	if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
2786 		return false;
2787 	}
2788 	num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
2789 	if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
2790 		return true;
2791 	}
2792 
2793 	return false;
2794 }
2795 
2796 static bool
2797 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
2798 {
2799 	if (!bdev_io->bdev->max_write_zeroes) {
2800 		return false;
2801 	}
2802 
2803 	if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
2804 		return true;
2805 	}
2806 
2807 	return false;
2808 }
2809 
2810 static bool
2811 bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
2812 {
2813 	if (bdev_io->bdev->max_copy != 0 &&
2814 	    bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
2815 		return true;
2816 	}
2817 
2818 	return false;
2819 }
2820 
2821 static bool
2822 bdev_io_should_split(struct spdk_bdev_io *bdev_io)
2823 {
2824 	switch (bdev_io->type) {
2825 	case SPDK_BDEV_IO_TYPE_READ:
2826 	case SPDK_BDEV_IO_TYPE_WRITE:
2827 		return bdev_rw_should_split(bdev_io);
2828 	case SPDK_BDEV_IO_TYPE_UNMAP:
2829 		return bdev_unmap_should_split(bdev_io);
2830 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2831 		return bdev_write_zeroes_should_split(bdev_io);
2832 	case SPDK_BDEV_IO_TYPE_COPY:
2833 		return bdev_copy_should_split(bdev_io);
2834 	default:
2835 		return false;
2836 	}
2837 }
2838 
2839 static uint32_t
2840 _to_next_boundary(uint64_t offset, uint32_t boundary)
2841 {
2842 	return (boundary - (offset % boundary));
2843 }
2844 
2845 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
2846 
2847 static void _bdev_rw_split(void *_bdev_io);
2848 
2849 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
2850 
2851 static void
2852 _bdev_unmap_split(void *_bdev_io)
2853 {
2854 	return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
2855 }
2856 
2857 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
2858 
2859 static void
2860 _bdev_write_zeroes_split(void *_bdev_io)
2861 {
2862 	return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
2863 }
2864 
2865 static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
2866 
2867 static void
2868 _bdev_copy_split(void *_bdev_io)
2869 {
2870 	return bdev_copy_split((struct spdk_bdev_io *)_bdev_io);
2871 }
2872 
2873 static int
2874 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
2875 		     uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
2876 {
2877 	int rc;
2878 	uint64_t current_offset, current_remaining, current_src_offset;
2879 	spdk_bdev_io_wait_cb io_wait_fn;
2880 
2881 	current_offset = *offset;
2882 	current_remaining = *remaining;
2883 
2884 	bdev_io->u.bdev.split_outstanding++;
2885 
2886 	io_wait_fn = _bdev_rw_split;
2887 	switch (bdev_io->type) {
2888 	case SPDK_BDEV_IO_TYPE_READ:
2889 		assert(bdev_io->u.bdev.accel_sequence == NULL);
2890 		rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
2891 					       spdk_io_channel_from_ctx(bdev_io->internal.ch),
2892 					       iov, iovcnt, md_buf, current_offset,
2893 					       num_blocks, bdev_io->internal.memory_domain,
2894 					       bdev_io->internal.memory_domain_ctx, NULL,
2895 					       bdev_io_split_done, bdev_io);
2896 		break;
2897 	case SPDK_BDEV_IO_TYPE_WRITE:
2898 		assert(bdev_io->u.bdev.accel_sequence == NULL);
2899 		rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
2900 						spdk_io_channel_from_ctx(bdev_io->internal.ch),
2901 						iov, iovcnt, md_buf, current_offset,
2902 						num_blocks, bdev_io->internal.memory_domain,
2903 						bdev_io->internal.memory_domain_ctx, NULL,
2904 						bdev_io_split_done, bdev_io);
2905 		break;
2906 	case SPDK_BDEV_IO_TYPE_UNMAP:
2907 		io_wait_fn = _bdev_unmap_split;
2908 		rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
2909 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
2910 					    current_offset, num_blocks,
2911 					    bdev_io_split_done, bdev_io);
2912 		break;
2913 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2914 		io_wait_fn = _bdev_write_zeroes_split;
2915 		rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
2916 						   spdk_io_channel_from_ctx(bdev_io->internal.ch),
2917 						   current_offset, num_blocks,
2918 						   bdev_io_split_done, bdev_io);
2919 		break;
2920 	case SPDK_BDEV_IO_TYPE_COPY:
2921 		io_wait_fn = _bdev_copy_split;
2922 		current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
2923 				     (current_offset - bdev_io->u.bdev.offset_blocks);
2924 		rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
2925 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
2926 					   current_offset, current_src_offset, num_blocks,
2927 					   bdev_io_split_done, bdev_io);
2928 		break;
2929 	default:
2930 		assert(false);
2931 		rc = -EINVAL;
2932 		break;
2933 	}
2934 
2935 	if (rc == 0) {
2936 		current_offset += num_blocks;
2937 		current_remaining -= num_blocks;
2938 		bdev_io->u.bdev.split_current_offset_blocks = current_offset;
2939 		bdev_io->u.bdev.split_remaining_num_blocks = current_remaining;
2940 		*offset = current_offset;
2941 		*remaining = current_remaining;
2942 	} else {
2943 		bdev_io->u.bdev.split_outstanding--;
2944 		if (rc == -ENOMEM) {
2945 			if (bdev_io->u.bdev.split_outstanding == 0) {
2946 				/* No I/O is outstanding. Hence we should wait here. */
2947 				bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
2948 			}
2949 		} else {
2950 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2951 			if (bdev_io->u.bdev.split_outstanding == 0) {
2952 				spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2953 				TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2954 				bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2955 			}
2956 		}
2957 	}
2958 
2959 	return rc;
2960 }
2961 
2962 static void
2963 _bdev_rw_split(void *_bdev_io)
2964 {
2965 	struct iovec *parent_iov, *iov;
2966 	struct spdk_bdev_io *bdev_io = _bdev_io;
2967 	struct spdk_bdev *bdev = bdev_io->bdev;
2968 	uint64_t parent_offset, current_offset, remaining;
2969 	uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
2970 	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
2971 	uint32_t iovcnt, iov_len, child_iovsize;
2972 	uint32_t blocklen = bdev->blocklen;
2973 	uint32_t io_boundary;
2974 	uint32_t max_segment_size = bdev->max_segment_size;
2975 	uint32_t max_child_iovcnt = bdev->max_num_segments;
2976 	void *md_buf = NULL;
2977 	int rc;
2978 
2979 	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
2980 	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
2981 			   SPDK_BDEV_IO_NUM_CHILD_IOV;
2982 
2983 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
2984 		io_boundary = bdev->write_unit_size;
2985 	} else if (bdev->split_on_optimal_io_boundary) {
2986 		io_boundary = bdev->optimal_io_boundary;
2987 	} else {
2988 		io_boundary = UINT32_MAX;
2989 	}
2990 
2991 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2992 	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
2993 	parent_offset = bdev_io->u.bdev.offset_blocks;
2994 	parent_iov_offset = (current_offset - parent_offset) * blocklen;
2995 	parent_iovcnt = bdev_io->u.bdev.iovcnt;
2996 
2997 	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
2998 		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
2999 		if (parent_iov_offset < parent_iov->iov_len) {
3000 			break;
3001 		}
3002 		parent_iov_offset -= parent_iov->iov_len;
3003 	}
3004 
3005 	child_iovcnt = 0;
3006 	while (remaining > 0 && parent_iovpos < parent_iovcnt &&
3007 	       child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
3008 		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
3009 		to_next_boundary = spdk_min(remaining, to_next_boundary);
3010 		to_next_boundary_bytes = to_next_boundary * blocklen;
3011 
3012 		iov = &bdev_io->child_iov[child_iovcnt];
3013 		iovcnt = 0;
3014 
3015 		if (bdev_io->u.bdev.md_buf) {
3016 			md_buf = (char *)bdev_io->u.bdev.md_buf +
3017 				 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
3018 		}
3019 
3020 		child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
3021 		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
3022 		       iovcnt < child_iovsize) {
3023 			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3024 			iov_len = parent_iov->iov_len - parent_iov_offset;
3025 
3026 			iov_len = spdk_min(iov_len, max_segment_size);
3027 			iov_len = spdk_min(iov_len, to_next_boundary_bytes);
3028 			to_next_boundary_bytes -= iov_len;
3029 
3030 			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3031 			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3032 
3033 			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3034 				parent_iov_offset += iov_len;
3035 			} else {
3036 				parent_iovpos++;
3037 				parent_iov_offset = 0;
3038 			}
3039 			child_iovcnt++;
3040 			iovcnt++;
3041 		}
3042 
3043 		if (to_next_boundary_bytes > 0) {
3044 			/* We had to stop this child I/O early because we ran out of
3045 			 * child_iov space or were limited by max_num_segments.
3046 			 * Ensure the iovs to be aligned with block size and
3047 			 * then adjust to_next_boundary before starting the
3048 			 * child I/O.
3049 			 */
3050 			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3051 			       iovcnt == child_iovsize);
3052 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
3053 			if (to_last_block_bytes != 0) {
3054 				uint32_t child_iovpos = child_iovcnt - 1;
3055 				/* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV
3056 				 * so the loop will naturally end
3057 				 */
3058 
3059 				to_last_block_bytes = blocklen - to_last_block_bytes;
3060 				to_next_boundary_bytes += to_last_block_bytes;
3061 				while (to_last_block_bytes > 0 && iovcnt > 0) {
3062 					iov_len = spdk_min(to_last_block_bytes,
3063 							   bdev_io->child_iov[child_iovpos].iov_len);
3064 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3065 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3066 						child_iovpos--;
3067 						if (--iovcnt == 0) {
3068 							/* If the child IO is less than a block size just return.
3069 							 * If the first child IO of any split round is less than
3070 							 * a block size, an error exit.
3071 							 */
3072 							if (bdev_io->u.bdev.split_outstanding == 0) {
3073 								SPDK_ERRLOG("The first child io was less than a block size\n");
3074 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3075 								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
3076 								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
3077 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3078 							}
3079 
3080 							return;
3081 						}
3082 					}
3083 
3084 					to_last_block_bytes -= iov_len;
3085 
3086 					if (parent_iov_offset == 0) {
3087 						parent_iovpos--;
3088 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3089 					}
3090 					parent_iov_offset -= iov_len;
3091 				}
3092 
3093 				assert(to_last_block_bytes == 0);
3094 			}
3095 			to_next_boundary -= to_next_boundary_bytes / blocklen;
3096 		}
3097 
3098 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3099 					  &current_offset, &remaining);
3100 		if (spdk_unlikely(rc)) {
3101 			return;
3102 		}
3103 	}
3104 }
3105 
3106 static void
3107 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3108 {
3109 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3110 	uint32_t num_children_reqs = 0;
3111 	int rc;
3112 
3113 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3114 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3115 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3116 
3117 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3118 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3119 
3120 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3121 					  &offset, &remaining);
3122 		if (spdk_likely(rc == 0)) {
3123 			num_children_reqs++;
3124 		} else {
3125 			return;
3126 		}
3127 	}
3128 }
3129 
3130 static void
3131 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3132 {
3133 	uint64_t offset, write_zeroes_blocks, remaining;
3134 	uint32_t num_children_reqs = 0;
3135 	int rc;
3136 
3137 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3138 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3139 
3140 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3141 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3142 
3143 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3144 					  &offset, &remaining);
3145 		if (spdk_likely(rc == 0)) {
3146 			num_children_reqs++;
3147 		} else {
3148 			return;
3149 		}
3150 	}
3151 }
3152 
3153 static void
3154 bdev_copy_split(struct spdk_bdev_io *bdev_io)
3155 {
3156 	uint64_t offset, copy_blocks, remaining;
3157 	uint32_t num_children_reqs = 0;
3158 	int rc;
3159 
3160 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3161 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3162 
3163 	assert(bdev_io->bdev->max_copy != 0);
3164 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
3165 		copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
3166 
3167 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
3168 					  &offset, &remaining);
3169 		if (spdk_likely(rc == 0)) {
3170 			num_children_reqs++;
3171 		} else {
3172 			return;
3173 		}
3174 	}
3175 }
3176 
3177 static void
3178 parent_bdev_io_complete(void *ctx, int rc)
3179 {
3180 	struct spdk_bdev_io *parent_io = ctx;
3181 
3182 	if (rc) {
3183 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3184 	}
3185 
3186 	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
3187 			       parent_io->internal.caller_ctx);
3188 }
3189 
3190 static void
3191 bdev_io_complete_parent_sequence_cb(void *ctx, int status)
3192 {
3193 	struct spdk_bdev_io *bdev_io = ctx;
3194 
3195 	/* u.bdev.accel_sequence should have already been cleared at this point */
3196 	assert(bdev_io->u.bdev.accel_sequence == NULL);
3197 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
3198 	bdev_io->internal.accel_sequence = NULL;
3199 
3200 	if (spdk_unlikely(status != 0)) {
3201 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
3202 	}
3203 
3204 	parent_bdev_io_complete(bdev_io, status);
3205 }
3206 
3207 static void
3208 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3209 {
3210 	struct spdk_bdev_io *parent_io = cb_arg;
3211 
3212 	spdk_bdev_free_io(bdev_io);
3213 
3214 	if (!success) {
3215 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3216 		/* If any child I/O failed, stop further splitting process. */
3217 		parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
3218 		parent_io->u.bdev.split_remaining_num_blocks = 0;
3219 	}
3220 	parent_io->u.bdev.split_outstanding--;
3221 	if (parent_io->u.bdev.split_outstanding != 0) {
3222 		return;
3223 	}
3224 
3225 	/*
3226 	 * Parent I/O finishes when all blocks are consumed.
3227 	 */
3228 	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
3229 		assert(parent_io->internal.cb != bdev_io_split_done);
3230 		spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx);
3231 		TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link);
3232 
3233 		if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
3234 			if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) {
3235 				bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb);
3236 				return;
3237 			} else if (parent_io->internal.orig_iovcnt != 0) {
3238 				/* bdev IO will be completed in the callback */
3239 				_bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
3240 				return;
3241 			}
3242 		}
3243 
3244 		parent_bdev_io_complete(parent_io, 0);
3245 		return;
3246 	}
3247 
3248 	/*
3249 	 * Continue with the splitting process.  This function will complete the parent I/O if the
3250 	 * splitting is done.
3251 	 */
3252 	switch (parent_io->type) {
3253 	case SPDK_BDEV_IO_TYPE_READ:
3254 	case SPDK_BDEV_IO_TYPE_WRITE:
3255 		_bdev_rw_split(parent_io);
3256 		break;
3257 	case SPDK_BDEV_IO_TYPE_UNMAP:
3258 		bdev_unmap_split(parent_io);
3259 		break;
3260 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3261 		bdev_write_zeroes_split(parent_io);
3262 		break;
3263 	case SPDK_BDEV_IO_TYPE_COPY:
3264 		bdev_copy_split(parent_io);
3265 		break;
3266 	default:
3267 		assert(false);
3268 		break;
3269 	}
3270 }
3271 
3272 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3273 				     bool success);
3274 
3275 static void
3276 bdev_io_split(struct spdk_bdev_io *bdev_io)
3277 {
3278 	assert(bdev_io_should_split(bdev_io));
3279 
3280 	bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
3281 	bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
3282 	bdev_io->u.bdev.split_outstanding = 0;
3283 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
3284 
3285 	switch (bdev_io->type) {
3286 	case SPDK_BDEV_IO_TYPE_READ:
3287 	case SPDK_BDEV_IO_TYPE_WRITE:
3288 		if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
3289 			_bdev_rw_split(bdev_io);
3290 		} else {
3291 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3292 			spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
3293 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3294 		}
3295 		break;
3296 	case SPDK_BDEV_IO_TYPE_UNMAP:
3297 		bdev_unmap_split(bdev_io);
3298 		break;
3299 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3300 		bdev_write_zeroes_split(bdev_io);
3301 		break;
3302 	case SPDK_BDEV_IO_TYPE_COPY:
3303 		bdev_copy_split(bdev_io);
3304 		break;
3305 	default:
3306 		assert(false);
3307 		break;
3308 	}
3309 }
3310 
3311 static void
3312 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
3313 {
3314 	if (!success) {
3315 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3316 		return;
3317 	}
3318 
3319 	_bdev_rw_split(bdev_io);
3320 }
3321 
3322 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't
3323  *  be inlined, at least on some compilers.
3324  */
3325 static inline void
3326 _bdev_io_submit(void *ctx)
3327 {
3328 	struct spdk_bdev_io *bdev_io = ctx;
3329 	struct spdk_bdev *bdev = bdev_io->bdev;
3330 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3331 
3332 	if (spdk_likely(bdev_ch->flags == 0)) {
3333 		bdev_io_do_submit(bdev_ch, bdev_io);
3334 		return;
3335 	}
3336 
3337 	if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
3338 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3339 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
3340 		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
3341 		    bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) {
3342 			_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3343 		} else {
3344 			TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
3345 			bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3346 		}
3347 	} else {
3348 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
3349 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3350 	}
3351 }
3352 
3353 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
3354 
3355 bool
3356 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
3357 {
3358 	if (range1->length == 0 || range2->length == 0) {
3359 		return false;
3360 	}
3361 
3362 	if (range1->offset + range1->length <= range2->offset) {
3363 		return false;
3364 	}
3365 
3366 	if (range2->offset + range2->length <= range1->offset) {
3367 		return false;
3368 	}
3369 
3370 	return true;
3371 }
3372 
3373 static bool
3374 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
3375 {
3376 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3377 	struct lba_range r;
3378 
3379 	switch (bdev_io->type) {
3380 	case SPDK_BDEV_IO_TYPE_NVME_IO:
3381 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3382 		/* Don't try to decode the NVMe command - just assume worst-case and that
3383 		 * it overlaps a locked range.
3384 		 */
3385 		return true;
3386 	case SPDK_BDEV_IO_TYPE_WRITE:
3387 	case SPDK_BDEV_IO_TYPE_UNMAP:
3388 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3389 	case SPDK_BDEV_IO_TYPE_ZCOPY:
3390 	case SPDK_BDEV_IO_TYPE_COPY:
3391 		r.offset = bdev_io->u.bdev.offset_blocks;
3392 		r.length = bdev_io->u.bdev.num_blocks;
3393 		if (!bdev_lba_range_overlapped(range, &r)) {
3394 			/* This I/O doesn't overlap the specified LBA range. */
3395 			return false;
3396 		} else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
3397 			/* This I/O overlaps, but the I/O is on the same channel that locked this
3398 			 * range, and the caller_ctx is the same as the locked_ctx.  This means
3399 			 * that this I/O is associated with the lock, and is allowed to execute.
3400 			 */
3401 			return false;
3402 		} else {
3403 			return true;
3404 		}
3405 	default:
3406 		return false;
3407 	}
3408 }
3409 
3410 void
3411 bdev_io_submit(struct spdk_bdev_io *bdev_io)
3412 {
3413 	struct spdk_bdev *bdev = bdev_io->bdev;
3414 	struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io);
3415 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3416 
3417 	assert(thread != NULL);
3418 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3419 
3420 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3421 		struct lba_range *range;
3422 
3423 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3424 			if (bdev_io_range_is_locked(bdev_io, range)) {
3425 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3426 				return;
3427 			}
3428 		}
3429 	}
3430 
3431 	TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link);
3432 
3433 	bdev_io->internal.submit_tsc = spdk_get_ticks();
3434 	spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0,
3435 			      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3436 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
3437 			      spdk_bdev_get_name(bdev));
3438 
3439 	if (bdev_io->internal.split) {
3440 		bdev_io_split(bdev_io);
3441 		return;
3442 	}
3443 
3444 	if (ch->flags & BDEV_CH_QOS_ENABLED) {
3445 		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
3446 			_bdev_io_submit(bdev_io);
3447 		} else {
3448 			bdev_io->internal.io_submit_ch = ch;
3449 			bdev_io->internal.ch = bdev->internal.qos->ch;
3450 			spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io);
3451 		}
3452 	} else {
3453 		_bdev_io_submit(bdev_io);
3454 	}
3455 }
3456 
3457 static inline void
3458 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3459 {
3460 	/* bdev doesn't support memory domains, thereby buffers in this IO request can't
3461 	 * be accessed directly. It is needed to allocate buffers before issuing IO operation.
3462 	 * For write operation we need to pull buffers from memory domain before submitting IO.
3463 	 * Once read operation completes, we need to use memory_domain push functionality to
3464 	 * update data in original memory domain IO buffer
3465 	 * This IO request will go through a regular IO flow, so clear memory domains pointers */
3466 	bdev_io->u.bdev.memory_domain = NULL;
3467 	bdev_io->u.bdev.memory_domain_ctx = NULL;
3468 	_bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3469 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3470 }
3471 
3472 static inline void
3473 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3474 {
3475 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3476 	bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io);
3477 
3478 	if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3479 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3480 		bdev_io_complete_unsubmitted(bdev_io);
3481 		return;
3482 	}
3483 
3484 	/* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does
3485 	 * support them, but we need to execute an accel sequence and the data buffer is from accel
3486 	 * memory domain (to avoid doing a push/pull from that domain).
3487 	 */
3488 	if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) ||
3489 	    (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) {
3490 		_bdev_io_ext_use_bounce_buffer(bdev_io);
3491 		return;
3492 	}
3493 
3494 	if (needs_exec) {
3495 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3496 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3497 			return;
3498 		}
3499 		/* For reads we'll execute the sequence after the data is read, so, for now, only
3500 		 * clear out accel_sequence pointer and submit the IO */
3501 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3502 		bdev_io->u.bdev.accel_sequence = NULL;
3503 	}
3504 
3505 	bdev_io_submit(bdev_io);
3506 }
3507 
3508 static void
3509 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
3510 {
3511 	struct spdk_bdev *bdev = bdev_io->bdev;
3512 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3513 	struct spdk_io_channel *ch = bdev_ch->channel;
3514 
3515 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3516 
3517 	bdev_io->internal.in_submit_request = true;
3518 	bdev_submit_request(bdev, ch, bdev_io);
3519 	bdev_io->internal.in_submit_request = false;
3520 }
3521 
3522 void
3523 bdev_io_init(struct spdk_bdev_io *bdev_io,
3524 	     struct spdk_bdev *bdev, void *cb_arg,
3525 	     spdk_bdev_io_completion_cb cb)
3526 {
3527 	bdev_io->bdev = bdev;
3528 	bdev_io->internal.caller_ctx = cb_arg;
3529 	bdev_io->internal.cb = cb;
3530 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3531 	bdev_io->internal.in_submit_request = false;
3532 	bdev_io->internal.buf = NULL;
3533 	bdev_io->internal.io_submit_ch = NULL;
3534 	bdev_io->internal.orig_iovs = NULL;
3535 	bdev_io->internal.orig_iovcnt = 0;
3536 	bdev_io->internal.orig_md_iov.iov_base = NULL;
3537 	bdev_io->internal.error.nvme.cdw0 = 0;
3538 	bdev_io->num_retries = 0;
3539 	bdev_io->internal.get_buf_cb = NULL;
3540 	bdev_io->internal.get_aux_buf_cb = NULL;
3541 	bdev_io->internal.memory_domain = NULL;
3542 	bdev_io->internal.memory_domain_ctx = NULL;
3543 	bdev_io->internal.data_transfer_cpl = NULL;
3544 	bdev_io->internal.split = bdev_io_should_split(bdev_io);
3545 	bdev_io->internal.accel_sequence = NULL;
3546 }
3547 
3548 static bool
3549 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3550 {
3551 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
3552 }
3553 
3554 bool
3555 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3556 {
3557 	bool supported;
3558 
3559 	supported = bdev_io_type_supported(bdev, io_type);
3560 
3561 	if (!supported) {
3562 		switch (io_type) {
3563 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3564 			/* The bdev layer will emulate write zeroes as long as write is supported. */
3565 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3566 			break;
3567 		default:
3568 			break;
3569 		}
3570 	}
3571 
3572 	return supported;
3573 }
3574 
3575 uint64_t
3576 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3577 {
3578 	return bdev_io->internal.submit_tsc;
3579 }
3580 
3581 int
3582 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3583 {
3584 	if (bdev->fn_table->dump_info_json) {
3585 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3586 	}
3587 
3588 	return 0;
3589 }
3590 
3591 static void
3592 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3593 {
3594 	uint32_t max_per_timeslice = 0;
3595 	int i;
3596 
3597 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3598 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3599 			qos->rate_limits[i].max_per_timeslice = 0;
3600 			continue;
3601 		}
3602 
3603 		max_per_timeslice = qos->rate_limits[i].limit *
3604 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3605 
3606 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3607 							qos->rate_limits[i].min_per_timeslice);
3608 
3609 		qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice;
3610 	}
3611 
3612 	bdev_qos_set_ops(qos);
3613 }
3614 
3615 static int
3616 bdev_channel_poll_qos(void *arg)
3617 {
3618 	struct spdk_bdev_qos *qos = arg;
3619 	uint64_t now = spdk_get_ticks();
3620 	int i;
3621 
3622 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3623 		/* We received our callback earlier than expected - return
3624 		 *  immediately and wait to do accounting until at least one
3625 		 *  timeslice has actually expired.  This should never happen
3626 		 *  with a well-behaved timer implementation.
3627 		 */
3628 		return SPDK_POLLER_IDLE;
3629 	}
3630 
3631 	/* Reset for next round of rate limiting */
3632 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3633 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3634 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3635 		 * here, we'll account for the overrun so that the next timeslice will
3636 		 * be appropriately reduced.
3637 		 */
3638 		if (qos->rate_limits[i].remaining_this_timeslice > 0) {
3639 			qos->rate_limits[i].remaining_this_timeslice = 0;
3640 		}
3641 	}
3642 
3643 	while (now >= (qos->last_timeslice + qos->timeslice_size)) {
3644 		qos->last_timeslice += qos->timeslice_size;
3645 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3646 			qos->rate_limits[i].remaining_this_timeslice +=
3647 				qos->rate_limits[i].max_per_timeslice;
3648 		}
3649 	}
3650 
3651 	return bdev_qos_io_submit(qos->ch, qos);
3652 }
3653 
3654 static void
3655 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
3656 {
3657 	struct spdk_bdev_shared_resource *shared_resource;
3658 	struct lba_range *range;
3659 
3660 	bdev_free_io_stat(ch->stat);
3661 #ifdef SPDK_CONFIG_VTUNE
3662 	bdev_free_io_stat(ch->prev_stat);
3663 #endif
3664 
3665 	while (!TAILQ_EMPTY(&ch->locked_ranges)) {
3666 		range = TAILQ_FIRST(&ch->locked_ranges);
3667 		TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
3668 		free(range);
3669 	}
3670 
3671 	spdk_put_io_channel(ch->channel);
3672 	spdk_put_io_channel(ch->accel_channel);
3673 
3674 	shared_resource = ch->shared_resource;
3675 
3676 	assert(TAILQ_EMPTY(&ch->io_locked));
3677 	assert(TAILQ_EMPTY(&ch->io_submitted));
3678 	assert(TAILQ_EMPTY(&ch->io_accel_exec));
3679 	assert(TAILQ_EMPTY(&ch->io_memory_domain));
3680 	assert(ch->io_outstanding == 0);
3681 	assert(shared_resource->ref > 0);
3682 	shared_resource->ref--;
3683 	if (shared_resource->ref == 0) {
3684 		assert(shared_resource->io_outstanding == 0);
3685 		TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
3686 		spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
3687 		free(shared_resource);
3688 	}
3689 }
3690 
3691 static void
3692 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
3693 {
3694 	struct spdk_bdev_qos	*qos = bdev->internal.qos;
3695 	int			i;
3696 
3697 	assert(spdk_spin_held(&bdev->internal.spinlock));
3698 
3699 	/* Rate limiting on this bdev enabled */
3700 	if (qos) {
3701 		if (qos->ch == NULL) {
3702 			struct spdk_io_channel *io_ch;
3703 
3704 			SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
3705 				      bdev->name, spdk_get_thread());
3706 
3707 			/* No qos channel has been selected, so set one up */
3708 
3709 			/* Take another reference to ch */
3710 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
3711 			assert(io_ch != NULL);
3712 			qos->ch = ch;
3713 
3714 			qos->thread = spdk_io_channel_get_thread(io_ch);
3715 
3716 			TAILQ_INIT(&qos->queued);
3717 
3718 			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3719 				if (bdev_qos_is_iops_rate_limit(i) == true) {
3720 					qos->rate_limits[i].min_per_timeslice =
3721 						SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
3722 				} else {
3723 					qos->rate_limits[i].min_per_timeslice =
3724 						SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
3725 				}
3726 
3727 				if (qos->rate_limits[i].limit == 0) {
3728 					qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
3729 				}
3730 			}
3731 			bdev_qos_update_max_quota_per_timeslice(qos);
3732 			qos->timeslice_size =
3733 				SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
3734 			qos->last_timeslice = spdk_get_ticks();
3735 			qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
3736 							   qos,
3737 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
3738 		}
3739 
3740 		ch->flags |= BDEV_CH_QOS_ENABLED;
3741 	}
3742 }
3743 
3744 struct poll_timeout_ctx {
3745 	struct spdk_bdev_desc	*desc;
3746 	uint64_t		timeout_in_sec;
3747 	spdk_bdev_io_timeout_cb	cb_fn;
3748 	void			*cb_arg;
3749 };
3750 
3751 static void
3752 bdev_desc_free(struct spdk_bdev_desc *desc)
3753 {
3754 	spdk_spin_destroy(&desc->spinlock);
3755 	free(desc->media_events_buffer);
3756 	free(desc);
3757 }
3758 
3759 static void
3760 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
3761 {
3762 	struct poll_timeout_ctx *ctx  = _ctx;
3763 	struct spdk_bdev_desc *desc = ctx->desc;
3764 
3765 	free(ctx);
3766 
3767 	spdk_spin_lock(&desc->spinlock);
3768 	desc->refs--;
3769 	if (desc->closed == true && desc->refs == 0) {
3770 		spdk_spin_unlock(&desc->spinlock);
3771 		bdev_desc_free(desc);
3772 		return;
3773 	}
3774 	spdk_spin_unlock(&desc->spinlock);
3775 }
3776 
3777 static void
3778 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3779 			     struct spdk_io_channel *io_ch, void *_ctx)
3780 {
3781 	struct poll_timeout_ctx *ctx  = _ctx;
3782 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3783 	struct spdk_bdev_desc *desc = ctx->desc;
3784 	struct spdk_bdev_io *bdev_io;
3785 	uint64_t now;
3786 
3787 	spdk_spin_lock(&desc->spinlock);
3788 	if (desc->closed == true) {
3789 		spdk_spin_unlock(&desc->spinlock);
3790 		spdk_bdev_for_each_channel_continue(i, -1);
3791 		return;
3792 	}
3793 	spdk_spin_unlock(&desc->spinlock);
3794 
3795 	now = spdk_get_ticks();
3796 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
3797 		/* Exclude any I/O that are generated via splitting. */
3798 		if (bdev_io->internal.cb == bdev_io_split_done) {
3799 			continue;
3800 		}
3801 
3802 		/* Once we find an I/O that has not timed out, we can immediately
3803 		 * exit the loop.
3804 		 */
3805 		if (now < (bdev_io->internal.submit_tsc +
3806 			   ctx->timeout_in_sec * spdk_get_ticks_hz())) {
3807 			goto end;
3808 		}
3809 
3810 		if (bdev_io->internal.desc == desc) {
3811 			ctx->cb_fn(ctx->cb_arg, bdev_io);
3812 		}
3813 	}
3814 
3815 end:
3816 	spdk_bdev_for_each_channel_continue(i, 0);
3817 }
3818 
3819 static int
3820 bdev_poll_timeout_io(void *arg)
3821 {
3822 	struct spdk_bdev_desc *desc = arg;
3823 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
3824 	struct poll_timeout_ctx *ctx;
3825 
3826 	ctx = calloc(1, sizeof(struct poll_timeout_ctx));
3827 	if (!ctx) {
3828 		SPDK_ERRLOG("failed to allocate memory\n");
3829 		return SPDK_POLLER_BUSY;
3830 	}
3831 	ctx->desc = desc;
3832 	ctx->cb_arg = desc->cb_arg;
3833 	ctx->cb_fn = desc->cb_fn;
3834 	ctx->timeout_in_sec = desc->timeout_in_sec;
3835 
3836 	/* Take a ref on the descriptor in case it gets closed while we are checking
3837 	 * all of the channels.
3838 	 */
3839 	spdk_spin_lock(&desc->spinlock);
3840 	desc->refs++;
3841 	spdk_spin_unlock(&desc->spinlock);
3842 
3843 	spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
3844 				   bdev_channel_poll_timeout_io_done);
3845 
3846 	return SPDK_POLLER_BUSY;
3847 }
3848 
3849 int
3850 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
3851 		      spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
3852 {
3853 	assert(desc->thread == spdk_get_thread());
3854 
3855 	spdk_poller_unregister(&desc->io_timeout_poller);
3856 
3857 	if (timeout_in_sec) {
3858 		assert(cb_fn != NULL);
3859 		desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
3860 					  desc,
3861 					  SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
3862 					  1000);
3863 		if (desc->io_timeout_poller == NULL) {
3864 			SPDK_ERRLOG("can not register the desc timeout IO poller\n");
3865 			return -1;
3866 		}
3867 	}
3868 
3869 	desc->cb_fn = cb_fn;
3870 	desc->cb_arg = cb_arg;
3871 	desc->timeout_in_sec = timeout_in_sec;
3872 
3873 	return 0;
3874 }
3875 
3876 static int
3877 bdev_channel_create(void *io_device, void *ctx_buf)
3878 {
3879 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
3880 	struct spdk_bdev_channel	*ch = ctx_buf;
3881 	struct spdk_io_channel		*mgmt_io_ch;
3882 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
3883 	struct spdk_bdev_shared_resource *shared_resource;
3884 	struct lba_range		*range;
3885 
3886 	ch->bdev = bdev;
3887 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
3888 	if (!ch->channel) {
3889 		return -1;
3890 	}
3891 
3892 	ch->accel_channel = spdk_accel_get_io_channel();
3893 	if (!ch->accel_channel) {
3894 		spdk_put_io_channel(ch->channel);
3895 		return -1;
3896 	}
3897 
3898 	spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name,
3899 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
3900 
3901 	assert(ch->histogram == NULL);
3902 	if (bdev->internal.histogram_enabled) {
3903 		ch->histogram = spdk_histogram_data_alloc();
3904 		if (ch->histogram == NULL) {
3905 			SPDK_ERRLOG("Could not allocate histogram\n");
3906 		}
3907 	}
3908 
3909 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
3910 	if (!mgmt_io_ch) {
3911 		spdk_put_io_channel(ch->channel);
3912 		spdk_put_io_channel(ch->accel_channel);
3913 		return -1;
3914 	}
3915 
3916 	mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
3917 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
3918 		if (shared_resource->shared_ch == ch->channel) {
3919 			spdk_put_io_channel(mgmt_io_ch);
3920 			shared_resource->ref++;
3921 			break;
3922 		}
3923 	}
3924 
3925 	if (shared_resource == NULL) {
3926 		shared_resource = calloc(1, sizeof(*shared_resource));
3927 		if (shared_resource == NULL) {
3928 			spdk_put_io_channel(ch->channel);
3929 			spdk_put_io_channel(ch->accel_channel);
3930 			spdk_put_io_channel(mgmt_io_ch);
3931 			return -1;
3932 		}
3933 
3934 		shared_resource->mgmt_ch = mgmt_ch;
3935 		shared_resource->io_outstanding = 0;
3936 		TAILQ_INIT(&shared_resource->nomem_io);
3937 		shared_resource->nomem_threshold = 0;
3938 		shared_resource->shared_ch = ch->channel;
3939 		shared_resource->ref = 1;
3940 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
3941 	}
3942 
3943 	ch->io_outstanding = 0;
3944 	TAILQ_INIT(&ch->queued_resets);
3945 	TAILQ_INIT(&ch->locked_ranges);
3946 	ch->flags = 0;
3947 	ch->shared_resource = shared_resource;
3948 
3949 	TAILQ_INIT(&ch->io_submitted);
3950 	TAILQ_INIT(&ch->io_locked);
3951 	TAILQ_INIT(&ch->io_accel_exec);
3952 	TAILQ_INIT(&ch->io_memory_domain);
3953 
3954 	ch->stat = bdev_alloc_io_stat(false);
3955 	if (ch->stat == NULL) {
3956 		bdev_channel_destroy_resource(ch);
3957 		return -1;
3958 	}
3959 
3960 	ch->stat->ticks_rate = spdk_get_ticks_hz();
3961 
3962 #ifdef SPDK_CONFIG_VTUNE
3963 	{
3964 		char *name;
3965 		__itt_init_ittlib(NULL, 0);
3966 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
3967 		if (!name) {
3968 			bdev_channel_destroy_resource(ch);
3969 			return -1;
3970 		}
3971 		ch->handle = __itt_string_handle_create(name);
3972 		free(name);
3973 		ch->start_tsc = spdk_get_ticks();
3974 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
3975 		ch->prev_stat = bdev_alloc_io_stat(false);
3976 		if (ch->prev_stat == NULL) {
3977 			bdev_channel_destroy_resource(ch);
3978 			return -1;
3979 		}
3980 	}
3981 #endif
3982 
3983 	spdk_spin_lock(&bdev->internal.spinlock);
3984 	bdev_enable_qos(bdev, ch);
3985 
3986 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
3987 		struct lba_range *new_range;
3988 
3989 		new_range = calloc(1, sizeof(*new_range));
3990 		if (new_range == NULL) {
3991 			spdk_spin_unlock(&bdev->internal.spinlock);
3992 			bdev_channel_destroy_resource(ch);
3993 			return -1;
3994 		}
3995 		new_range->length = range->length;
3996 		new_range->offset = range->offset;
3997 		new_range->locked_ctx = range->locked_ctx;
3998 		TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
3999 	}
4000 
4001 	spdk_spin_unlock(&bdev->internal.spinlock);
4002 
4003 	return 0;
4004 }
4005 
4006 static int
4007 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
4008 			 void *cb_ctx)
4009 {
4010 	struct spdk_bdev_channel *bdev_ch = cb_ctx;
4011 	struct spdk_bdev_io *bdev_io;
4012 	uint64_t buf_len;
4013 
4014 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4015 	if (bdev_io->internal.ch == bdev_ch) {
4016 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
4017 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4018 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4019 	}
4020 
4021 	return 0;
4022 }
4023 
4024 /*
4025  * Abort I/O that are waiting on a data buffer.
4026  */
4027 static void
4028 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
4029 {
4030 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4031 				  bdev_abort_all_buf_io_cb, ch);
4032 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4033 				  bdev_abort_all_buf_io_cb, ch);
4034 }
4035 
4036 /*
4037  * Abort I/O that are queued waiting for submission.  These types of I/O are
4038  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
4039  */
4040 static void
4041 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
4042 {
4043 	struct spdk_bdev_io *bdev_io, *tmp;
4044 
4045 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
4046 		if (bdev_io->internal.ch == ch) {
4047 			TAILQ_REMOVE(queue, bdev_io, internal.link);
4048 			/*
4049 			 * spdk_bdev_io_complete() assumes that the completed I/O had
4050 			 *  been submitted to the bdev module.  Since in this case it
4051 			 *  hadn't, bump io_outstanding to account for the decrement
4052 			 *  that spdk_bdev_io_complete() will do.
4053 			 */
4054 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
4055 				ch->io_outstanding++;
4056 				ch->shared_resource->io_outstanding++;
4057 			}
4058 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4059 		}
4060 	}
4061 }
4062 
4063 static bool
4064 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
4065 {
4066 	struct spdk_bdev_io *bdev_io;
4067 
4068 	TAILQ_FOREACH(bdev_io, queue, internal.link) {
4069 		if (bdev_io == bio_to_abort) {
4070 			TAILQ_REMOVE(queue, bio_to_abort, internal.link);
4071 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4072 			return true;
4073 		}
4074 	}
4075 
4076 	return false;
4077 }
4078 
4079 static int
4080 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
4081 {
4082 	struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
4083 	uint64_t buf_len;
4084 
4085 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4086 	if (bdev_io == bio_to_abort) {
4087 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
4088 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4089 		spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4090 		return 1;
4091 	}
4092 
4093 	return 0;
4094 }
4095 
4096 static bool
4097 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
4098 {
4099 	int rc;
4100 
4101 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4102 				       bdev_abort_buf_io_cb, bio_to_abort);
4103 	if (rc == 1) {
4104 		return true;
4105 	}
4106 
4107 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4108 				       bdev_abort_buf_io_cb, bio_to_abort);
4109 	return rc == 1;
4110 }
4111 
4112 static void
4113 bdev_qos_channel_destroy(void *cb_arg)
4114 {
4115 	struct spdk_bdev_qos *qos = cb_arg;
4116 
4117 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
4118 	spdk_poller_unregister(&qos->poller);
4119 
4120 	SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
4121 
4122 	free(qos);
4123 }
4124 
4125 static int
4126 bdev_qos_destroy(struct spdk_bdev *bdev)
4127 {
4128 	int i;
4129 
4130 	/*
4131 	 * Cleanly shutting down the QoS poller is tricky, because
4132 	 * during the asynchronous operation the user could open
4133 	 * a new descriptor and create a new channel, spawning
4134 	 * a new QoS poller.
4135 	 *
4136 	 * The strategy is to create a new QoS structure here and swap it
4137 	 * in. The shutdown path then continues to refer to the old one
4138 	 * until it completes and then releases it.
4139 	 */
4140 	struct spdk_bdev_qos *new_qos, *old_qos;
4141 
4142 	old_qos = bdev->internal.qos;
4143 
4144 	new_qos = calloc(1, sizeof(*new_qos));
4145 	if (!new_qos) {
4146 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
4147 		return -ENOMEM;
4148 	}
4149 
4150 	/* Copy the old QoS data into the newly allocated structure */
4151 	memcpy(new_qos, old_qos, sizeof(*new_qos));
4152 
4153 	/* Zero out the key parts of the QoS structure */
4154 	new_qos->ch = NULL;
4155 	new_qos->thread = NULL;
4156 	new_qos->poller = NULL;
4157 	TAILQ_INIT(&new_qos->queued);
4158 	/*
4159 	 * The limit member of spdk_bdev_qos_limit structure is not zeroed.
4160 	 * It will be used later for the new QoS structure.
4161 	 */
4162 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4163 		new_qos->rate_limits[i].remaining_this_timeslice = 0;
4164 		new_qos->rate_limits[i].min_per_timeslice = 0;
4165 		new_qos->rate_limits[i].max_per_timeslice = 0;
4166 	}
4167 
4168 	bdev->internal.qos = new_qos;
4169 
4170 	if (old_qos->thread == NULL) {
4171 		free(old_qos);
4172 	} else {
4173 		spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
4174 	}
4175 
4176 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
4177 	 * been destroyed yet. The destruction path will end up waiting for the final
4178 	 * channel to be put before it releases resources. */
4179 
4180 	return 0;
4181 }
4182 
4183 void
4184 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
4185 {
4186 	total->bytes_read += add->bytes_read;
4187 	total->num_read_ops += add->num_read_ops;
4188 	total->bytes_written += add->bytes_written;
4189 	total->num_write_ops += add->num_write_ops;
4190 	total->bytes_unmapped += add->bytes_unmapped;
4191 	total->num_unmap_ops += add->num_unmap_ops;
4192 	total->bytes_copied += add->bytes_copied;
4193 	total->num_copy_ops += add->num_copy_ops;
4194 	total->read_latency_ticks += add->read_latency_ticks;
4195 	total->write_latency_ticks += add->write_latency_ticks;
4196 	total->unmap_latency_ticks += add->unmap_latency_ticks;
4197 	total->copy_latency_ticks += add->copy_latency_ticks;
4198 	if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
4199 		total->max_read_latency_ticks = add->max_read_latency_ticks;
4200 	}
4201 	if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
4202 		total->min_read_latency_ticks = add->min_read_latency_ticks;
4203 	}
4204 	if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
4205 		total->max_write_latency_ticks = add->max_write_latency_ticks;
4206 	}
4207 	if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
4208 		total->min_write_latency_ticks = add->min_write_latency_ticks;
4209 	}
4210 	if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
4211 		total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
4212 	}
4213 	if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
4214 		total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
4215 	}
4216 	if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
4217 		total->max_copy_latency_ticks = add->max_copy_latency_ticks;
4218 	}
4219 	if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
4220 		total->min_copy_latency_ticks = add->min_copy_latency_ticks;
4221 	}
4222 }
4223 
4224 static void
4225 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
4226 {
4227 	memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error));
4228 
4229 	if (to_stat->io_error != NULL && from_stat->io_error != NULL) {
4230 		memcpy(to_stat->io_error, from_stat->io_error,
4231 		       sizeof(struct spdk_bdev_io_error_stat));
4232 	}
4233 }
4234 
4235 void
4236 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode)
4237 {
4238 	stat->max_read_latency_ticks = 0;
4239 	stat->min_read_latency_ticks = UINT64_MAX;
4240 	stat->max_write_latency_ticks = 0;
4241 	stat->min_write_latency_ticks = UINT64_MAX;
4242 	stat->max_unmap_latency_ticks = 0;
4243 	stat->min_unmap_latency_ticks = UINT64_MAX;
4244 	stat->max_copy_latency_ticks = 0;
4245 	stat->min_copy_latency_ticks = UINT64_MAX;
4246 
4247 	if (mode != SPDK_BDEV_RESET_STAT_ALL) {
4248 		return;
4249 	}
4250 
4251 	stat->bytes_read = 0;
4252 	stat->num_read_ops = 0;
4253 	stat->bytes_written = 0;
4254 	stat->num_write_ops = 0;
4255 	stat->bytes_unmapped = 0;
4256 	stat->num_unmap_ops = 0;
4257 	stat->bytes_copied = 0;
4258 	stat->num_copy_ops = 0;
4259 	stat->read_latency_ticks = 0;
4260 	stat->write_latency_ticks = 0;
4261 	stat->unmap_latency_ticks = 0;
4262 	stat->copy_latency_ticks = 0;
4263 
4264 	if (stat->io_error != NULL) {
4265 		memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat));
4266 	}
4267 }
4268 
4269 struct spdk_bdev_io_stat *
4270 bdev_alloc_io_stat(bool io_error_stat)
4271 {
4272 	struct spdk_bdev_io_stat *stat;
4273 
4274 	stat = malloc(sizeof(struct spdk_bdev_io_stat));
4275 	if (stat == NULL) {
4276 		return NULL;
4277 	}
4278 
4279 	if (io_error_stat) {
4280 		stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat));
4281 		if (stat->io_error == NULL) {
4282 			free(stat);
4283 			return NULL;
4284 		}
4285 	} else {
4286 		stat->io_error = NULL;
4287 	}
4288 
4289 	spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL);
4290 
4291 	return stat;
4292 }
4293 
4294 void
4295 bdev_free_io_stat(struct spdk_bdev_io_stat *stat)
4296 {
4297 	if (stat != NULL) {
4298 		free(stat->io_error);
4299 		free(stat);
4300 	}
4301 }
4302 
4303 void
4304 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
4305 {
4306 	int i;
4307 
4308 	spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
4309 	spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
4310 	spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
4311 	spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
4312 	spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
4313 	spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
4314 	spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
4315 	spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
4316 	spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
4317 	spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
4318 	spdk_json_write_named_uint64(w, "min_read_latency_ticks",
4319 				     stat->min_read_latency_ticks != UINT64_MAX ?
4320 				     stat->min_read_latency_ticks : 0);
4321 	spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
4322 	spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
4323 	spdk_json_write_named_uint64(w, "min_write_latency_ticks",
4324 				     stat->min_write_latency_ticks != UINT64_MAX ?
4325 				     stat->min_write_latency_ticks : 0);
4326 	spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
4327 	spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
4328 	spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
4329 				     stat->min_unmap_latency_ticks != UINT64_MAX ?
4330 				     stat->min_unmap_latency_ticks : 0);
4331 	spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
4332 	spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
4333 	spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
4334 				     stat->min_copy_latency_ticks != UINT64_MAX ?
4335 				     stat->min_copy_latency_ticks : 0);
4336 
4337 	if (stat->io_error != NULL) {
4338 		spdk_json_write_named_object_begin(w, "io_error");
4339 		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
4340 			if (stat->io_error->error_status[i] != 0) {
4341 				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
4342 							     stat->io_error->error_status[i]);
4343 			}
4344 		}
4345 		spdk_json_write_object_end(w);
4346 	}
4347 }
4348 
4349 static void
4350 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
4351 {
4352 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4353 	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
4354 
4355 	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
4356 	bdev_abort_all_buf_io(mgmt_ch, ch);
4357 }
4358 
4359 static void
4360 bdev_channel_destroy(void *io_device, void *ctx_buf)
4361 {
4362 	struct spdk_bdev_channel *ch = ctx_buf;
4363 
4364 	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
4365 		      spdk_get_thread());
4366 
4367 	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
4368 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4369 
4370 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
4371 	spdk_spin_lock(&ch->bdev->internal.spinlock);
4372 	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
4373 	spdk_spin_unlock(&ch->bdev->internal.spinlock);
4374 
4375 	bdev_abort_all_queued_io(&ch->queued_resets, ch);
4376 
4377 	bdev_channel_abort_queued_ios(ch);
4378 
4379 	if (ch->histogram) {
4380 		spdk_histogram_data_free(ch->histogram);
4381 	}
4382 
4383 	bdev_channel_destroy_resource(ch);
4384 }
4385 
4386 /*
4387  * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
4388  * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4389  */
4390 static int
4391 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
4392 {
4393 	struct spdk_bdev_name *tmp;
4394 
4395 	bdev_name->name = strdup(name);
4396 	if (bdev_name->name == NULL) {
4397 		SPDK_ERRLOG("Unable to allocate bdev name\n");
4398 		return -ENOMEM;
4399 	}
4400 
4401 	bdev_name->bdev = bdev;
4402 
4403 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4404 	tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4405 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4406 
4407 	if (tmp != NULL) {
4408 		SPDK_ERRLOG("Bdev name %s already exists\n", name);
4409 		free(bdev_name->name);
4410 		return -EEXIST;
4411 	}
4412 
4413 	return 0;
4414 }
4415 
4416 static void
4417 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
4418 {
4419 	RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4420 	free(bdev_name->name);
4421 }
4422 
4423 static void
4424 bdev_name_del(struct spdk_bdev_name *bdev_name)
4425 {
4426 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4427 	bdev_name_del_unsafe(bdev_name);
4428 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4429 }
4430 
4431 int
4432 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
4433 {
4434 	struct spdk_bdev_alias *tmp;
4435 	int ret;
4436 
4437 	if (alias == NULL) {
4438 		SPDK_ERRLOG("Empty alias passed\n");
4439 		return -EINVAL;
4440 	}
4441 
4442 	tmp = calloc(1, sizeof(*tmp));
4443 	if (tmp == NULL) {
4444 		SPDK_ERRLOG("Unable to allocate alias\n");
4445 		return -ENOMEM;
4446 	}
4447 
4448 	ret = bdev_name_add(&tmp->alias, bdev, alias);
4449 	if (ret != 0) {
4450 		free(tmp);
4451 		return ret;
4452 	}
4453 
4454 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
4455 
4456 	return 0;
4457 }
4458 
4459 static int
4460 bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
4461 	       void (*alias_del_fn)(struct spdk_bdev_name *n))
4462 {
4463 	struct spdk_bdev_alias *tmp;
4464 
4465 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
4466 		if (strcmp(alias, tmp->alias.name) == 0) {
4467 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
4468 			alias_del_fn(&tmp->alias);
4469 			free(tmp);
4470 			return 0;
4471 		}
4472 	}
4473 
4474 	return -ENOENT;
4475 }
4476 
4477 int
4478 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
4479 {
4480 	int rc;
4481 
4482 	rc = bdev_alias_del(bdev, alias, bdev_name_del);
4483 	if (rc == -ENOENT) {
4484 		SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
4485 	}
4486 
4487 	return rc;
4488 }
4489 
4490 void
4491 spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
4492 {
4493 	struct spdk_bdev_alias *p, *tmp;
4494 
4495 	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
4496 		TAILQ_REMOVE(&bdev->aliases, p, tailq);
4497 		bdev_name_del(&p->alias);
4498 		free(p);
4499 	}
4500 }
4501 
4502 struct spdk_io_channel *
4503 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
4504 {
4505 	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
4506 }
4507 
4508 void *
4509 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
4510 {
4511 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4512 	void *ctx = NULL;
4513 
4514 	if (bdev->fn_table->get_module_ctx) {
4515 		ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
4516 	}
4517 
4518 	return ctx;
4519 }
4520 
4521 const char *
4522 spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
4523 {
4524 	return bdev->module->name;
4525 }
4526 
4527 const char *
4528 spdk_bdev_get_name(const struct spdk_bdev *bdev)
4529 {
4530 	return bdev->name;
4531 }
4532 
4533 const char *
4534 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
4535 {
4536 	return bdev->product_name;
4537 }
4538 
4539 const struct spdk_bdev_aliases_list *
4540 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
4541 {
4542 	return &bdev->aliases;
4543 }
4544 
4545 uint32_t
4546 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
4547 {
4548 	return bdev->blocklen;
4549 }
4550 
4551 uint32_t
4552 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
4553 {
4554 	return bdev->write_unit_size;
4555 }
4556 
4557 uint64_t
4558 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
4559 {
4560 	return bdev->blockcnt;
4561 }
4562 
4563 const char *
4564 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
4565 {
4566 	return qos_rpc_type[type];
4567 }
4568 
4569 void
4570 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
4571 {
4572 	int i;
4573 
4574 	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
4575 
4576 	spdk_spin_lock(&bdev->internal.spinlock);
4577 	if (bdev->internal.qos) {
4578 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4579 			if (bdev->internal.qos->rate_limits[i].limit !=
4580 			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4581 				limits[i] = bdev->internal.qos->rate_limits[i].limit;
4582 				if (bdev_qos_is_iops_rate_limit(i) == false) {
4583 					/* Change from Byte to Megabyte which is user visible. */
4584 					limits[i] = limits[i] / 1024 / 1024;
4585 				}
4586 			}
4587 		}
4588 	}
4589 	spdk_spin_unlock(&bdev->internal.spinlock);
4590 }
4591 
4592 size_t
4593 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
4594 {
4595 	return 1 << bdev->required_alignment;
4596 }
4597 
4598 uint32_t
4599 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
4600 {
4601 	return bdev->optimal_io_boundary;
4602 }
4603 
4604 bool
4605 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
4606 {
4607 	return bdev->write_cache;
4608 }
4609 
4610 const struct spdk_uuid *
4611 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
4612 {
4613 	return &bdev->uuid;
4614 }
4615 
4616 uint16_t
4617 spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
4618 {
4619 	return bdev->acwu;
4620 }
4621 
4622 uint32_t
4623 spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
4624 {
4625 	return bdev->md_len;
4626 }
4627 
4628 bool
4629 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
4630 {
4631 	return (bdev->md_len != 0) && bdev->md_interleave;
4632 }
4633 
4634 bool
4635 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
4636 {
4637 	return (bdev->md_len != 0) && !bdev->md_interleave;
4638 }
4639 
4640 bool
4641 spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
4642 {
4643 	return bdev->zoned;
4644 }
4645 
4646 uint32_t
4647 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
4648 {
4649 	if (spdk_bdev_is_md_interleaved(bdev)) {
4650 		return bdev->blocklen - bdev->md_len;
4651 	} else {
4652 		return bdev->blocklen;
4653 	}
4654 }
4655 
4656 uint32_t
4657 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
4658 {
4659 	return bdev->phys_blocklen;
4660 }
4661 
4662 static uint32_t
4663 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
4664 {
4665 	if (!spdk_bdev_is_md_interleaved(bdev)) {
4666 		return bdev->blocklen + bdev->md_len;
4667 	} else {
4668 		return bdev->blocklen;
4669 	}
4670 }
4671 
4672 /* We have to use the typedef in the function declaration to appease astyle. */
4673 typedef enum spdk_dif_type spdk_dif_type_t;
4674 
4675 spdk_dif_type_t
4676 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
4677 {
4678 	if (bdev->md_len != 0) {
4679 		return bdev->dif_type;
4680 	} else {
4681 		return SPDK_DIF_DISABLE;
4682 	}
4683 }
4684 
4685 bool
4686 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
4687 {
4688 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
4689 		return bdev->dif_is_head_of_md;
4690 	} else {
4691 		return false;
4692 	}
4693 }
4694 
4695 bool
4696 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
4697 			       enum spdk_dif_check_type check_type)
4698 {
4699 	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
4700 		return false;
4701 	}
4702 
4703 	switch (check_type) {
4704 	case SPDK_DIF_CHECK_TYPE_REFTAG:
4705 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
4706 	case SPDK_DIF_CHECK_TYPE_APPTAG:
4707 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
4708 	case SPDK_DIF_CHECK_TYPE_GUARD:
4709 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
4710 	default:
4711 		return false;
4712 	}
4713 }
4714 
4715 static uint32_t
4716 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes)
4717 {
4718 	uint64_t aligned_length, max_write_blocks;
4719 
4720 	aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1);
4721 	max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev);
4722 	max_write_blocks -= max_write_blocks % bdev->write_unit_size;
4723 
4724 	return max_write_blocks;
4725 }
4726 
4727 uint32_t
4728 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
4729 {
4730 	return bdev->max_copy;
4731 }
4732 
4733 uint64_t
4734 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
4735 {
4736 	return bdev->internal.measured_queue_depth;
4737 }
4738 
4739 uint64_t
4740 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
4741 {
4742 	return bdev->internal.period;
4743 }
4744 
4745 uint64_t
4746 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
4747 {
4748 	return bdev->internal.weighted_io_time;
4749 }
4750 
4751 uint64_t
4752 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
4753 {
4754 	return bdev->internal.io_time;
4755 }
4756 
4757 static void bdev_update_qd_sampling_period(void *ctx);
4758 
4759 static void
4760 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
4761 {
4762 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
4763 
4764 	if (bdev->internal.measured_queue_depth) {
4765 		bdev->internal.io_time += bdev->internal.period;
4766 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
4767 	}
4768 
4769 	bdev->internal.qd_poll_in_progress = false;
4770 
4771 	bdev_update_qd_sampling_period(bdev);
4772 }
4773 
4774 static void
4775 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4776 		       struct spdk_io_channel *io_ch, void *_ctx)
4777 {
4778 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
4779 
4780 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
4781 	spdk_bdev_for_each_channel_continue(i, 0);
4782 }
4783 
4784 static int
4785 bdev_calculate_measured_queue_depth(void *ctx)
4786 {
4787 	struct spdk_bdev *bdev = ctx;
4788 
4789 	bdev->internal.qd_poll_in_progress = true;
4790 	bdev->internal.temporary_queue_depth = 0;
4791 	spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
4792 	return SPDK_POLLER_BUSY;
4793 }
4794 
4795 static void
4796 bdev_update_qd_sampling_period(void *ctx)
4797 {
4798 	struct spdk_bdev *bdev = ctx;
4799 
4800 	if (bdev->internal.period == bdev->internal.new_period) {
4801 		return;
4802 	}
4803 
4804 	if (bdev->internal.qd_poll_in_progress) {
4805 		return;
4806 	}
4807 
4808 	bdev->internal.period = bdev->internal.new_period;
4809 
4810 	spdk_poller_unregister(&bdev->internal.qd_poller);
4811 	if (bdev->internal.period != 0) {
4812 		bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4813 					   bdev, bdev->internal.period);
4814 	} else {
4815 		spdk_bdev_close(bdev->internal.qd_desc);
4816 		bdev->internal.qd_desc = NULL;
4817 	}
4818 }
4819 
4820 static void
4821 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
4822 {
4823 	SPDK_NOTICELOG("Unexpected event type: %d\n", type);
4824 }
4825 
4826 void
4827 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
4828 {
4829 	int rc;
4830 
4831 	if (bdev->internal.new_period == period) {
4832 		return;
4833 	}
4834 
4835 	bdev->internal.new_period = period;
4836 
4837 	if (bdev->internal.qd_desc != NULL) {
4838 		assert(bdev->internal.period != 0);
4839 
4840 		spdk_thread_send_msg(bdev->internal.qd_desc->thread,
4841 				     bdev_update_qd_sampling_period, bdev);
4842 		return;
4843 	}
4844 
4845 	assert(bdev->internal.period == 0);
4846 
4847 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
4848 				NULL, &bdev->internal.qd_desc);
4849 	if (rc != 0) {
4850 		return;
4851 	}
4852 
4853 	bdev->internal.period = period;
4854 	bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4855 				   bdev, period);
4856 }
4857 
4858 struct bdev_get_current_qd_ctx {
4859 	uint64_t current_qd;
4860 	spdk_bdev_get_current_qd_cb cb_fn;
4861 	void *cb_arg;
4862 };
4863 
4864 static void
4865 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
4866 {
4867 	struct bdev_get_current_qd_ctx *ctx = _ctx;
4868 
4869 	ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
4870 
4871 	free(ctx);
4872 }
4873 
4874 static void
4875 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4876 		    struct spdk_io_channel *io_ch, void *_ctx)
4877 {
4878 	struct bdev_get_current_qd_ctx *ctx = _ctx;
4879 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4880 
4881 	ctx->current_qd += bdev_ch->io_outstanding;
4882 
4883 	spdk_bdev_for_each_channel_continue(i, 0);
4884 }
4885 
4886 void
4887 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
4888 			 void *cb_arg)
4889 {
4890 	struct bdev_get_current_qd_ctx *ctx;
4891 
4892 	assert(cb_fn != NULL);
4893 
4894 	ctx = calloc(1, sizeof(*ctx));
4895 	if (ctx == NULL) {
4896 		cb_fn(bdev, 0, cb_arg, -ENOMEM);
4897 		return;
4898 	}
4899 
4900 	ctx->cb_fn = cb_fn;
4901 	ctx->cb_arg = cb_arg;
4902 
4903 	spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
4904 }
4905 
4906 static void
4907 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type)
4908 {
4909 	assert(desc->thread == spdk_get_thread());
4910 
4911 	spdk_spin_lock(&desc->spinlock);
4912 	desc->refs--;
4913 	if (!desc->closed) {
4914 		spdk_spin_unlock(&desc->spinlock);
4915 		desc->callback.event_fn(type,
4916 					desc->bdev,
4917 					desc->callback.ctx);
4918 		return;
4919 	} else if (desc->refs == 0) {
4920 		/* This descriptor was closed after this event_notify message was sent.
4921 		 * spdk_bdev_close() could not free the descriptor since this message was
4922 		 * in flight, so we free it now using bdev_desc_free().
4923 		 */
4924 		spdk_spin_unlock(&desc->spinlock);
4925 		bdev_desc_free(desc);
4926 		return;
4927 	}
4928 	spdk_spin_unlock(&desc->spinlock);
4929 }
4930 
4931 static void
4932 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn)
4933 {
4934 	spdk_spin_lock(&desc->spinlock);
4935 	desc->refs++;
4936 	spdk_thread_send_msg(desc->thread, event_notify_fn, desc);
4937 	spdk_spin_unlock(&desc->spinlock);
4938 }
4939 
4940 static void
4941 _resize_notify(void *ctx)
4942 {
4943 	struct spdk_bdev_desc *desc = ctx;
4944 
4945 	_event_notify(desc, SPDK_BDEV_EVENT_RESIZE);
4946 }
4947 
4948 int
4949 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
4950 {
4951 	struct spdk_bdev_desc *desc;
4952 	int ret;
4953 
4954 	if (size == bdev->blockcnt) {
4955 		return 0;
4956 	}
4957 
4958 	spdk_spin_lock(&bdev->internal.spinlock);
4959 
4960 	/* bdev has open descriptors */
4961 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
4962 	    bdev->blockcnt > size) {
4963 		ret = -EBUSY;
4964 	} else {
4965 		bdev->blockcnt = size;
4966 		TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
4967 			event_notify(desc, _resize_notify);
4968 		}
4969 		ret = 0;
4970 	}
4971 
4972 	spdk_spin_unlock(&bdev->internal.spinlock);
4973 
4974 	return ret;
4975 }
4976 
4977 /*
4978  * Convert I/O offset and length from bytes to blocks.
4979  *
4980  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
4981  */
4982 static uint64_t
4983 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
4984 		     uint64_t num_bytes, uint64_t *num_blocks)
4985 {
4986 	uint32_t block_size = bdev->blocklen;
4987 	uint8_t shift_cnt;
4988 
4989 	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
4990 	if (spdk_likely(spdk_u32_is_pow2(block_size))) {
4991 		shift_cnt = spdk_u32log2(block_size);
4992 		*offset_blocks = offset_bytes >> shift_cnt;
4993 		*num_blocks = num_bytes >> shift_cnt;
4994 		return (offset_bytes - (*offset_blocks << shift_cnt)) |
4995 		       (num_bytes - (*num_blocks << shift_cnt));
4996 	} else {
4997 		*offset_blocks = offset_bytes / block_size;
4998 		*num_blocks = num_bytes / block_size;
4999 		return (offset_bytes % block_size) | (num_bytes % block_size);
5000 	}
5001 }
5002 
5003 static bool
5004 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
5005 {
5006 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
5007 	 * has been an overflow and hence the offset has been wrapped around */
5008 	if (offset_blocks + num_blocks < offset_blocks) {
5009 		return false;
5010 	}
5011 
5012 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
5013 	if (offset_blocks + num_blocks > bdev->blockcnt) {
5014 		return false;
5015 	}
5016 
5017 	return true;
5018 }
5019 
5020 static void
5021 bdev_seek_complete_cb(void *ctx)
5022 {
5023 	struct spdk_bdev_io *bdev_io = ctx;
5024 
5025 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5026 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
5027 }
5028 
5029 static int
5030 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5031 	  uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
5032 	  spdk_bdev_io_completion_cb cb, void *cb_arg)
5033 {
5034 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5035 	struct spdk_bdev_io *bdev_io;
5036 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5037 
5038 	assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
5039 
5040 	/* Check if offset_blocks is valid looking at the validity of one block */
5041 	if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
5042 		return -EINVAL;
5043 	}
5044 
5045 	bdev_io = bdev_channel_get_io(channel);
5046 	if (!bdev_io) {
5047 		return -ENOMEM;
5048 	}
5049 
5050 	bdev_io->internal.ch = channel;
5051 	bdev_io->internal.desc = desc;
5052 	bdev_io->type = io_type;
5053 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5054 	bdev_io->u.bdev.memory_domain = NULL;
5055 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5056 	bdev_io->u.bdev.accel_sequence = NULL;
5057 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5058 
5059 	if (!spdk_bdev_io_type_supported(bdev, io_type)) {
5060 		/* In case bdev doesn't support seek to next data/hole offset,
5061 		 * it is assumed that only data and no holes are present */
5062 		if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
5063 			bdev_io->u.bdev.seek.offset = offset_blocks;
5064 		} else {
5065 			bdev_io->u.bdev.seek.offset = UINT64_MAX;
5066 		}
5067 
5068 		spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
5069 		return 0;
5070 	}
5071 
5072 	bdev_io_submit(bdev_io);
5073 	return 0;
5074 }
5075 
5076 int
5077 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5078 		    uint64_t offset_blocks,
5079 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5080 {
5081 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
5082 }
5083 
5084 int
5085 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5086 		    uint64_t offset_blocks,
5087 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5088 {
5089 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
5090 }
5091 
5092 uint64_t
5093 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
5094 {
5095 	return bdev_io->u.bdev.seek.offset;
5096 }
5097 
5098 static int
5099 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
5100 			 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5101 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5102 {
5103 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5104 	struct spdk_bdev_io *bdev_io;
5105 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5106 
5107 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5108 		return -EINVAL;
5109 	}
5110 
5111 	bdev_io = bdev_channel_get_io(channel);
5112 	if (!bdev_io) {
5113 		return -ENOMEM;
5114 	}
5115 
5116 	bdev_io->internal.ch = channel;
5117 	bdev_io->internal.desc = desc;
5118 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5119 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5120 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5121 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5122 	bdev_io->u.bdev.iovcnt = 1;
5123 	bdev_io->u.bdev.md_buf = md_buf;
5124 	bdev_io->u.bdev.num_blocks = num_blocks;
5125 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5126 	bdev_io->u.bdev.memory_domain = NULL;
5127 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5128 	bdev_io->u.bdev.accel_sequence = NULL;
5129 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5130 
5131 	bdev_io_submit(bdev_io);
5132 	return 0;
5133 }
5134 
5135 int
5136 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5137 	       void *buf, uint64_t offset, uint64_t nbytes,
5138 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
5139 {
5140 	uint64_t offset_blocks, num_blocks;
5141 
5142 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5143 				 nbytes, &num_blocks) != 0) {
5144 		return -EINVAL;
5145 	}
5146 
5147 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5148 }
5149 
5150 int
5151 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5152 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5153 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5154 {
5155 	return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
5156 }
5157 
5158 int
5159 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5160 			      void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5161 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5162 {
5163 	struct iovec iov = {
5164 		.iov_base = buf,
5165 	};
5166 
5167 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5168 		return -EINVAL;
5169 	}
5170 
5171 	if (md_buf && !_is_buf_allocated(&iov)) {
5172 		return -EINVAL;
5173 	}
5174 
5175 	return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5176 					cb, cb_arg);
5177 }
5178 
5179 int
5180 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5181 		struct iovec *iov, int iovcnt,
5182 		uint64_t offset, uint64_t nbytes,
5183 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5184 {
5185 	uint64_t offset_blocks, num_blocks;
5186 
5187 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5188 				 nbytes, &num_blocks) != 0) {
5189 		return -EINVAL;
5190 	}
5191 
5192 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5193 }
5194 
5195 static int
5196 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5197 			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
5198 			  uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5199 			  struct spdk_accel_sequence *seq,
5200 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5201 {
5202 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5203 	struct spdk_bdev_io *bdev_io;
5204 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5205 
5206 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5207 		return -EINVAL;
5208 	}
5209 
5210 	bdev_io = bdev_channel_get_io(channel);
5211 	if (!bdev_io) {
5212 		return -ENOMEM;
5213 	}
5214 
5215 	bdev_io->internal.ch = channel;
5216 	bdev_io->internal.desc = desc;
5217 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5218 	bdev_io->u.bdev.iovs = iov;
5219 	bdev_io->u.bdev.iovcnt = iovcnt;
5220 	bdev_io->u.bdev.md_buf = md_buf;
5221 	bdev_io->u.bdev.num_blocks = num_blocks;
5222 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5223 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5224 	bdev_io->internal.memory_domain = domain;
5225 	bdev_io->internal.memory_domain_ctx = domain_ctx;
5226 	bdev_io->internal.accel_sequence = seq;
5227 	bdev_io->u.bdev.memory_domain = domain;
5228 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5229 	bdev_io->u.bdev.accel_sequence = seq;
5230 
5231 	_bdev_io_submit_ext(desc, bdev_io);
5232 
5233 	return 0;
5234 }
5235 
5236 int
5237 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5238 		       struct iovec *iov, int iovcnt,
5239 		       uint64_t offset_blocks, uint64_t num_blocks,
5240 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5241 {
5242 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5243 					 num_blocks, NULL, NULL, NULL, cb, cb_arg);
5244 }
5245 
5246 int
5247 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5248 			       struct iovec *iov, int iovcnt, void *md_buf,
5249 			       uint64_t offset_blocks, uint64_t num_blocks,
5250 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5251 {
5252 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5253 		return -EINVAL;
5254 	}
5255 
5256 	if (md_buf && !_is_buf_allocated(iov)) {
5257 		return -EINVAL;
5258 	}
5259 
5260 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5261 					 num_blocks, NULL, NULL, NULL, cb, cb_arg);
5262 }
5263 
5264 static inline bool
5265 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5266 {
5267 	/*
5268 	 * We check if opts size is at least of size when we first introduced
5269 	 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members
5270 	 * are not checked internal.
5271 	 */
5272 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5273 	       sizeof(opts->metadata) &&
5274 	       opts->size <= sizeof(*opts) &&
5275 	       /* When memory domain is used, the user must provide data buffers */
5276 	       (!opts->memory_domain || (iov && iov[0].iov_base));
5277 }
5278 
5279 int
5280 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5281 			   struct iovec *iov, int iovcnt,
5282 			   uint64_t offset_blocks, uint64_t num_blocks,
5283 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
5284 			   struct spdk_bdev_ext_io_opts *opts)
5285 {
5286 	void *md = NULL;
5287 
5288 	if (opts) {
5289 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5290 			return -EINVAL;
5291 		}
5292 		md = opts->metadata;
5293 	}
5294 
5295 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5296 		return -EINVAL;
5297 	}
5298 
5299 	if (md && !_is_buf_allocated(iov)) {
5300 		return -EINVAL;
5301 	}
5302 
5303 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5304 					 num_blocks,
5305 					 bdev_get_ext_io_opt(opts, memory_domain, NULL),
5306 					 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL),
5307 					 bdev_get_ext_io_opt(opts, accel_sequence, NULL),
5308 					 cb, cb_arg);
5309 }
5310 
5311 static int
5312 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5313 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5314 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5315 {
5316 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5317 	struct spdk_bdev_io *bdev_io;
5318 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5319 
5320 	if (!desc->write) {
5321 		return -EBADF;
5322 	}
5323 
5324 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5325 		return -EINVAL;
5326 	}
5327 
5328 	bdev_io = bdev_channel_get_io(channel);
5329 	if (!bdev_io) {
5330 		return -ENOMEM;
5331 	}
5332 
5333 	bdev_io->internal.ch = channel;
5334 	bdev_io->internal.desc = desc;
5335 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5336 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5337 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5338 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5339 	bdev_io->u.bdev.iovcnt = 1;
5340 	bdev_io->u.bdev.md_buf = md_buf;
5341 	bdev_io->u.bdev.num_blocks = num_blocks;
5342 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5343 	bdev_io->u.bdev.memory_domain = NULL;
5344 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5345 	bdev_io->u.bdev.accel_sequence = NULL;
5346 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5347 
5348 	bdev_io_submit(bdev_io);
5349 	return 0;
5350 }
5351 
5352 int
5353 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5354 		void *buf, uint64_t offset, uint64_t nbytes,
5355 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5356 {
5357 	uint64_t offset_blocks, num_blocks;
5358 
5359 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5360 				 nbytes, &num_blocks) != 0) {
5361 		return -EINVAL;
5362 	}
5363 
5364 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5365 }
5366 
5367 int
5368 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5369 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5370 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5371 {
5372 	return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5373 					 cb, cb_arg);
5374 }
5375 
5376 int
5377 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5378 			       void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5379 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5380 {
5381 	struct iovec iov = {
5382 		.iov_base = buf,
5383 	};
5384 
5385 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5386 		return -EINVAL;
5387 	}
5388 
5389 	if (md_buf && !_is_buf_allocated(&iov)) {
5390 		return -EINVAL;
5391 	}
5392 
5393 	return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5394 					 cb, cb_arg);
5395 }
5396 
5397 static int
5398 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5399 			   struct iovec *iov, int iovcnt, void *md_buf,
5400 			   uint64_t offset_blocks, uint64_t num_blocks,
5401 			   struct spdk_memory_domain *domain, void *domain_ctx,
5402 			   struct spdk_accel_sequence *seq,
5403 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
5404 {
5405 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5406 	struct spdk_bdev_io *bdev_io;
5407 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5408 
5409 	if (!desc->write) {
5410 		return -EBADF;
5411 	}
5412 
5413 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5414 		return -EINVAL;
5415 	}
5416 
5417 	bdev_io = bdev_channel_get_io(channel);
5418 	if (!bdev_io) {
5419 		return -ENOMEM;
5420 	}
5421 
5422 	bdev_io->internal.ch = channel;
5423 	bdev_io->internal.desc = desc;
5424 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5425 	bdev_io->u.bdev.iovs = iov;
5426 	bdev_io->u.bdev.iovcnt = iovcnt;
5427 	bdev_io->u.bdev.md_buf = md_buf;
5428 	bdev_io->u.bdev.num_blocks = num_blocks;
5429 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5430 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5431 	bdev_io->internal.memory_domain = domain;
5432 	bdev_io->internal.memory_domain_ctx = domain_ctx;
5433 	bdev_io->internal.accel_sequence = seq;
5434 	bdev_io->u.bdev.memory_domain = domain;
5435 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5436 	bdev_io->u.bdev.accel_sequence = seq;
5437 
5438 	_bdev_io_submit_ext(desc, bdev_io);
5439 
5440 	return 0;
5441 }
5442 
5443 int
5444 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5445 		 struct iovec *iov, int iovcnt,
5446 		 uint64_t offset, uint64_t len,
5447 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
5448 {
5449 	uint64_t offset_blocks, num_blocks;
5450 
5451 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5452 				 len, &num_blocks) != 0) {
5453 		return -EINVAL;
5454 	}
5455 
5456 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5457 }
5458 
5459 int
5460 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5461 			struct iovec *iov, int iovcnt,
5462 			uint64_t offset_blocks, uint64_t num_blocks,
5463 			spdk_bdev_io_completion_cb cb, void *cb_arg)
5464 {
5465 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5466 					  num_blocks, NULL, NULL, NULL, cb, cb_arg);
5467 }
5468 
5469 int
5470 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5471 				struct iovec *iov, int iovcnt, void *md_buf,
5472 				uint64_t offset_blocks, uint64_t num_blocks,
5473 				spdk_bdev_io_completion_cb cb, void *cb_arg)
5474 {
5475 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5476 		return -EINVAL;
5477 	}
5478 
5479 	if (md_buf && !_is_buf_allocated(iov)) {
5480 		return -EINVAL;
5481 	}
5482 
5483 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5484 					  num_blocks, NULL, NULL, NULL, cb, cb_arg);
5485 }
5486 
5487 int
5488 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5489 			    struct iovec *iov, int iovcnt,
5490 			    uint64_t offset_blocks, uint64_t num_blocks,
5491 			    spdk_bdev_io_completion_cb cb, void *cb_arg,
5492 			    struct spdk_bdev_ext_io_opts *opts)
5493 {
5494 	void *md = NULL;
5495 
5496 	if (opts) {
5497 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5498 			return -EINVAL;
5499 		}
5500 		md = opts->metadata;
5501 	}
5502 
5503 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5504 		return -EINVAL;
5505 	}
5506 
5507 	if (md && !_is_buf_allocated(iov)) {
5508 		return -EINVAL;
5509 	}
5510 
5511 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks,
5512 					  bdev_get_ext_io_opt(opts, memory_domain, NULL),
5513 					  bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL),
5514 					  bdev_get_ext_io_opt(opts, accel_sequence, NULL),
5515 					  cb, cb_arg);
5516 }
5517 
5518 static void
5519 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5520 {
5521 	struct spdk_bdev_io *parent_io = cb_arg;
5522 	struct spdk_bdev *bdev = parent_io->bdev;
5523 	uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
5524 	int i, rc = 0;
5525 
5526 	if (!success) {
5527 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5528 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5529 		spdk_bdev_free_io(bdev_io);
5530 		return;
5531 	}
5532 
5533 	for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
5534 		rc = memcmp(read_buf,
5535 			    parent_io->u.bdev.iovs[i].iov_base,
5536 			    parent_io->u.bdev.iovs[i].iov_len);
5537 		if (rc) {
5538 			break;
5539 		}
5540 		read_buf += parent_io->u.bdev.iovs[i].iov_len;
5541 	}
5542 
5543 	if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
5544 		rc = memcmp(bdev_io->u.bdev.md_buf,
5545 			    parent_io->u.bdev.md_buf,
5546 			    spdk_bdev_get_md_size(bdev));
5547 	}
5548 
5549 	spdk_bdev_free_io(bdev_io);
5550 
5551 	if (rc == 0) {
5552 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5553 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
5554 	} else {
5555 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
5556 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5557 	}
5558 }
5559 
5560 static void
5561 bdev_compare_do_read(void *_bdev_io)
5562 {
5563 	struct spdk_bdev_io *bdev_io = _bdev_io;
5564 	int rc;
5565 
5566 	rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
5567 				   spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
5568 				   bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5569 				   bdev_compare_do_read_done, bdev_io);
5570 
5571 	if (rc == -ENOMEM) {
5572 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
5573 	} else if (rc != 0) {
5574 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5575 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5576 	}
5577 }
5578 
5579 static int
5580 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5581 			     struct iovec *iov, int iovcnt, void *md_buf,
5582 			     uint64_t offset_blocks, uint64_t num_blocks,
5583 			     spdk_bdev_io_completion_cb cb, void *cb_arg)
5584 {
5585 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5586 	struct spdk_bdev_io *bdev_io;
5587 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5588 
5589 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5590 		return -EINVAL;
5591 	}
5592 
5593 	bdev_io = bdev_channel_get_io(channel);
5594 	if (!bdev_io) {
5595 		return -ENOMEM;
5596 	}
5597 
5598 	bdev_io->internal.ch = channel;
5599 	bdev_io->internal.desc = desc;
5600 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5601 	bdev_io->u.bdev.iovs = iov;
5602 	bdev_io->u.bdev.iovcnt = iovcnt;
5603 	bdev_io->u.bdev.md_buf = md_buf;
5604 	bdev_io->u.bdev.num_blocks = num_blocks;
5605 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5606 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5607 	bdev_io->u.bdev.memory_domain = NULL;
5608 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5609 	bdev_io->u.bdev.accel_sequence = NULL;
5610 
5611 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5612 		bdev_io_submit(bdev_io);
5613 		return 0;
5614 	}
5615 
5616 	bdev_compare_do_read(bdev_io);
5617 
5618 	return 0;
5619 }
5620 
5621 int
5622 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5623 			  struct iovec *iov, int iovcnt,
5624 			  uint64_t offset_blocks, uint64_t num_blocks,
5625 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5626 {
5627 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5628 					    num_blocks, cb, cb_arg);
5629 }
5630 
5631 int
5632 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5633 				  struct iovec *iov, int iovcnt, void *md_buf,
5634 				  uint64_t offset_blocks, uint64_t num_blocks,
5635 				  spdk_bdev_io_completion_cb cb, void *cb_arg)
5636 {
5637 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5638 		return -EINVAL;
5639 	}
5640 
5641 	if (md_buf && !_is_buf_allocated(iov)) {
5642 		return -EINVAL;
5643 	}
5644 
5645 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5646 					    num_blocks, cb, cb_arg);
5647 }
5648 
5649 static int
5650 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5651 			    void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5652 			    spdk_bdev_io_completion_cb cb, void *cb_arg)
5653 {
5654 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5655 	struct spdk_bdev_io *bdev_io;
5656 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5657 
5658 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5659 		return -EINVAL;
5660 	}
5661 
5662 	bdev_io = bdev_channel_get_io(channel);
5663 	if (!bdev_io) {
5664 		return -ENOMEM;
5665 	}
5666 
5667 	bdev_io->internal.ch = channel;
5668 	bdev_io->internal.desc = desc;
5669 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5670 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5671 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5672 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5673 	bdev_io->u.bdev.iovcnt = 1;
5674 	bdev_io->u.bdev.md_buf = md_buf;
5675 	bdev_io->u.bdev.num_blocks = num_blocks;
5676 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5677 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5678 	bdev_io->u.bdev.memory_domain = NULL;
5679 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5680 	bdev_io->u.bdev.accel_sequence = NULL;
5681 
5682 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5683 		bdev_io_submit(bdev_io);
5684 		return 0;
5685 	}
5686 
5687 	bdev_compare_do_read(bdev_io);
5688 
5689 	return 0;
5690 }
5691 
5692 int
5693 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5694 			 void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5695 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5696 {
5697 	return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5698 					   cb, cb_arg);
5699 }
5700 
5701 int
5702 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5703 				 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5704 				 spdk_bdev_io_completion_cb cb, void *cb_arg)
5705 {
5706 	struct iovec iov = {
5707 		.iov_base = buf,
5708 	};
5709 
5710 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5711 		return -EINVAL;
5712 	}
5713 
5714 	if (md_buf && !_is_buf_allocated(&iov)) {
5715 		return -EINVAL;
5716 	}
5717 
5718 	return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5719 					   cb, cb_arg);
5720 }
5721 
5722 static void
5723 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status)
5724 {
5725 	struct spdk_bdev_io *bdev_io = ctx;
5726 
5727 	if (unlock_status) {
5728 		SPDK_ERRLOG("LBA range unlock failed\n");
5729 	}
5730 
5731 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
5732 			     false, bdev_io->internal.caller_ctx);
5733 }
5734 
5735 static void
5736 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
5737 {
5738 	bdev_io->internal.status = status;
5739 
5740 	bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
5741 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5742 			      bdev_comparev_and_writev_blocks_unlocked, bdev_io);
5743 }
5744 
5745 static void
5746 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5747 {
5748 	struct spdk_bdev_io *parent_io = cb_arg;
5749 
5750 	if (!success) {
5751 		SPDK_ERRLOG("Compare and write operation failed\n");
5752 	}
5753 
5754 	spdk_bdev_free_io(bdev_io);
5755 
5756 	bdev_comparev_and_writev_blocks_unlock(parent_io,
5757 					       success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
5758 }
5759 
5760 static void
5761 bdev_compare_and_write_do_write(void *_bdev_io)
5762 {
5763 	struct spdk_bdev_io *bdev_io = _bdev_io;
5764 	int rc;
5765 
5766 	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
5767 				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
5768 				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
5769 				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5770 				     bdev_compare_and_write_do_write_done, bdev_io);
5771 
5772 
5773 	if (rc == -ENOMEM) {
5774 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
5775 	} else if (rc != 0) {
5776 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
5777 	}
5778 }
5779 
5780 static void
5781 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5782 {
5783 	struct spdk_bdev_io *parent_io = cb_arg;
5784 
5785 	spdk_bdev_free_io(bdev_io);
5786 
5787 	if (!success) {
5788 		bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
5789 		return;
5790 	}
5791 
5792 	bdev_compare_and_write_do_write(parent_io);
5793 }
5794 
5795 static void
5796 bdev_compare_and_write_do_compare(void *_bdev_io)
5797 {
5798 	struct spdk_bdev_io *bdev_io = _bdev_io;
5799 	int rc;
5800 
5801 	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
5802 				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
5803 				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5804 				       bdev_compare_and_write_do_compare_done, bdev_io);
5805 
5806 	if (rc == -ENOMEM) {
5807 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
5808 	} else if (rc != 0) {
5809 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
5810 	}
5811 }
5812 
5813 static void
5814 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status)
5815 {
5816 	struct spdk_bdev_io *bdev_io = ctx;
5817 
5818 	if (status) {
5819 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
5820 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5821 		return;
5822 	}
5823 
5824 	bdev_compare_and_write_do_compare(bdev_io);
5825 }
5826 
5827 int
5828 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5829 				     struct iovec *compare_iov, int compare_iovcnt,
5830 				     struct iovec *write_iov, int write_iovcnt,
5831 				     uint64_t offset_blocks, uint64_t num_blocks,
5832 				     spdk_bdev_io_completion_cb cb, void *cb_arg)
5833 {
5834 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5835 	struct spdk_bdev_io *bdev_io;
5836 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5837 
5838 	if (!desc->write) {
5839 		return -EBADF;
5840 	}
5841 
5842 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5843 		return -EINVAL;
5844 	}
5845 
5846 	if (num_blocks > bdev->acwu) {
5847 		return -EINVAL;
5848 	}
5849 
5850 	bdev_io = bdev_channel_get_io(channel);
5851 	if (!bdev_io) {
5852 		return -ENOMEM;
5853 	}
5854 
5855 	bdev_io->internal.ch = channel;
5856 	bdev_io->internal.desc = desc;
5857 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
5858 	bdev_io->u.bdev.iovs = compare_iov;
5859 	bdev_io->u.bdev.iovcnt = compare_iovcnt;
5860 	bdev_io->u.bdev.fused_iovs = write_iov;
5861 	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
5862 	bdev_io->u.bdev.md_buf = NULL;
5863 	bdev_io->u.bdev.num_blocks = num_blocks;
5864 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5865 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5866 	bdev_io->u.bdev.memory_domain = NULL;
5867 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5868 	bdev_io->u.bdev.accel_sequence = NULL;
5869 
5870 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
5871 		bdev_io_submit(bdev_io);
5872 		return 0;
5873 	}
5874 
5875 	return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
5876 				   bdev_comparev_and_writev_blocks_locked, bdev_io);
5877 }
5878 
5879 int
5880 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5881 		      struct iovec *iov, int iovcnt,
5882 		      uint64_t offset_blocks, uint64_t num_blocks,
5883 		      bool populate,
5884 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5885 {
5886 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5887 	struct spdk_bdev_io *bdev_io;
5888 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5889 
5890 	if (!desc->write) {
5891 		return -EBADF;
5892 	}
5893 
5894 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5895 		return -EINVAL;
5896 	}
5897 
5898 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
5899 		return -ENOTSUP;
5900 	}
5901 
5902 	bdev_io = bdev_channel_get_io(channel);
5903 	if (!bdev_io) {
5904 		return -ENOMEM;
5905 	}
5906 
5907 	bdev_io->internal.ch = channel;
5908 	bdev_io->internal.desc = desc;
5909 	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
5910 	bdev_io->u.bdev.num_blocks = num_blocks;
5911 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5912 	bdev_io->u.bdev.iovs = iov;
5913 	bdev_io->u.bdev.iovcnt = iovcnt;
5914 	bdev_io->u.bdev.md_buf = NULL;
5915 	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
5916 	bdev_io->u.bdev.zcopy.commit = 0;
5917 	bdev_io->u.bdev.zcopy.start = 1;
5918 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5919 	bdev_io->u.bdev.memory_domain = NULL;
5920 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5921 	bdev_io->u.bdev.accel_sequence = NULL;
5922 
5923 	bdev_io_submit(bdev_io);
5924 
5925 	return 0;
5926 }
5927 
5928 int
5929 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
5930 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5931 {
5932 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
5933 		return -EINVAL;
5934 	}
5935 
5936 	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
5937 	bdev_io->u.bdev.zcopy.start = 0;
5938 	bdev_io->internal.caller_ctx = cb_arg;
5939 	bdev_io->internal.cb = cb;
5940 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
5941 
5942 	bdev_io_submit(bdev_io);
5943 
5944 	return 0;
5945 }
5946 
5947 int
5948 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5949 		       uint64_t offset, uint64_t len,
5950 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5951 {
5952 	uint64_t offset_blocks, num_blocks;
5953 
5954 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5955 				 len, &num_blocks) != 0) {
5956 		return -EINVAL;
5957 	}
5958 
5959 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5960 }
5961 
5962 int
5963 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5964 			      uint64_t offset_blocks, uint64_t num_blocks,
5965 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5966 {
5967 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5968 	struct spdk_bdev_io *bdev_io;
5969 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5970 
5971 	if (!desc->write) {
5972 		return -EBADF;
5973 	}
5974 
5975 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5976 		return -EINVAL;
5977 	}
5978 
5979 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
5980 	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
5981 		return -ENOTSUP;
5982 	}
5983 
5984 	bdev_io = bdev_channel_get_io(channel);
5985 
5986 	if (!bdev_io) {
5987 		return -ENOMEM;
5988 	}
5989 
5990 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
5991 	bdev_io->internal.ch = channel;
5992 	bdev_io->internal.desc = desc;
5993 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5994 	bdev_io->u.bdev.num_blocks = num_blocks;
5995 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5996 	bdev_io->u.bdev.memory_domain = NULL;
5997 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5998 	bdev_io->u.bdev.accel_sequence = NULL;
5999 
6000 	/* If the write_zeroes size is large and should be split, use the generic split
6001 	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not.
6002 	 *
6003 	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6004 	 * or emulate it using regular write request otherwise.
6005 	 */
6006 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) ||
6007 	    bdev_io->internal.split) {
6008 		bdev_io_submit(bdev_io);
6009 		return 0;
6010 	}
6011 
6012 	assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
6013 
6014 	return bdev_write_zero_buffer(bdev_io);
6015 }
6016 
6017 int
6018 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6019 		uint64_t offset, uint64_t nbytes,
6020 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6021 {
6022 	uint64_t offset_blocks, num_blocks;
6023 
6024 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6025 				 nbytes, &num_blocks) != 0) {
6026 		return -EINVAL;
6027 	}
6028 
6029 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6030 }
6031 
6032 int
6033 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6034 		       uint64_t offset_blocks, uint64_t num_blocks,
6035 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6036 {
6037 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6038 	struct spdk_bdev_io *bdev_io;
6039 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6040 
6041 	if (!desc->write) {
6042 		return -EBADF;
6043 	}
6044 
6045 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6046 		return -EINVAL;
6047 	}
6048 
6049 	if (num_blocks == 0) {
6050 		SPDK_ERRLOG("Can't unmap 0 bytes\n");
6051 		return -EINVAL;
6052 	}
6053 
6054 	bdev_io = bdev_channel_get_io(channel);
6055 	if (!bdev_io) {
6056 		return -ENOMEM;
6057 	}
6058 
6059 	bdev_io->internal.ch = channel;
6060 	bdev_io->internal.desc = desc;
6061 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
6062 
6063 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6064 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
6065 	bdev_io->u.bdev.iovs[0].iov_len = 0;
6066 	bdev_io->u.bdev.iovcnt = 1;
6067 
6068 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6069 	bdev_io->u.bdev.num_blocks = num_blocks;
6070 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6071 	bdev_io->u.bdev.memory_domain = NULL;
6072 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6073 	bdev_io->u.bdev.accel_sequence = NULL;
6074 
6075 	bdev_io_submit(bdev_io);
6076 	return 0;
6077 }
6078 
6079 int
6080 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6081 		uint64_t offset, uint64_t length,
6082 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6083 {
6084 	uint64_t offset_blocks, num_blocks;
6085 
6086 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6087 				 length, &num_blocks) != 0) {
6088 		return -EINVAL;
6089 	}
6090 
6091 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6092 }
6093 
6094 int
6095 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6096 		       uint64_t offset_blocks, uint64_t num_blocks,
6097 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6098 {
6099 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6100 	struct spdk_bdev_io *bdev_io;
6101 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6102 
6103 	if (!desc->write) {
6104 		return -EBADF;
6105 	}
6106 
6107 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6108 		return -EINVAL;
6109 	}
6110 
6111 	bdev_io = bdev_channel_get_io(channel);
6112 	if (!bdev_io) {
6113 		return -ENOMEM;
6114 	}
6115 
6116 	bdev_io->internal.ch = channel;
6117 	bdev_io->internal.desc = desc;
6118 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6119 	bdev_io->u.bdev.iovs = NULL;
6120 	bdev_io->u.bdev.iovcnt = 0;
6121 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6122 	bdev_io->u.bdev.num_blocks = num_blocks;
6123 	bdev_io->u.bdev.memory_domain = NULL;
6124 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6125 	bdev_io->u.bdev.accel_sequence = NULL;
6126 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6127 
6128 	bdev_io_submit(bdev_io);
6129 	return 0;
6130 }
6131 
6132 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6133 
6134 static void
6135 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6136 {
6137 	struct spdk_bdev_channel *ch = _ctx;
6138 	struct spdk_bdev_io *bdev_io;
6139 
6140 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6141 
6142 	if (status == -EBUSY) {
6143 		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6144 			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6145 							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6146 		} else {
6147 			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6148 
6149 			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6150 				/* If outstanding IOs are still present and reset_io_drain_timeout
6151 				 * seconds passed, start the reset. */
6152 				bdev_io_submit_reset(bdev_io);
6153 			} else {
6154 				/* We still have in progress memory domain pull/push or we're
6155 				 * executing accel sequence.  Since we cannot abort either of those
6156 				 * operaions, fail the reset request. */
6157 				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6158 			}
6159 		}
6160 	} else {
6161 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6162 		SPDK_DEBUGLOG(bdev,
6163 			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6164 			      ch->bdev->name);
6165 		/* Mark the completion status as a SUCCESS and complete the reset. */
6166 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6167 	}
6168 }
6169 
6170 static void
6171 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6172 				struct spdk_io_channel *io_ch, void *_ctx)
6173 {
6174 	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6175 	int status = 0;
6176 
6177 	if (cur_ch->io_outstanding > 0 ||
6178 	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6179 	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6180 		/* If a channel has outstanding IO, set status to -EBUSY code. This will stop
6181 		 * further iteration over the rest of the channels and pass non-zero status
6182 		 * to the callback function. */
6183 		status = -EBUSY;
6184 	}
6185 	spdk_bdev_for_each_channel_continue(i, status);
6186 }
6187 
6188 static int
6189 bdev_reset_poll_for_outstanding_io(void *ctx)
6190 {
6191 	struct spdk_bdev_channel *ch = ctx;
6192 	struct spdk_bdev_io *bdev_io;
6193 
6194 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6195 
6196 	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
6197 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6198 				   bdev_reset_check_outstanding_io_done);
6199 
6200 	return SPDK_POLLER_BUSY;
6201 }
6202 
6203 static void
6204 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
6205 {
6206 	struct spdk_bdev_channel *ch = _ctx;
6207 	struct spdk_bdev_io *bdev_io;
6208 
6209 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6210 
6211 	if (bdev->reset_io_drain_timeout == 0) {
6212 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6213 
6214 		bdev_io_submit_reset(bdev_io);
6215 		return;
6216 	}
6217 
6218 	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
6219 			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
6220 
6221 	/* In case bdev->reset_io_drain_timeout is not equal to zero,
6222 	 * submit the reset to the underlying module only if outstanding I/O
6223 	 * remain after reset_io_drain_timeout seconds have passed. */
6224 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6225 				   bdev_reset_check_outstanding_io_done);
6226 }
6227 
6228 static void
6229 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6230 			  struct spdk_io_channel *ch, void *_ctx)
6231 {
6232 	struct spdk_bdev_channel	*channel;
6233 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
6234 	struct spdk_bdev_shared_resource *shared_resource;
6235 	bdev_io_tailq_t			tmp_queued;
6236 
6237 	TAILQ_INIT(&tmp_queued);
6238 
6239 	channel = __io_ch_to_bdev_ch(ch);
6240 	shared_resource = channel->shared_resource;
6241 	mgmt_channel = shared_resource->mgmt_ch;
6242 
6243 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
6244 
6245 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
6246 		/* The QoS object is always valid and readable while
6247 		 * the channel flag is set, so the lock here should not
6248 		 * be necessary. We're not in the fast path though, so
6249 		 * just take it anyway. */
6250 		spdk_spin_lock(&channel->bdev->internal.spinlock);
6251 		if (channel->bdev->internal.qos->ch == channel) {
6252 			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
6253 		}
6254 		spdk_spin_unlock(&channel->bdev->internal.spinlock);
6255 	}
6256 
6257 	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
6258 	bdev_abort_all_buf_io(mgmt_channel, channel);
6259 	bdev_abort_all_queued_io(&tmp_queued, channel);
6260 
6261 	spdk_bdev_for_each_channel_continue(i, 0);
6262 }
6263 
6264 static void
6265 bdev_start_reset(void *ctx)
6266 {
6267 	struct spdk_bdev_channel *ch = ctx;
6268 
6269 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
6270 				   bdev_reset_freeze_channel_done);
6271 }
6272 
6273 static void
6274 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
6275 {
6276 	struct spdk_bdev *bdev = ch->bdev;
6277 
6278 	assert(!TAILQ_EMPTY(&ch->queued_resets));
6279 
6280 	spdk_spin_lock(&bdev->internal.spinlock);
6281 	if (bdev->internal.reset_in_progress == NULL) {
6282 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
6283 		/*
6284 		 * Take a channel reference for the target bdev for the life of this
6285 		 *  reset.  This guards against the channel getting destroyed while
6286 		 *  spdk_bdev_for_each_channel() calls related to this reset IO are in
6287 		 *  progress.  We will release the reference when this reset is
6288 		 *  completed.
6289 		 */
6290 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
6291 		bdev_start_reset(ch);
6292 	}
6293 	spdk_spin_unlock(&bdev->internal.spinlock);
6294 }
6295 
6296 int
6297 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6298 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6299 {
6300 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6301 	struct spdk_bdev_io *bdev_io;
6302 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6303 
6304 	bdev_io = bdev_channel_get_io(channel);
6305 	if (!bdev_io) {
6306 		return -ENOMEM;
6307 	}
6308 
6309 	bdev_io->internal.ch = channel;
6310 	bdev_io->internal.desc = desc;
6311 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6312 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
6313 	bdev_io->u.reset.ch_ref = NULL;
6314 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6315 
6316 	spdk_spin_lock(&bdev->internal.spinlock);
6317 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
6318 	spdk_spin_unlock(&bdev->internal.spinlock);
6319 
6320 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
6321 			  internal.ch_link);
6322 
6323 	bdev_channel_start_reset(channel);
6324 
6325 	return 0;
6326 }
6327 
6328 void
6329 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6330 		      struct spdk_bdev_io_stat *stat)
6331 {
6332 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6333 
6334 	bdev_get_io_stat(stat, channel->stat);
6335 }
6336 
6337 static void
6338 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6339 {
6340 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6341 
6342 	bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
6343 			    bdev_iostat_ctx->cb_arg, 0);
6344 	free(bdev_iostat_ctx);
6345 }
6346 
6347 static void
6348 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6349 			   struct spdk_io_channel *ch, void *_ctx)
6350 {
6351 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6352 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6353 
6354 	spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat);
6355 	spdk_bdev_for_each_channel_continue(i, 0);
6356 }
6357 
6358 void
6359 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
6360 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
6361 {
6362 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
6363 
6364 	assert(bdev != NULL);
6365 	assert(stat != NULL);
6366 	assert(cb != NULL);
6367 
6368 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
6369 	if (bdev_iostat_ctx == NULL) {
6370 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
6371 		cb(bdev, stat, cb_arg, -ENOMEM);
6372 		return;
6373 	}
6374 
6375 	bdev_iostat_ctx->stat = stat;
6376 	bdev_iostat_ctx->cb = cb;
6377 	bdev_iostat_ctx->cb_arg = cb_arg;
6378 
6379 	/* Start with the statistics from previously deleted channels. */
6380 	spdk_spin_lock(&bdev->internal.spinlock);
6381 	bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat);
6382 	spdk_spin_unlock(&bdev->internal.spinlock);
6383 
6384 	/* Then iterate and add the statistics from each existing channel. */
6385 	spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
6386 				   bdev_get_device_stat_done);
6387 }
6388 
6389 struct bdev_iostat_reset_ctx {
6390 	enum spdk_bdev_reset_stat_mode mode;
6391 	bdev_reset_device_stat_cb cb;
6392 	void *cb_arg;
6393 };
6394 
6395 static void
6396 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6397 {
6398 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6399 
6400 	ctx->cb(bdev, ctx->cb_arg, 0);
6401 
6402 	free(ctx);
6403 }
6404 
6405 static void
6406 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6407 			     struct spdk_io_channel *ch, void *_ctx)
6408 {
6409 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6410 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6411 
6412 	spdk_bdev_reset_io_stat(channel->stat, ctx->mode);
6413 
6414 	spdk_bdev_for_each_channel_continue(i, 0);
6415 }
6416 
6417 void
6418 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode,
6419 		       bdev_reset_device_stat_cb cb, void *cb_arg)
6420 {
6421 	struct bdev_iostat_reset_ctx *ctx;
6422 
6423 	assert(bdev != NULL);
6424 	assert(cb != NULL);
6425 
6426 	ctx = calloc(1, sizeof(*ctx));
6427 	if (ctx == NULL) {
6428 		SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
6429 		cb(bdev, cb_arg, -ENOMEM);
6430 		return;
6431 	}
6432 
6433 	ctx->mode = mode;
6434 	ctx->cb = cb;
6435 	ctx->cb_arg = cb_arg;
6436 
6437 	spdk_spin_lock(&bdev->internal.spinlock);
6438 	spdk_bdev_reset_io_stat(bdev->internal.stat, mode);
6439 	spdk_spin_unlock(&bdev->internal.spinlock);
6440 
6441 	spdk_bdev_for_each_channel(bdev,
6442 				   bdev_reset_each_channel_stat,
6443 				   ctx,
6444 				   bdev_reset_device_stat_done);
6445 }
6446 
6447 int
6448 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6449 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6450 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6451 {
6452 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6453 	struct spdk_bdev_io *bdev_io;
6454 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6455 
6456 	if (!desc->write) {
6457 		return -EBADF;
6458 	}
6459 
6460 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
6461 		return -ENOTSUP;
6462 	}
6463 
6464 	bdev_io = bdev_channel_get_io(channel);
6465 	if (!bdev_io) {
6466 		return -ENOMEM;
6467 	}
6468 
6469 	bdev_io->internal.ch = channel;
6470 	bdev_io->internal.desc = desc;
6471 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
6472 	bdev_io->u.nvme_passthru.cmd = *cmd;
6473 	bdev_io->u.nvme_passthru.buf = buf;
6474 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6475 	bdev_io->u.nvme_passthru.md_buf = NULL;
6476 	bdev_io->u.nvme_passthru.md_len = 0;
6477 
6478 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6479 
6480 	bdev_io_submit(bdev_io);
6481 	return 0;
6482 }
6483 
6484 int
6485 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6486 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6487 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
6488 {
6489 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6490 	struct spdk_bdev_io *bdev_io;
6491 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6492 
6493 	if (!desc->write) {
6494 		/*
6495 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6496 		 *  to easily determine if the command is a read or write, but for now just
6497 		 *  do not allow io_passthru with a read-only descriptor.
6498 		 */
6499 		return -EBADF;
6500 	}
6501 
6502 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
6503 		return -ENOTSUP;
6504 	}
6505 
6506 	bdev_io = bdev_channel_get_io(channel);
6507 	if (!bdev_io) {
6508 		return -ENOMEM;
6509 	}
6510 
6511 	bdev_io->internal.ch = channel;
6512 	bdev_io->internal.desc = desc;
6513 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
6514 	bdev_io->u.nvme_passthru.cmd = *cmd;
6515 	bdev_io->u.nvme_passthru.buf = buf;
6516 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6517 	bdev_io->u.nvme_passthru.md_buf = NULL;
6518 	bdev_io->u.nvme_passthru.md_len = 0;
6519 
6520 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6521 
6522 	bdev_io_submit(bdev_io);
6523 	return 0;
6524 }
6525 
6526 int
6527 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6528 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
6529 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6530 {
6531 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6532 	struct spdk_bdev_io *bdev_io;
6533 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6534 
6535 	if (!desc->write) {
6536 		/*
6537 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6538 		 *  to easily determine if the command is a read or write, but for now just
6539 		 *  do not allow io_passthru with a read-only descriptor.
6540 		 */
6541 		return -EBADF;
6542 	}
6543 
6544 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
6545 		return -ENOTSUP;
6546 	}
6547 
6548 	bdev_io = bdev_channel_get_io(channel);
6549 	if (!bdev_io) {
6550 		return -ENOMEM;
6551 	}
6552 
6553 	bdev_io->internal.ch = channel;
6554 	bdev_io->internal.desc = desc;
6555 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
6556 	bdev_io->u.nvme_passthru.cmd = *cmd;
6557 	bdev_io->u.nvme_passthru.buf = buf;
6558 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6559 	bdev_io->u.nvme_passthru.md_buf = md_buf;
6560 	bdev_io->u.nvme_passthru.md_len = md_len;
6561 
6562 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6563 
6564 	bdev_io_submit(bdev_io);
6565 	return 0;
6566 }
6567 
6568 static void bdev_abort_retry(void *ctx);
6569 static void bdev_abort(struct spdk_bdev_io *parent_io);
6570 
6571 static void
6572 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6573 {
6574 	struct spdk_bdev_channel *channel = bdev_io->internal.ch;
6575 	struct spdk_bdev_io *parent_io = cb_arg;
6576 	struct spdk_bdev_io *bio_to_abort, *tmp_io;
6577 
6578 	bio_to_abort = bdev_io->u.abort.bio_to_abort;
6579 
6580 	spdk_bdev_free_io(bdev_io);
6581 
6582 	if (!success) {
6583 		/* Check if the target I/O completed in the meantime. */
6584 		TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
6585 			if (tmp_io == bio_to_abort) {
6586 				break;
6587 			}
6588 		}
6589 
6590 		/* If the target I/O still exists, set the parent to failed. */
6591 		if (tmp_io != NULL) {
6592 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6593 		}
6594 	}
6595 
6596 	parent_io->u.bdev.split_outstanding--;
6597 	if (parent_io->u.bdev.split_outstanding == 0) {
6598 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6599 			bdev_abort_retry(parent_io);
6600 		} else {
6601 			bdev_io_complete(parent_io);
6602 		}
6603 	}
6604 }
6605 
6606 static int
6607 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
6608 	      struct spdk_bdev_io *bio_to_abort,
6609 	      spdk_bdev_io_completion_cb cb, void *cb_arg)
6610 {
6611 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6612 	struct spdk_bdev_io *bdev_io;
6613 
6614 	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
6615 	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
6616 		/* TODO: Abort reset or abort request. */
6617 		return -ENOTSUP;
6618 	}
6619 
6620 	bdev_io = bdev_channel_get_io(channel);
6621 	if (bdev_io == NULL) {
6622 		return -ENOMEM;
6623 	}
6624 
6625 	bdev_io->internal.ch = channel;
6626 	bdev_io->internal.desc = desc;
6627 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6628 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6629 
6630 	if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) {
6631 		assert(bdev_io_should_split(bio_to_abort));
6632 		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
6633 
6634 		/* Parent abort request is not submitted directly, but to manage its
6635 		 * execution add it to the submitted list here.
6636 		 */
6637 		bdev_io->internal.submit_tsc = spdk_get_ticks();
6638 		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
6639 
6640 		bdev_abort(bdev_io);
6641 
6642 		return 0;
6643 	}
6644 
6645 	bdev_io->u.abort.bio_to_abort = bio_to_abort;
6646 
6647 	/* Submit the abort request to the underlying bdev module. */
6648 	bdev_io_submit(bdev_io);
6649 
6650 	return 0;
6651 }
6652 
6653 static bool
6654 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
6655 {
6656 	struct spdk_bdev_io *iter;
6657 
6658 	TAILQ_FOREACH(iter, tailq, internal.link) {
6659 		if (iter == bdev_io) {
6660 			return true;
6661 		}
6662 	}
6663 
6664 	return false;
6665 }
6666 
6667 static uint32_t
6668 _bdev_abort(struct spdk_bdev_io *parent_io)
6669 {
6670 	struct spdk_bdev_desc *desc = parent_io->internal.desc;
6671 	struct spdk_bdev_channel *channel = parent_io->internal.ch;
6672 	void *bio_cb_arg;
6673 	struct spdk_bdev_io *bio_to_abort;
6674 	uint32_t matched_ios;
6675 	int rc;
6676 
6677 	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
6678 
6679 	/* matched_ios is returned and will be kept by the caller.
6680 	 *
6681 	 * This function will be used for two cases, 1) the same cb_arg is used for
6682 	 * multiple I/Os, 2) a single large I/O is split into smaller ones.
6683 	 * Incrementing split_outstanding directly here may confuse readers especially
6684 	 * for the 1st case.
6685 	 *
6686 	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
6687 	 * works as expected.
6688 	 */
6689 	matched_ios = 0;
6690 	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6691 
6692 	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
6693 		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
6694 			continue;
6695 		}
6696 
6697 		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
6698 			/* Any I/O which was submitted after this abort command should be excluded. */
6699 			continue;
6700 		}
6701 
6702 		/* We can't abort a request that's being pushed/pulled or executed by accel */
6703 		if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
6704 		    bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
6705 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6706 			break;
6707 		}
6708 
6709 		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
6710 		if (rc != 0) {
6711 			if (rc == -ENOMEM) {
6712 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
6713 			} else {
6714 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6715 			}
6716 			break;
6717 		}
6718 		matched_ios++;
6719 	}
6720 
6721 	return matched_ios;
6722 }
6723 
6724 static void
6725 bdev_abort_retry(void *ctx)
6726 {
6727 	struct spdk_bdev_io *parent_io = ctx;
6728 	uint32_t matched_ios;
6729 
6730 	matched_ios = _bdev_abort(parent_io);
6731 
6732 	if (matched_ios == 0) {
6733 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6734 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6735 		} else {
6736 			/* For retry, the case that no target I/O was found is success
6737 			 * because it means target I/Os completed in the meantime.
6738 			 */
6739 			bdev_io_complete(parent_io);
6740 		}
6741 		return;
6742 	}
6743 
6744 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6745 	parent_io->u.bdev.split_outstanding = matched_ios;
6746 }
6747 
6748 static void
6749 bdev_abort(struct spdk_bdev_io *parent_io)
6750 {
6751 	uint32_t matched_ios;
6752 
6753 	matched_ios = _bdev_abort(parent_io);
6754 
6755 	if (matched_ios == 0) {
6756 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6757 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6758 		} else {
6759 			/* The case the no target I/O was found is failure. */
6760 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6761 			bdev_io_complete(parent_io);
6762 		}
6763 		return;
6764 	}
6765 
6766 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6767 	parent_io->u.bdev.split_outstanding = matched_ios;
6768 }
6769 
6770 int
6771 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6772 		void *bio_cb_arg,
6773 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6774 {
6775 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6776 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6777 	struct spdk_bdev_io *bdev_io;
6778 
6779 	if (bio_cb_arg == NULL) {
6780 		return -EINVAL;
6781 	}
6782 
6783 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
6784 		return -ENOTSUP;
6785 	}
6786 
6787 	bdev_io = bdev_channel_get_io(channel);
6788 	if (bdev_io == NULL) {
6789 		return -ENOMEM;
6790 	}
6791 
6792 	bdev_io->internal.ch = channel;
6793 	bdev_io->internal.desc = desc;
6794 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6795 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6796 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6797 
6798 	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
6799 
6800 	/* Parent abort request is not submitted directly, but to manage its execution,
6801 	 * add it to the submitted list here.
6802 	 */
6803 	TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
6804 
6805 	bdev_abort(bdev_io);
6806 
6807 	return 0;
6808 }
6809 
6810 int
6811 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6812 			struct spdk_bdev_io_wait_entry *entry)
6813 {
6814 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6815 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
6816 
6817 	if (bdev != entry->bdev) {
6818 		SPDK_ERRLOG("bdevs do not match\n");
6819 		return -EINVAL;
6820 	}
6821 
6822 	if (mgmt_ch->per_thread_cache_count > 0) {
6823 		SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
6824 		return -EINVAL;
6825 	}
6826 
6827 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
6828 	return 0;
6829 }
6830 
6831 static inline void
6832 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
6833 {
6834 	enum spdk_bdev_io_status io_status = bdev_io->internal.status;
6835 	struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
6836 	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
6837 	uint32_t blocklen = bdev_io->bdev->blocklen;
6838 
6839 	if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
6840 		switch (bdev_io->type) {
6841 		case SPDK_BDEV_IO_TYPE_READ:
6842 			io_stat->bytes_read += num_blocks * blocklen;
6843 			io_stat->num_read_ops++;
6844 			io_stat->read_latency_ticks += tsc_diff;
6845 			if (io_stat->max_read_latency_ticks < tsc_diff) {
6846 				io_stat->max_read_latency_ticks = tsc_diff;
6847 			}
6848 			if (io_stat->min_read_latency_ticks > tsc_diff) {
6849 				io_stat->min_read_latency_ticks = tsc_diff;
6850 			}
6851 			break;
6852 		case SPDK_BDEV_IO_TYPE_WRITE:
6853 			io_stat->bytes_written += num_blocks * blocklen;
6854 			io_stat->num_write_ops++;
6855 			io_stat->write_latency_ticks += tsc_diff;
6856 			if (io_stat->max_write_latency_ticks < tsc_diff) {
6857 				io_stat->max_write_latency_ticks = tsc_diff;
6858 			}
6859 			if (io_stat->min_write_latency_ticks > tsc_diff) {
6860 				io_stat->min_write_latency_ticks = tsc_diff;
6861 			}
6862 			break;
6863 		case SPDK_BDEV_IO_TYPE_UNMAP:
6864 			io_stat->bytes_unmapped += num_blocks * blocklen;
6865 			io_stat->num_unmap_ops++;
6866 			io_stat->unmap_latency_ticks += tsc_diff;
6867 			if (io_stat->max_unmap_latency_ticks < tsc_diff) {
6868 				io_stat->max_unmap_latency_ticks = tsc_diff;
6869 			}
6870 			if (io_stat->min_unmap_latency_ticks > tsc_diff) {
6871 				io_stat->min_unmap_latency_ticks = tsc_diff;
6872 			}
6873 			break;
6874 		case SPDK_BDEV_IO_TYPE_ZCOPY:
6875 			/* Track the data in the start phase only */
6876 			if (bdev_io->u.bdev.zcopy.start) {
6877 				if (bdev_io->u.bdev.zcopy.populate) {
6878 					io_stat->bytes_read += num_blocks * blocklen;
6879 					io_stat->num_read_ops++;
6880 					io_stat->read_latency_ticks += tsc_diff;
6881 					if (io_stat->max_read_latency_ticks < tsc_diff) {
6882 						io_stat->max_read_latency_ticks = tsc_diff;
6883 					}
6884 					if (io_stat->min_read_latency_ticks > tsc_diff) {
6885 						io_stat->min_read_latency_ticks = tsc_diff;
6886 					}
6887 				} else {
6888 					io_stat->bytes_written += num_blocks * blocklen;
6889 					io_stat->num_write_ops++;
6890 					io_stat->write_latency_ticks += tsc_diff;
6891 					if (io_stat->max_write_latency_ticks < tsc_diff) {
6892 						io_stat->max_write_latency_ticks = tsc_diff;
6893 					}
6894 					if (io_stat->min_write_latency_ticks > tsc_diff) {
6895 						io_stat->min_write_latency_ticks = tsc_diff;
6896 					}
6897 				}
6898 			}
6899 			break;
6900 		case SPDK_BDEV_IO_TYPE_COPY:
6901 			io_stat->bytes_copied += num_blocks * blocklen;
6902 			io_stat->num_copy_ops++;
6903 			bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff;
6904 			if (io_stat->max_copy_latency_ticks < tsc_diff) {
6905 				io_stat->max_copy_latency_ticks = tsc_diff;
6906 			}
6907 			if (io_stat->min_copy_latency_ticks > tsc_diff) {
6908 				io_stat->min_copy_latency_ticks = tsc_diff;
6909 			}
6910 			break;
6911 		default:
6912 			break;
6913 		}
6914 	} else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) {
6915 		io_stat = bdev_io->bdev->internal.stat;
6916 		assert(io_stat->io_error != NULL);
6917 
6918 		spdk_spin_lock(&bdev_io->bdev->internal.spinlock);
6919 		io_stat->io_error->error_status[-io_status - 1]++;
6920 		spdk_spin_unlock(&bdev_io->bdev->internal.spinlock);
6921 	}
6922 
6923 #ifdef SPDK_CONFIG_VTUNE
6924 	uint64_t now_tsc = spdk_get_ticks();
6925 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
6926 		uint64_t data[5];
6927 		struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
6928 
6929 		data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
6930 		data[1] = io_stat->bytes_read - prev_stat->bytes_read;
6931 		data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
6932 		data[3] = io_stat->bytes_written - prev_stat->bytes_written;
6933 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
6934 			  bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
6935 
6936 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
6937 				   __itt_metadata_u64, 5, data);
6938 
6939 		memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
6940 		bdev_io->internal.ch->start_tsc = now_tsc;
6941 	}
6942 #endif
6943 }
6944 
6945 static inline void
6946 _bdev_io_complete(void *ctx)
6947 {
6948 	struct spdk_bdev_io *bdev_io = ctx;
6949 
6950 	if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) {
6951 		assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
6952 		spdk_accel_sequence_abort(bdev_io->internal.accel_sequence);
6953 	}
6954 
6955 	assert(bdev_io->internal.cb != NULL);
6956 	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
6957 
6958 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
6959 			     bdev_io->internal.caller_ctx);
6960 }
6961 
6962 static inline void
6963 bdev_io_complete(void *ctx)
6964 {
6965 	struct spdk_bdev_io *bdev_io = ctx;
6966 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
6967 	uint64_t tsc, tsc_diff;
6968 
6969 	if (spdk_unlikely(bdev_io->internal.in_submit_request)) {
6970 		/*
6971 		 * Defer completion to avoid potential infinite recursion if the
6972 		 * user's completion callback issues a new I/O.
6973 		 */
6974 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
6975 				     bdev_io_complete, bdev_io);
6976 		return;
6977 	}
6978 
6979 	tsc = spdk_get_ticks();
6980 	tsc_diff = tsc - bdev_io->internal.submit_tsc;
6981 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io,
6982 			      bdev_io->internal.caller_ctx);
6983 
6984 	TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link);
6985 
6986 	if (bdev_io->internal.ch->histogram) {
6987 		spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff);
6988 	}
6989 
6990 	bdev_io_update_io_stat(bdev_io, tsc_diff);
6991 	_bdev_io_complete(bdev_io);
6992 }
6993 
6994 /* The difference between this function and bdev_io_complete() is that this should be called to
6995  * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the
6996  * io_submitted list and don't have submit_tsc updated.
6997  */
6998 static inline void
6999 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)
7000 {
7001 	/* Since the IO hasn't been submitted it's bound to be failed */
7002 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
7003 
7004 	/* At this point we don't know if the IO is completed from submission context or not, but,
7005 	 * since this is an error path, we can always do an spdk_thread_send_msg(). */
7006 	spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7007 			     _bdev_io_complete, bdev_io);
7008 }
7009 
7010 static void bdev_destroy_cb(void *io_device);
7011 
7012 static void
7013 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
7014 {
7015 	struct spdk_bdev_io *bdev_io = _ctx;
7016 
7017 	if (bdev_io->u.reset.ch_ref != NULL) {
7018 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
7019 		bdev_io->u.reset.ch_ref = NULL;
7020 	}
7021 
7022 	bdev_io_complete(bdev_io);
7023 
7024 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
7025 	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
7026 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7027 	}
7028 }
7029 
7030 static void
7031 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7032 		      struct spdk_io_channel *_ch, void *_ctx)
7033 {
7034 	struct spdk_bdev_io *bdev_io = _ctx;
7035 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7036 	struct spdk_bdev_io *queued_reset;
7037 
7038 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
7039 	while (!TAILQ_EMPTY(&ch->queued_resets)) {
7040 		queued_reset = TAILQ_FIRST(&ch->queued_resets);
7041 		TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
7042 		spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
7043 	}
7044 
7045 	spdk_bdev_for_each_channel_continue(i, 0);
7046 }
7047 
7048 static void
7049 bdev_io_complete_sequence_cb(void *ctx, int status)
7050 {
7051 	struct spdk_bdev_io *bdev_io = ctx;
7052 
7053 	/* u.bdev.accel_sequence should have already been cleared at this point */
7054 	assert(bdev_io->u.bdev.accel_sequence == NULL);
7055 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
7056 	bdev_io->internal.accel_sequence = NULL;
7057 
7058 	if (spdk_unlikely(status != 0)) {
7059 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
7060 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7061 	}
7062 
7063 	bdev_io_complete(bdev_io);
7064 }
7065 
7066 void
7067 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
7068 {
7069 	struct spdk_bdev *bdev = bdev_io->bdev;
7070 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7071 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
7072 
7073 	if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) {
7074 		SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n",
7075 			    spdk_bdev_get_module_name(bdev),
7076 			    bdev_io_status_get_string(bdev_io->internal.status));
7077 		assert(false);
7078 	}
7079 	bdev_io->internal.status = status;
7080 
7081 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
7082 		bool unlock_channels = false;
7083 
7084 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
7085 			SPDK_ERRLOG("NOMEM returned for reset\n");
7086 		}
7087 		spdk_spin_lock(&bdev->internal.spinlock);
7088 		if (bdev_io == bdev->internal.reset_in_progress) {
7089 			bdev->internal.reset_in_progress = NULL;
7090 			unlock_channels = true;
7091 		}
7092 		spdk_spin_unlock(&bdev->internal.spinlock);
7093 
7094 		if (unlock_channels) {
7095 			spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
7096 						   bdev_reset_complete);
7097 			return;
7098 		}
7099 	} else {
7100 		bdev_io_decrement_outstanding(bdev_ch, shared_resource);
7101 		if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7102 			if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
7103 				bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
7104 				return;
7105 			} else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) {
7106 				_bdev_io_push_bounce_data_buffer(bdev_io,
7107 								 _bdev_io_complete_push_bounce_done);
7108 				/* bdev IO will be completed in the callback */
7109 				return;
7110 			}
7111 		}
7112 
7113 		if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) {
7114 			return;
7115 		}
7116 	}
7117 
7118 	bdev_io_complete(bdev_io);
7119 }
7120 
7121 void
7122 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
7123 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
7124 {
7125 	enum spdk_bdev_io_status status;
7126 
7127 	if (sc == SPDK_SCSI_STATUS_GOOD) {
7128 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7129 	} else {
7130 		status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
7131 		bdev_io->internal.error.scsi.sc = sc;
7132 		bdev_io->internal.error.scsi.sk = sk;
7133 		bdev_io->internal.error.scsi.asc = asc;
7134 		bdev_io->internal.error.scsi.ascq = ascq;
7135 	}
7136 
7137 	spdk_bdev_io_complete(bdev_io, status);
7138 }
7139 
7140 void
7141 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
7142 			     int *sc, int *sk, int *asc, int *ascq)
7143 {
7144 	assert(sc != NULL);
7145 	assert(sk != NULL);
7146 	assert(asc != NULL);
7147 	assert(ascq != NULL);
7148 
7149 	switch (bdev_io->internal.status) {
7150 	case SPDK_BDEV_IO_STATUS_SUCCESS:
7151 		*sc = SPDK_SCSI_STATUS_GOOD;
7152 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
7153 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7154 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7155 		break;
7156 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7157 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
7158 		break;
7159 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7160 		*sc = bdev_io->internal.error.scsi.sc;
7161 		*sk = bdev_io->internal.error.scsi.sk;
7162 		*asc = bdev_io->internal.error.scsi.asc;
7163 		*ascq = bdev_io->internal.error.scsi.ascq;
7164 		break;
7165 	default:
7166 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7167 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
7168 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7169 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7170 		break;
7171 	}
7172 }
7173 
7174 void
7175 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
7176 {
7177 	enum spdk_bdev_io_status status;
7178 
7179 	if (aio_result == 0) {
7180 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7181 	} else {
7182 		status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
7183 	}
7184 
7185 	bdev_io->internal.error.aio_result = aio_result;
7186 
7187 	spdk_bdev_io_complete(bdev_io, status);
7188 }
7189 
7190 void
7191 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
7192 {
7193 	assert(aio_result != NULL);
7194 
7195 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
7196 		*aio_result = bdev_io->internal.error.aio_result;
7197 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7198 		*aio_result = 0;
7199 	} else {
7200 		*aio_result = -EIO;
7201 	}
7202 }
7203 
7204 void
7205 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
7206 {
7207 	enum spdk_bdev_io_status status;
7208 
7209 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
7210 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7211 	} else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
7212 		status = SPDK_BDEV_IO_STATUS_ABORTED;
7213 	} else {
7214 		status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
7215 	}
7216 
7217 	bdev_io->internal.error.nvme.cdw0 = cdw0;
7218 	bdev_io->internal.error.nvme.sct = sct;
7219 	bdev_io->internal.error.nvme.sc = sc;
7220 
7221 	spdk_bdev_io_complete(bdev_io, status);
7222 }
7223 
7224 void
7225 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
7226 {
7227 	assert(sct != NULL);
7228 	assert(sc != NULL);
7229 	assert(cdw0 != NULL);
7230 
7231 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
7232 		*sct = SPDK_NVME_SCT_GENERIC;
7233 		*sc = SPDK_NVME_SC_SUCCESS;
7234 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7235 			*cdw0 = 0;
7236 		} else {
7237 			*cdw0 = 1U;
7238 		}
7239 		return;
7240 	}
7241 
7242 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7243 		*sct = bdev_io->internal.error.nvme.sct;
7244 		*sc = bdev_io->internal.error.nvme.sc;
7245 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7246 		*sct = SPDK_NVME_SCT_GENERIC;
7247 		*sc = SPDK_NVME_SC_SUCCESS;
7248 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7249 		*sct = SPDK_NVME_SCT_GENERIC;
7250 		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7251 	} else {
7252 		*sct = SPDK_NVME_SCT_GENERIC;
7253 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7254 	}
7255 
7256 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7257 }
7258 
7259 void
7260 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
7261 				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
7262 {
7263 	assert(first_sct != NULL);
7264 	assert(first_sc != NULL);
7265 	assert(second_sct != NULL);
7266 	assert(second_sc != NULL);
7267 	assert(cdw0 != NULL);
7268 
7269 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7270 		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
7271 		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
7272 			*first_sct = bdev_io->internal.error.nvme.sct;
7273 			*first_sc = bdev_io->internal.error.nvme.sc;
7274 			*second_sct = SPDK_NVME_SCT_GENERIC;
7275 			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7276 		} else {
7277 			*first_sct = SPDK_NVME_SCT_GENERIC;
7278 			*first_sc = SPDK_NVME_SC_SUCCESS;
7279 			*second_sct = bdev_io->internal.error.nvme.sct;
7280 			*second_sc = bdev_io->internal.error.nvme.sc;
7281 		}
7282 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7283 		*first_sct = SPDK_NVME_SCT_GENERIC;
7284 		*first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7285 		*second_sct = SPDK_NVME_SCT_GENERIC;
7286 		*second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7287 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7288 		*first_sct = SPDK_NVME_SCT_GENERIC;
7289 		*first_sc = SPDK_NVME_SC_SUCCESS;
7290 		*second_sct = SPDK_NVME_SCT_GENERIC;
7291 		*second_sc = SPDK_NVME_SC_SUCCESS;
7292 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
7293 		*first_sct = SPDK_NVME_SCT_GENERIC;
7294 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7295 		*second_sct = SPDK_NVME_SCT_GENERIC;
7296 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7297 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
7298 		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
7299 		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
7300 		*second_sct = SPDK_NVME_SCT_GENERIC;
7301 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7302 	} else {
7303 		*first_sct = SPDK_NVME_SCT_GENERIC;
7304 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7305 		*second_sct = SPDK_NVME_SCT_GENERIC;
7306 		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7307 	}
7308 
7309 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7310 }
7311 
7312 struct spdk_thread *
7313 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
7314 {
7315 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
7316 }
7317 
7318 struct spdk_io_channel *
7319 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
7320 {
7321 	return bdev_io->internal.ch->channel;
7322 }
7323 
7324 static int
7325 bdev_register(struct spdk_bdev *bdev)
7326 {
7327 	char *bdev_name;
7328 	char uuid[SPDK_UUID_STRING_LEN];
7329 	struct spdk_iobuf_opts iobuf_opts;
7330 	int ret, i;
7331 
7332 	assert(bdev->module != NULL);
7333 
7334 	if (!bdev->name) {
7335 		SPDK_ERRLOG("Bdev name is NULL\n");
7336 		return -EINVAL;
7337 	}
7338 
7339 	if (!strlen(bdev->name)) {
7340 		SPDK_ERRLOG("Bdev name must not be an empty string\n");
7341 		return -EINVAL;
7342 	}
7343 
7344 	for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
7345 		if (bdev->fn_table->accel_sequence_supported == NULL) {
7346 			continue;
7347 		}
7348 		if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt,
7349 				(enum spdk_bdev_io_type)i)) {
7350 			continue;
7351 		}
7352 
7353 		if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) {
7354 			SPDK_ERRLOG("bdev supporting accel sequence is required to support "
7355 				    "memory domains\n");
7356 			return -EINVAL;
7357 		}
7358 
7359 		if (spdk_bdev_is_md_separate(bdev)) {
7360 			SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with "
7361 				    "accel sequence support\n");
7362 			return -EINVAL;
7363 		}
7364 	}
7365 
7366 	/* Users often register their own I/O devices using the bdev name. In
7367 	 * order to avoid conflicts, prepend bdev_. */
7368 	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
7369 	if (!bdev_name) {
7370 		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
7371 		return -ENOMEM;
7372 	}
7373 
7374 	bdev->internal.stat = bdev_alloc_io_stat(true);
7375 	if (!bdev->internal.stat) {
7376 		SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
7377 		free(bdev_name);
7378 		return -ENOMEM;
7379 	}
7380 
7381 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
7382 	bdev->internal.measured_queue_depth = UINT64_MAX;
7383 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
7384 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
7385 	bdev->internal.qd_poller = NULL;
7386 	bdev->internal.qos = NULL;
7387 
7388 	TAILQ_INIT(&bdev->internal.open_descs);
7389 	TAILQ_INIT(&bdev->internal.locked_ranges);
7390 	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
7391 	TAILQ_INIT(&bdev->aliases);
7392 
7393 	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
7394 	if (ret != 0) {
7395 		bdev_free_io_stat(bdev->internal.stat);
7396 		free(bdev_name);
7397 		return ret;
7398 	}
7399 
7400 	/* UUID may be specified by the user or defined by bdev itself.
7401 	 * Otherwise it will be generated here, so this field will never be empty. */
7402 	if (spdk_uuid_is_null(&bdev->uuid)) {
7403 		spdk_uuid_generate(&bdev->uuid);
7404 	}
7405 
7406 	/* Add the UUID alias only if it's different than the name */
7407 	spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7408 	if (strcmp(bdev->name, uuid) != 0) {
7409 		ret = spdk_bdev_alias_add(bdev, uuid);
7410 		if (ret != 0) {
7411 			SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
7412 			bdev_name_del(&bdev->internal.bdev_name);
7413 			bdev_free_io_stat(bdev->internal.stat);
7414 			free(bdev_name);
7415 			return ret;
7416 		}
7417 	}
7418 
7419 	if (spdk_bdev_get_buf_align(bdev) > 1) {
7420 		if (bdev->split_on_optimal_io_boundary) {
7421 			bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary,
7422 							     SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen);
7423 		} else {
7424 			bdev->split_on_optimal_io_boundary = true;
7425 			bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen;
7426 		}
7427 	}
7428 
7429 	/* If the user didn't specify a write unit size, set it to one. */
7430 	if (bdev->write_unit_size == 0) {
7431 		bdev->write_unit_size = 1;
7432 	}
7433 
7434 	/* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */
7435 	if (bdev->acwu == 0) {
7436 		bdev->acwu = bdev->write_unit_size;
7437 	}
7438 
7439 	if (bdev->phys_blocklen == 0) {
7440 		bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
7441 	}
7442 
7443 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
7444 		spdk_iobuf_get_opts(&iobuf_opts);
7445 		bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
7446 	}
7447 
7448 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
7449 		bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE);
7450 	}
7451 
7452 	bdev->internal.reset_in_progress = NULL;
7453 	bdev->internal.qd_poll_in_progress = false;
7454 	bdev->internal.period = 0;
7455 	bdev->internal.new_period = 0;
7456 
7457 	spdk_io_device_register(__bdev_to_io_dev(bdev),
7458 				bdev_channel_create, bdev_channel_destroy,
7459 				sizeof(struct spdk_bdev_channel),
7460 				bdev_name);
7461 
7462 	free(bdev_name);
7463 
7464 	spdk_spin_init(&bdev->internal.spinlock);
7465 
7466 	SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
7467 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
7468 
7469 	return 0;
7470 }
7471 
7472 static void
7473 bdev_destroy_cb(void *io_device)
7474 {
7475 	int			rc;
7476 	struct spdk_bdev	*bdev;
7477 	spdk_bdev_unregister_cb	cb_fn;
7478 	void			*cb_arg;
7479 
7480 	bdev = __bdev_from_io_dev(io_device);
7481 
7482 	if (bdev->internal.unregister_td != spdk_get_thread()) {
7483 		spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device);
7484 		return;
7485 	}
7486 
7487 	cb_fn = bdev->internal.unregister_cb;
7488 	cb_arg = bdev->internal.unregister_ctx;
7489 
7490 	spdk_spin_destroy(&bdev->internal.spinlock);
7491 	free(bdev->internal.qos);
7492 	bdev_free_io_stat(bdev->internal.stat);
7493 
7494 	rc = bdev->fn_table->destruct(bdev->ctxt);
7495 	if (rc < 0) {
7496 		SPDK_ERRLOG("destruct failed\n");
7497 	}
7498 	if (rc <= 0 && cb_fn != NULL) {
7499 		cb_fn(cb_arg, rc);
7500 	}
7501 }
7502 
7503 void
7504 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
7505 {
7506 	if (bdev->internal.unregister_cb != NULL) {
7507 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
7508 	}
7509 }
7510 
7511 static void
7512 _remove_notify(void *arg)
7513 {
7514 	struct spdk_bdev_desc *desc = arg;
7515 
7516 	_event_notify(desc, SPDK_BDEV_EVENT_REMOVE);
7517 }
7518 
7519 /* returns: 0 - bdev removed and ready to be destructed.
7520  *          -EBUSY - bdev can't be destructed yet.  */
7521 static int
7522 bdev_unregister_unsafe(struct spdk_bdev *bdev)
7523 {
7524 	struct spdk_bdev_desc	*desc, *tmp;
7525 	int			rc = 0;
7526 	char			uuid[SPDK_UUID_STRING_LEN];
7527 
7528 	assert(spdk_spin_held(&g_bdev_mgr.spinlock));
7529 	assert(spdk_spin_held(&bdev->internal.spinlock));
7530 
7531 	/* Notify each descriptor about hotremoval */
7532 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
7533 		rc = -EBUSY;
7534 		/*
7535 		 * Defer invocation of the event_cb to a separate message that will
7536 		 *  run later on its thread.  This ensures this context unwinds and
7537 		 *  we don't recursively unregister this bdev again if the event_cb
7538 		 *  immediately closes its descriptor.
7539 		 */
7540 		event_notify(desc, _remove_notify);
7541 	}
7542 
7543 	/* If there are no descriptors, proceed removing the bdev */
7544 	if (rc == 0) {
7545 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
7546 		SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
7547 
7548 		/* Delete the name and the UUID alias */
7549 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7550 		bdev_name_del_unsafe(&bdev->internal.bdev_name);
7551 		bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
7552 
7553 		spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
7554 
7555 		if (bdev->internal.reset_in_progress != NULL) {
7556 			/* If reset is in progress, let the completion callback for reset
7557 			 * unregister the bdev.
7558 			 */
7559 			rc = -EBUSY;
7560 		}
7561 	}
7562 
7563 	return rc;
7564 }
7565 
7566 static void
7567 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7568 			      struct spdk_io_channel *io_ch, void *_ctx)
7569 {
7570 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
7571 
7572 	bdev_channel_abort_queued_ios(bdev_ch);
7573 	spdk_bdev_for_each_channel_continue(i, 0);
7574 }
7575 
7576 static void
7577 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
7578 {
7579 	int rc;
7580 
7581 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7582 	spdk_spin_lock(&bdev->internal.spinlock);
7583 	/*
7584 	 * Set the status to REMOVING after completing to abort channels. Otherwise,
7585 	 * the last spdk_bdev_close() may call spdk_io_device_unregister() while
7586 	 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister()
7587 	 * may fail.
7588 	 */
7589 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
7590 	rc = bdev_unregister_unsafe(bdev);
7591 	spdk_spin_unlock(&bdev->internal.spinlock);
7592 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7593 
7594 	if (rc == 0) {
7595 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7596 	}
7597 }
7598 
7599 void
7600 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
7601 {
7602 	struct spdk_thread	*thread;
7603 
7604 	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
7605 
7606 	thread = spdk_get_thread();
7607 	if (!thread) {
7608 		/* The user called this from a non-SPDK thread. */
7609 		if (cb_fn != NULL) {
7610 			cb_fn(cb_arg, -ENOTSUP);
7611 		}
7612 		return;
7613 	}
7614 
7615 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7616 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
7617 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
7618 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7619 		if (cb_fn) {
7620 			cb_fn(cb_arg, -EBUSY);
7621 		}
7622 		return;
7623 	}
7624 
7625 	spdk_spin_lock(&bdev->internal.spinlock);
7626 	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
7627 	bdev->internal.unregister_cb = cb_fn;
7628 	bdev->internal.unregister_ctx = cb_arg;
7629 	bdev->internal.unregister_td = thread;
7630 	spdk_spin_unlock(&bdev->internal.spinlock);
7631 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7632 
7633 	spdk_bdev_set_qd_sampling_period(bdev, 0);
7634 
7635 	spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
7636 				   bdev_unregister);
7637 }
7638 
7639 int
7640 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
7641 			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
7642 {
7643 	struct spdk_bdev_desc *desc;
7644 	struct spdk_bdev *bdev;
7645 	int rc;
7646 
7647 	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
7648 	if (rc != 0) {
7649 		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
7650 		return rc;
7651 	}
7652 
7653 	bdev = spdk_bdev_desc_get_bdev(desc);
7654 
7655 	if (bdev->module != module) {
7656 		spdk_bdev_close(desc);
7657 		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
7658 			    bdev_name);
7659 		return -ENODEV;
7660 	}
7661 
7662 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
7663 
7664 	spdk_bdev_close(desc);
7665 
7666 	return 0;
7667 }
7668 
7669 static int
7670 bdev_start_qos(struct spdk_bdev *bdev)
7671 {
7672 	struct set_qos_limit_ctx *ctx;
7673 
7674 	/* Enable QoS */
7675 	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
7676 		ctx = calloc(1, sizeof(*ctx));
7677 		if (ctx == NULL) {
7678 			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
7679 			return -ENOMEM;
7680 		}
7681 		ctx->bdev = bdev;
7682 		spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
7683 	}
7684 
7685 	return 0;
7686 }
7687 
7688 static void
7689 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail,
7690 		    struct spdk_bdev *bdev)
7691 {
7692 	enum spdk_bdev_claim_type type;
7693 	const char *typename, *modname;
7694 	extern struct spdk_log_flag SPDK_LOG_bdev;
7695 
7696 	assert(spdk_spin_held(&bdev->internal.spinlock));
7697 
7698 	if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) {
7699 		return;
7700 	}
7701 
7702 	type = bdev->internal.claim_type;
7703 	typename = spdk_bdev_claim_get_name(type);
7704 
7705 	if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) {
7706 		modname = bdev->internal.claim.v1.module->name;
7707 		spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
7708 			 bdev->name, detail, typename, modname);
7709 		return;
7710 	}
7711 
7712 	if (claim_type_is_v2(type)) {
7713 		struct spdk_bdev_module_claim *claim;
7714 
7715 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
7716 			modname = claim->module->name;
7717 			spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
7718 				 bdev->name, detail, typename, modname);
7719 		}
7720 		return;
7721 	}
7722 
7723 	assert(false);
7724 }
7725 
7726 static int
7727 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
7728 {
7729 	struct spdk_thread *thread;
7730 	int rc = 0;
7731 
7732 	thread = spdk_get_thread();
7733 	if (!thread) {
7734 		SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
7735 		return -ENOTSUP;
7736 	}
7737 
7738 	SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
7739 		      spdk_get_thread());
7740 
7741 	desc->bdev = bdev;
7742 	desc->thread = thread;
7743 	desc->write = write;
7744 
7745 	spdk_spin_lock(&bdev->internal.spinlock);
7746 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
7747 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
7748 		spdk_spin_unlock(&bdev->internal.spinlock);
7749 		return -ENODEV;
7750 	}
7751 
7752 	if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
7753 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
7754 		spdk_spin_unlock(&bdev->internal.spinlock);
7755 		return -EPERM;
7756 	}
7757 
7758 	rc = bdev_start_qos(bdev);
7759 	if (rc != 0) {
7760 		SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
7761 		spdk_spin_unlock(&bdev->internal.spinlock);
7762 		return rc;
7763 	}
7764 
7765 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
7766 
7767 	spdk_spin_unlock(&bdev->internal.spinlock);
7768 
7769 	return 0;
7770 }
7771 
7772 static int
7773 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
7774 		struct spdk_bdev_desc **_desc)
7775 {
7776 	struct spdk_bdev_desc *desc;
7777 	unsigned int i;
7778 
7779 	desc = calloc(1, sizeof(*desc));
7780 	if (desc == NULL) {
7781 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
7782 		return -ENOMEM;
7783 	}
7784 
7785 	TAILQ_INIT(&desc->pending_media_events);
7786 	TAILQ_INIT(&desc->free_media_events);
7787 
7788 	desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
7789 	desc->callback.event_fn = event_cb;
7790 	desc->callback.ctx = event_ctx;
7791 	spdk_spin_init(&desc->spinlock);
7792 
7793 	if (bdev->media_events) {
7794 		desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
7795 						   sizeof(*desc->media_events_buffer));
7796 		if (desc->media_events_buffer == NULL) {
7797 			SPDK_ERRLOG("Failed to initialize media event pool\n");
7798 			bdev_desc_free(desc);
7799 			return -ENOMEM;
7800 		}
7801 
7802 		for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) {
7803 			TAILQ_INSERT_TAIL(&desc->free_media_events,
7804 					  &desc->media_events_buffer[i], tailq);
7805 		}
7806 	}
7807 
7808 	if (bdev->fn_table->accel_sequence_supported != NULL) {
7809 		for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
7810 			desc->accel_sequence_supported[i] =
7811 				bdev->fn_table->accel_sequence_supported(bdev->ctxt,
7812 						(enum spdk_bdev_io_type)i);
7813 		}
7814 	}
7815 
7816 	*_desc = desc;
7817 
7818 	return 0;
7819 }
7820 
7821 int
7822 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
7823 		   void *event_ctx, struct spdk_bdev_desc **_desc)
7824 {
7825 	struct spdk_bdev_desc *desc;
7826 	struct spdk_bdev *bdev;
7827 	int rc;
7828 
7829 	if (event_cb == NULL) {
7830 		SPDK_ERRLOG("Missing event callback function\n");
7831 		return -EINVAL;
7832 	}
7833 
7834 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7835 
7836 	bdev = bdev_get_by_name(bdev_name);
7837 
7838 	if (bdev == NULL) {
7839 		SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
7840 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7841 		return -ENODEV;
7842 	}
7843 
7844 	rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc);
7845 	if (rc != 0) {
7846 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7847 		return rc;
7848 	}
7849 
7850 	rc = bdev_open(bdev, write, desc);
7851 	if (rc != 0) {
7852 		bdev_desc_free(desc);
7853 		desc = NULL;
7854 	}
7855 
7856 	*_desc = desc;
7857 
7858 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7859 
7860 	return rc;
7861 }
7862 
7863 static void
7864 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
7865 {
7866 	int rc;
7867 
7868 	spdk_spin_lock(&bdev->internal.spinlock);
7869 	spdk_spin_lock(&desc->spinlock);
7870 
7871 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
7872 
7873 	desc->closed = true;
7874 
7875 	if (desc->claim != NULL) {
7876 		bdev_desc_release_claims(desc);
7877 	}
7878 
7879 	if (0 == desc->refs) {
7880 		spdk_spin_unlock(&desc->spinlock);
7881 		bdev_desc_free(desc);
7882 	} else {
7883 		spdk_spin_unlock(&desc->spinlock);
7884 	}
7885 
7886 	/* If no more descriptors, kill QoS channel */
7887 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
7888 		SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
7889 			      bdev->name, spdk_get_thread());
7890 
7891 		if (bdev_qos_destroy(bdev)) {
7892 			/* There isn't anything we can do to recover here. Just let the
7893 			 * old QoS poller keep running. The QoS handling won't change
7894 			 * cores when the user allocates a new channel, but it won't break. */
7895 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
7896 		}
7897 	}
7898 
7899 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
7900 		rc = bdev_unregister_unsafe(bdev);
7901 		spdk_spin_unlock(&bdev->internal.spinlock);
7902 
7903 		if (rc == 0) {
7904 			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7905 		}
7906 	} else {
7907 		spdk_spin_unlock(&bdev->internal.spinlock);
7908 	}
7909 }
7910 
7911 void
7912 spdk_bdev_close(struct spdk_bdev_desc *desc)
7913 {
7914 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7915 
7916 	SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
7917 		      spdk_get_thread());
7918 
7919 	assert(desc->thread == spdk_get_thread());
7920 
7921 	spdk_poller_unregister(&desc->io_timeout_poller);
7922 
7923 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7924 
7925 	bdev_close(bdev, desc);
7926 
7927 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7928 }
7929 
7930 static void
7931 bdev_register_finished(void *arg)
7932 {
7933 	struct spdk_bdev_desc *desc = arg;
7934 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
7935 
7936 	spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
7937 
7938 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7939 
7940 	bdev_close(bdev, desc);
7941 
7942 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7943 }
7944 
7945 int
7946 spdk_bdev_register(struct spdk_bdev *bdev)
7947 {
7948 	struct spdk_bdev_desc *desc;
7949 	struct spdk_thread *thread = spdk_get_thread();
7950 	int rc;
7951 
7952 	if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) {
7953 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread,
7954 			    thread ? spdk_thread_get_name(thread) : "null");
7955 		return -EINVAL;
7956 	}
7957 
7958 	rc = bdev_register(bdev);
7959 	if (rc != 0) {
7960 		return rc;
7961 	}
7962 
7963 	/* A descriptor is opened to prevent bdev deletion during examination */
7964 	rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
7965 	if (rc != 0) {
7966 		spdk_bdev_unregister(bdev, NULL, NULL);
7967 		return rc;
7968 	}
7969 
7970 	rc = bdev_open(bdev, false, desc);
7971 	if (rc != 0) {
7972 		bdev_desc_free(desc);
7973 		spdk_bdev_unregister(bdev, NULL, NULL);
7974 		return rc;
7975 	}
7976 
7977 	/* Examine configuration before initializing I/O */
7978 	bdev_examine(bdev);
7979 
7980 	rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
7981 	if (rc != 0) {
7982 		bdev_close(bdev, desc);
7983 		spdk_bdev_unregister(bdev, NULL, NULL);
7984 	}
7985 
7986 	return rc;
7987 }
7988 
7989 int
7990 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
7991 			    struct spdk_bdev_module *module)
7992 {
7993 	spdk_spin_lock(&bdev->internal.spinlock);
7994 
7995 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
7996 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
7997 		spdk_spin_unlock(&bdev->internal.spinlock);
7998 		return -EPERM;
7999 	}
8000 
8001 	if (desc && !desc->write) {
8002 		desc->write = true;
8003 	}
8004 
8005 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE;
8006 	bdev->internal.claim.v1.module = module;
8007 
8008 	spdk_spin_unlock(&bdev->internal.spinlock);
8009 	return 0;
8010 }
8011 
8012 void
8013 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
8014 {
8015 	spdk_spin_lock(&bdev->internal.spinlock);
8016 
8017 	assert(bdev->internal.claim.v1.module != NULL);
8018 	assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE);
8019 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8020 	bdev->internal.claim.v1.module = NULL;
8021 
8022 	spdk_spin_unlock(&bdev->internal.spinlock);
8023 }
8024 
8025 /*
8026  * Start claims v2
8027  */
8028 
8029 const char *
8030 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type)
8031 {
8032 	switch (type) {
8033 	case SPDK_BDEV_CLAIM_NONE:
8034 		return "not_claimed";
8035 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
8036 		return "exclusive_write";
8037 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8038 		return "read_many_write_one";
8039 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8040 		return "read_many_write_none";
8041 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8042 		return "read_many_write_many";
8043 	default:
8044 		break;
8045 	}
8046 	return "invalid_claim";
8047 }
8048 
8049 static bool
8050 claim_type_is_v2(enum spdk_bdev_claim_type type)
8051 {
8052 	switch (type) {
8053 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8054 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8055 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8056 		return true;
8057 	default:
8058 		break;
8059 	}
8060 	return false;
8061 }
8062 
8063 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */
8064 static bool
8065 claim_type_promotes_to_write(enum spdk_bdev_claim_type type)
8066 {
8067 	switch (type) {
8068 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8069 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8070 		return true;
8071 	default:
8072 		break;
8073 	}
8074 	return false;
8075 }
8076 
8077 void
8078 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size)
8079 {
8080 	if (opts == NULL) {
8081 		SPDK_ERRLOG("opts should not be NULL\n");
8082 		assert(opts != NULL);
8083 		return;
8084 	}
8085 	if (size == 0) {
8086 		SPDK_ERRLOG("size should not be zero\n");
8087 		assert(size != 0);
8088 		return;
8089 	}
8090 
8091 	memset(opts, 0, size);
8092 	opts->opts_size = size;
8093 
8094 #define FIELD_OK(field) \
8095         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size
8096 
8097 #define SET_FIELD(field, value) \
8098         if (FIELD_OK(field)) { \
8099                 opts->field = value; \
8100         } \
8101 
8102 	SET_FIELD(shared_claim_key, 0);
8103 
8104 #undef FIELD_OK
8105 #undef SET_FIELD
8106 }
8107 
8108 static int
8109 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst)
8110 {
8111 	if (src->opts_size == 0) {
8112 		SPDK_ERRLOG("size should not be zero\n");
8113 		return -1;
8114 	}
8115 
8116 	memset(dst, 0, sizeof(*dst));
8117 	dst->opts_size = src->opts_size;
8118 
8119 #define FIELD_OK(field) \
8120         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size
8121 
8122 #define SET_FIELD(field) \
8123         if (FIELD_OK(field)) { \
8124                 dst->field = src->field; \
8125         } \
8126 
8127 	if (FIELD_OK(name)) {
8128 		snprintf(dst->name, sizeof(dst->name), "%s", src->name);
8129 	}
8130 
8131 	SET_FIELD(shared_claim_key);
8132 
8133 	/* You should not remove this statement, but need to update the assert statement
8134 	 * if you add a new field, and also add a corresponding SET_FIELD statement */
8135 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size");
8136 
8137 #undef FIELD_OK
8138 #undef SET_FIELD
8139 	return 0;
8140 }
8141 
8142 /* Returns 0 if a read-write-once claim can be taken. */
8143 static int
8144 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8145 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8146 {
8147 	struct spdk_bdev *bdev = desc->bdev;
8148 	struct spdk_bdev_desc *open_desc;
8149 
8150 	assert(spdk_spin_held(&bdev->internal.spinlock));
8151 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);
8152 
8153 	if (opts->shared_claim_key != 0) {
8154 		SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
8155 			    bdev->name);
8156 		return -EINVAL;
8157 	}
8158 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8159 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8160 		return -EPERM;
8161 	}
8162 	if (desc->claim != NULL) {
8163 		SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
8164 			       bdev->name, desc->claim->module->name);
8165 		return -EPERM;
8166 	}
8167 	TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8168 		if (desc != open_desc && open_desc->write) {
8169 			SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
8170 				       "another descriptor is open for writing\n",
8171 				       bdev->name);
8172 			return -EPERM;
8173 		}
8174 	}
8175 
8176 	return 0;
8177 }
8178 
8179 /* Returns 0 if a read-only-many claim can be taken. */
8180 static int
8181 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8182 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8183 {
8184 	struct spdk_bdev *bdev = desc->bdev;
8185 	struct spdk_bdev_desc *open_desc;
8186 
8187 	assert(spdk_spin_held(&bdev->internal.spinlock));
8188 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
8189 	assert(desc->claim == NULL);
8190 
8191 	if (desc->write) {
8192 		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
8193 			    bdev->name);
8194 		return -EINVAL;
8195 	}
8196 	if (opts->shared_claim_key != 0) {
8197 		SPDK_ERRLOG("%s: key option not supported with read-only-may claims\n", bdev->name);
8198 		return -EINVAL;
8199 	}
8200 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8201 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8202 			if (open_desc->write) {
8203 				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
8204 					       "another descriptor is open for writing\n",
8205 					       bdev->name);
8206 				return -EPERM;
8207 			}
8208 		}
8209 	}
8210 
8211 	return 0;
8212 }
8213 
8214 /* Returns 0 if a read-write-many claim can be taken. */
8215 static int
8216 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8217 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8218 {
8219 	struct spdk_bdev *bdev = desc->bdev;
8220 	struct spdk_bdev_desc *open_desc;
8221 
8222 	assert(spdk_spin_held(&bdev->internal.spinlock));
8223 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
8224 	assert(desc->claim == NULL);
8225 
8226 	if (opts->shared_claim_key == 0) {
8227 		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-may claims\n",
8228 			    bdev->name);
8229 		return -EINVAL;
8230 	}
8231 	switch (bdev->internal.claim_type) {
8232 	case SPDK_BDEV_CLAIM_NONE:
8233 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8234 			if (open_desc == desc) {
8235 				continue;
8236 			}
8237 			if (open_desc->write) {
8238 				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
8239 					       "another descriptor is open for writing without a "
8240 					       "claim\n", bdev->name);
8241 				return -EPERM;
8242 			}
8243 		}
8244 		break;
8245 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8246 		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
8247 			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
8248 			return -EPERM;
8249 		}
8250 		break;
8251 	default:
8252 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8253 		return -EBUSY;
8254 	}
8255 
8256 	return 0;
8257 }
8258 
8259 /* Updates desc and its bdev with a v2 claim. */
8260 static int
8261 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8262 	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8263 {
8264 	struct spdk_bdev *bdev = desc->bdev;
8265 	struct spdk_bdev_module_claim *claim;
8266 
8267 	assert(spdk_spin_held(&bdev->internal.spinlock));
8268 	assert(claim_type_is_v2(type));
8269 	assert(desc->claim == NULL);
8270 
8271 	claim = calloc(1, sizeof(*desc->claim));
8272 	if (claim == NULL) {
8273 		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
8274 		return -ENOMEM;
8275 	}
8276 	claim->module = module;
8277 	claim->desc = desc;
8278 	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
8279 	memcpy(claim->name, opts->name, sizeof(claim->name));
8280 	desc->claim = claim;
8281 
8282 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8283 		bdev->internal.claim_type = type;
8284 		TAILQ_INIT(&bdev->internal.claim.v2.claims);
8285 		bdev->internal.claim.v2.key = opts->shared_claim_key;
8286 	}
8287 	assert(type == bdev->internal.claim_type);
8288 
8289 	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);
8290 
8291 	if (!desc->write && claim_type_promotes_to_write(type)) {
8292 		desc->write = true;
8293 	}
8294 
8295 	return 0;
8296 }
8297 
8298 int
8299 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8300 				 struct spdk_bdev_claim_opts *_opts,
8301 				 struct spdk_bdev_module *module)
8302 {
8303 	struct spdk_bdev *bdev;
8304 	struct spdk_bdev_claim_opts opts;
8305 	int rc = 0;
8306 
8307 	if (desc == NULL) {
8308 		SPDK_ERRLOG("descriptor must not be NULL\n");
8309 		return -EINVAL;
8310 	}
8311 
8312 	bdev = desc->bdev;
8313 
8314 	if (_opts == NULL) {
8315 		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
8316 	} else if (claim_opts_copy(_opts, &opts) != 0) {
8317 		return -EINVAL;
8318 	}
8319 
8320 	spdk_spin_lock(&bdev->internal.spinlock);
8321 
8322 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
8323 	    bdev->internal.claim_type != type) {
8324 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8325 		spdk_spin_unlock(&bdev->internal.spinlock);
8326 		return -EPERM;
8327 	}
8328 
8329 	if (claim_type_is_v2(type) && desc->claim != NULL) {
8330 		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
8331 			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
8332 		spdk_spin_unlock(&bdev->internal.spinlock);
8333 		return -EPERM;
8334 	}
8335 
8336 	switch (type) {
8337 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
8338 		spdk_spin_unlock(&bdev->internal.spinlock);
8339 		return spdk_bdev_module_claim_bdev(bdev, desc, module);
8340 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8341 		rc = claim_verify_rwo(desc, type, &opts, module);
8342 		break;
8343 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8344 		rc = claim_verify_rom(desc, type, &opts, module);
8345 		break;
8346 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8347 		rc = claim_verify_rwm(desc, type, &opts, module);
8348 		break;
8349 	default:
8350 		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
8351 		rc = -ENOTSUP;
8352 	}
8353 
8354 	if (rc == 0) {
8355 		rc = claim_bdev(desc, type, &opts, module);
8356 	}
8357 
8358 	spdk_spin_unlock(&bdev->internal.spinlock);
8359 	return rc;
8360 }
8361 
8362 static void
8363 claim_reset(struct spdk_bdev *bdev)
8364 {
8365 	assert(spdk_spin_held(&bdev->internal.spinlock));
8366 	assert(claim_type_is_v2(bdev->internal.claim_type));
8367 	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));
8368 
8369 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
8370 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8371 }
8372 
8373 static void
8374 bdev_desc_release_claims(struct spdk_bdev_desc *desc)
8375 {
8376 	struct spdk_bdev *bdev = desc->bdev;
8377 
8378 	assert(spdk_spin_held(&bdev->internal.spinlock));
8379 	assert(claim_type_is_v2(bdev->internal.claim_type));
8380 
8381 	if (bdev->internal.examine_in_progress == 0) {
8382 		TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
8383 		free(desc->claim);
8384 		if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
8385 			claim_reset(bdev);
8386 		}
8387 	} else {
8388 		/* This is a dead claim that will be cleaned up when bdev_examine() is done. */
8389 		desc->claim->module = NULL;
8390 		desc->claim->desc = NULL;
8391 	}
8392 	desc->claim = NULL;
8393 }
8394 
8395 /*
8396  * End claims v2
8397  */
8398 
8399 struct spdk_bdev *
8400 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
8401 {
8402 	assert(desc != NULL);
8403 	return desc->bdev;
8404 }
8405 
8406 int
8407 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
8408 {
8409 	struct spdk_bdev *bdev, *tmp;
8410 	struct spdk_bdev_desc *desc;
8411 	int rc = 0;
8412 
8413 	assert(fn != NULL);
8414 
8415 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8416 	bdev = spdk_bdev_first();
8417 	while (bdev != NULL) {
8418 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8419 		if (rc != 0) {
8420 			break;
8421 		}
8422 		rc = bdev_open(bdev, false, desc);
8423 		if (rc != 0) {
8424 			bdev_desc_free(desc);
8425 			if (rc == -ENODEV) {
8426 				/* Ignore the error and move to the next bdev. */
8427 				rc = 0;
8428 				bdev = spdk_bdev_next(bdev);
8429 				continue;
8430 			}
8431 			break;
8432 		}
8433 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8434 
8435 		rc = fn(ctx, bdev);
8436 
8437 		spdk_spin_lock(&g_bdev_mgr.spinlock);
8438 		tmp = spdk_bdev_next(bdev);
8439 		bdev_close(bdev, desc);
8440 		if (rc != 0) {
8441 			break;
8442 		}
8443 		bdev = tmp;
8444 	}
8445 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8446 
8447 	return rc;
8448 }
8449 
8450 int
8451 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
8452 {
8453 	struct spdk_bdev *bdev, *tmp;
8454 	struct spdk_bdev_desc *desc;
8455 	int rc = 0;
8456 
8457 	assert(fn != NULL);
8458 
8459 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8460 	bdev = spdk_bdev_first_leaf();
8461 	while (bdev != NULL) {
8462 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8463 		if (rc != 0) {
8464 			break;
8465 		}
8466 		rc = bdev_open(bdev, false, desc);
8467 		if (rc != 0) {
8468 			bdev_desc_free(desc);
8469 			if (rc == -ENODEV) {
8470 				/* Ignore the error and move to the next bdev. */
8471 				rc = 0;
8472 				bdev = spdk_bdev_next_leaf(bdev);
8473 				continue;
8474 			}
8475 			break;
8476 		}
8477 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8478 
8479 		rc = fn(ctx, bdev);
8480 
8481 		spdk_spin_lock(&g_bdev_mgr.spinlock);
8482 		tmp = spdk_bdev_next_leaf(bdev);
8483 		bdev_close(bdev, desc);
8484 		if (rc != 0) {
8485 			break;
8486 		}
8487 		bdev = tmp;
8488 	}
8489 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8490 
8491 	return rc;
8492 }
8493 
8494 void
8495 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
8496 {
8497 	struct iovec *iovs;
8498 	int iovcnt;
8499 
8500 	if (bdev_io == NULL) {
8501 		return;
8502 	}
8503 
8504 	switch (bdev_io->type) {
8505 	case SPDK_BDEV_IO_TYPE_READ:
8506 	case SPDK_BDEV_IO_TYPE_WRITE:
8507 	case SPDK_BDEV_IO_TYPE_ZCOPY:
8508 		iovs = bdev_io->u.bdev.iovs;
8509 		iovcnt = bdev_io->u.bdev.iovcnt;
8510 		break;
8511 	default:
8512 		iovs = NULL;
8513 		iovcnt = 0;
8514 		break;
8515 	}
8516 
8517 	if (iovp) {
8518 		*iovp = iovs;
8519 	}
8520 	if (iovcntp) {
8521 		*iovcntp = iovcnt;
8522 	}
8523 }
8524 
8525 void *
8526 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
8527 {
8528 	if (bdev_io == NULL) {
8529 		return NULL;
8530 	}
8531 
8532 	if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
8533 		return NULL;
8534 	}
8535 
8536 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
8537 	    bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
8538 		return bdev_io->u.bdev.md_buf;
8539 	}
8540 
8541 	return NULL;
8542 }
8543 
8544 void *
8545 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
8546 {
8547 	if (bdev_io == NULL) {
8548 		assert(false);
8549 		return NULL;
8550 	}
8551 
8552 	return bdev_io->internal.caller_ctx;
8553 }
8554 
8555 void
8556 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
8557 {
8558 
8559 	if (spdk_bdev_module_list_find(bdev_module->name)) {
8560 		SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
8561 		assert(false);
8562 	}
8563 
8564 	spdk_spin_init(&bdev_module->internal.spinlock);
8565 	TAILQ_INIT(&bdev_module->internal.quiesced_ranges);
8566 
8567 	/*
8568 	 * Modules with examine callbacks must be initialized first, so they are
8569 	 *  ready to handle examine callbacks from later modules that will
8570 	 *  register physical bdevs.
8571 	 */
8572 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
8573 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
8574 	} else {
8575 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
8576 	}
8577 }
8578 
8579 struct spdk_bdev_module *
8580 spdk_bdev_module_list_find(const char *name)
8581 {
8582 	struct spdk_bdev_module *bdev_module;
8583 
8584 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
8585 		if (strcmp(name, bdev_module->name) == 0) {
8586 			break;
8587 		}
8588 	}
8589 
8590 	return bdev_module;
8591 }
8592 
8593 static int
8594 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io)
8595 {
8596 	uint64_t num_blocks;
8597 	void *md_buf = NULL;
8598 
8599 	num_blocks = bdev_io->u.bdev.num_blocks;
8600 
8601 	if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
8602 		md_buf = (char *)g_bdev_mgr.zero_buffer +
8603 			 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
8604 	}
8605 
8606 	return bdev_write_blocks_with_md(bdev_io->internal.desc,
8607 					 spdk_io_channel_from_ctx(bdev_io->internal.ch),
8608 					 g_bdev_mgr.zero_buffer, md_buf,
8609 					 bdev_io->u.bdev.offset_blocks, num_blocks,
8610 					 bdev_write_zero_buffer_done, bdev_io);
8611 }
8612 
8613 static void
8614 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
8615 {
8616 	struct spdk_bdev_io *parent_io = cb_arg;
8617 
8618 	spdk_bdev_free_io(bdev_io);
8619 
8620 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
8621 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
8622 }
8623 
8624 static void
8625 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
8626 {
8627 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
8628 	ctx->bdev->internal.qos_mod_in_progress = false;
8629 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
8630 
8631 	if (ctx->cb_fn) {
8632 		ctx->cb_fn(ctx->cb_arg, status);
8633 	}
8634 	free(ctx);
8635 }
8636 
8637 static void
8638 bdev_disable_qos_done(void *cb_arg)
8639 {
8640 	struct set_qos_limit_ctx *ctx = cb_arg;
8641 	struct spdk_bdev *bdev = ctx->bdev;
8642 	struct spdk_bdev_io *bdev_io;
8643 	struct spdk_bdev_qos *qos;
8644 
8645 	spdk_spin_lock(&bdev->internal.spinlock);
8646 	qos = bdev->internal.qos;
8647 	bdev->internal.qos = NULL;
8648 	spdk_spin_unlock(&bdev->internal.spinlock);
8649 
8650 	while (!TAILQ_EMPTY(&qos->queued)) {
8651 		/* Send queued I/O back to their original thread for resubmission. */
8652 		bdev_io = TAILQ_FIRST(&qos->queued);
8653 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
8654 
8655 		if (bdev_io->internal.io_submit_ch) {
8656 			/*
8657 			 * Channel was changed when sending it to the QoS thread - change it back
8658 			 *  before sending it back to the original thread.
8659 			 */
8660 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
8661 			bdev_io->internal.io_submit_ch = NULL;
8662 		}
8663 
8664 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
8665 				     _bdev_io_submit, bdev_io);
8666 	}
8667 
8668 	if (qos->thread != NULL) {
8669 		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
8670 		spdk_poller_unregister(&qos->poller);
8671 	}
8672 
8673 	free(qos);
8674 
8675 	bdev_set_qos_limit_done(ctx, 0);
8676 }
8677 
8678 static void
8679 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
8680 {
8681 	struct set_qos_limit_ctx *ctx = _ctx;
8682 	struct spdk_thread *thread;
8683 
8684 	spdk_spin_lock(&bdev->internal.spinlock);
8685 	thread = bdev->internal.qos->thread;
8686 	spdk_spin_unlock(&bdev->internal.spinlock);
8687 
8688 	if (thread != NULL) {
8689 		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
8690 	} else {
8691 		bdev_disable_qos_done(ctx);
8692 	}
8693 }
8694 
8695 static void
8696 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8697 		     struct spdk_io_channel *ch, void *_ctx)
8698 {
8699 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
8700 
8701 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
8702 
8703 	spdk_bdev_for_each_channel_continue(i, 0);
8704 }
8705 
8706 static void
8707 bdev_update_qos_rate_limit_msg(void *cb_arg)
8708 {
8709 	struct set_qos_limit_ctx *ctx = cb_arg;
8710 	struct spdk_bdev *bdev = ctx->bdev;
8711 
8712 	spdk_spin_lock(&bdev->internal.spinlock);
8713 	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
8714 	spdk_spin_unlock(&bdev->internal.spinlock);
8715 
8716 	bdev_set_qos_limit_done(ctx, 0);
8717 }
8718 
8719 static void
8720 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8721 		    struct spdk_io_channel *ch, void *_ctx)
8722 {
8723 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
8724 
8725 	spdk_spin_lock(&bdev->internal.spinlock);
8726 	bdev_enable_qos(bdev, bdev_ch);
8727 	spdk_spin_unlock(&bdev->internal.spinlock);
8728 	spdk_bdev_for_each_channel_continue(i, 0);
8729 }
8730 
8731 static void
8732 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
8733 {
8734 	struct set_qos_limit_ctx *ctx = _ctx;
8735 
8736 	bdev_set_qos_limit_done(ctx, status);
8737 }
8738 
8739 static void
8740 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
8741 {
8742 	int i;
8743 
8744 	assert(bdev->internal.qos != NULL);
8745 
8746 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
8747 		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
8748 			bdev->internal.qos->rate_limits[i].limit = limits[i];
8749 
8750 			if (limits[i] == 0) {
8751 				bdev->internal.qos->rate_limits[i].limit =
8752 					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
8753 			}
8754 		}
8755 	}
8756 }
8757 
8758 void
8759 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
8760 			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
8761 {
8762 	struct set_qos_limit_ctx	*ctx;
8763 	uint32_t			limit_set_complement;
8764 	uint64_t			min_limit_per_sec;
8765 	int				i;
8766 	bool				disable_rate_limit = true;
8767 
8768 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
8769 		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
8770 			continue;
8771 		}
8772 
8773 		if (limits[i] > 0) {
8774 			disable_rate_limit = false;
8775 		}
8776 
8777 		if (bdev_qos_is_iops_rate_limit(i) == true) {
8778 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
8779 		} else {
8780 			/* Change from megabyte to byte rate limit */
8781 			limits[i] = limits[i] * 1024 * 1024;
8782 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
8783 		}
8784 
8785 		limit_set_complement = limits[i] % min_limit_per_sec;
8786 		if (limit_set_complement) {
8787 			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
8788 				    limits[i], min_limit_per_sec);
8789 			limits[i] += min_limit_per_sec - limit_set_complement;
8790 			SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
8791 		}
8792 	}
8793 
8794 	ctx = calloc(1, sizeof(*ctx));
8795 	if (ctx == NULL) {
8796 		cb_fn(cb_arg, -ENOMEM);
8797 		return;
8798 	}
8799 
8800 	ctx->cb_fn = cb_fn;
8801 	ctx->cb_arg = cb_arg;
8802 	ctx->bdev = bdev;
8803 
8804 	spdk_spin_lock(&bdev->internal.spinlock);
8805 	if (bdev->internal.qos_mod_in_progress) {
8806 		spdk_spin_unlock(&bdev->internal.spinlock);
8807 		free(ctx);
8808 		cb_fn(cb_arg, -EAGAIN);
8809 		return;
8810 	}
8811 	bdev->internal.qos_mod_in_progress = true;
8812 
8813 	if (disable_rate_limit == true && bdev->internal.qos) {
8814 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
8815 			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
8816 			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
8817 			     bdev->internal.qos->rate_limits[i].limit !=
8818 			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
8819 				disable_rate_limit = false;
8820 				break;
8821 			}
8822 		}
8823 	}
8824 
8825 	if (disable_rate_limit == false) {
8826 		if (bdev->internal.qos == NULL) {
8827 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
8828 			if (!bdev->internal.qos) {
8829 				spdk_spin_unlock(&bdev->internal.spinlock);
8830 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
8831 				bdev_set_qos_limit_done(ctx, -ENOMEM);
8832 				return;
8833 			}
8834 		}
8835 
8836 		if (bdev->internal.qos->thread == NULL) {
8837 			/* Enabling */
8838 			bdev_set_qos_rate_limits(bdev, limits);
8839 
8840 			spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
8841 						   bdev_enable_qos_done);
8842 		} else {
8843 			/* Updating */
8844 			bdev_set_qos_rate_limits(bdev, limits);
8845 
8846 			spdk_thread_send_msg(bdev->internal.qos->thread,
8847 					     bdev_update_qos_rate_limit_msg, ctx);
8848 		}
8849 	} else {
8850 		if (bdev->internal.qos != NULL) {
8851 			bdev_set_qos_rate_limits(bdev, limits);
8852 
8853 			/* Disabling */
8854 			spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
8855 						   bdev_disable_qos_msg_done);
8856 		} else {
8857 			spdk_spin_unlock(&bdev->internal.spinlock);
8858 			bdev_set_qos_limit_done(ctx, 0);
8859 			return;
8860 		}
8861 	}
8862 
8863 	spdk_spin_unlock(&bdev->internal.spinlock);
8864 }
8865 
8866 struct spdk_bdev_histogram_ctx {
8867 	spdk_bdev_histogram_status_cb cb_fn;
8868 	void *cb_arg;
8869 	struct spdk_bdev *bdev;
8870 	int status;
8871 };
8872 
8873 static void
8874 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
8875 {
8876 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
8877 
8878 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
8879 	ctx->bdev->internal.histogram_in_progress = false;
8880 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
8881 	ctx->cb_fn(ctx->cb_arg, ctx->status);
8882 	free(ctx);
8883 }
8884 
8885 static void
8886 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8887 			       struct spdk_io_channel *_ch, void *_ctx)
8888 {
8889 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
8890 
8891 	if (ch->histogram != NULL) {
8892 		spdk_histogram_data_free(ch->histogram);
8893 		ch->histogram = NULL;
8894 	}
8895 	spdk_bdev_for_each_channel_continue(i, 0);
8896 }
8897 
8898 static void
8899 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
8900 {
8901 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
8902 
8903 	if (status != 0) {
8904 		ctx->status = status;
8905 		ctx->bdev->internal.histogram_enabled = false;
8906 		spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
8907 					   bdev_histogram_disable_channel_cb);
8908 	} else {
8909 		spdk_spin_lock(&ctx->bdev->internal.spinlock);
8910 		ctx->bdev->internal.histogram_in_progress = false;
8911 		spdk_spin_unlock(&ctx->bdev->internal.spinlock);
8912 		ctx->cb_fn(ctx->cb_arg, ctx->status);
8913 		free(ctx);
8914 	}
8915 }
8916 
8917 static void
8918 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8919 			      struct spdk_io_channel *_ch, void *_ctx)
8920 {
8921 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
8922 	int status = 0;
8923 
8924 	if (ch->histogram == NULL) {
8925 		ch->histogram = spdk_histogram_data_alloc();
8926 		if (ch->histogram == NULL) {
8927 			status = -ENOMEM;
8928 		}
8929 	}
8930 
8931 	spdk_bdev_for_each_channel_continue(i, status);
8932 }
8933 
8934 void
8935 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
8936 			   void *cb_arg, bool enable)
8937 {
8938 	struct spdk_bdev_histogram_ctx *ctx;
8939 
8940 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
8941 	if (ctx == NULL) {
8942 		cb_fn(cb_arg, -ENOMEM);
8943 		return;
8944 	}
8945 
8946 	ctx->bdev = bdev;
8947 	ctx->status = 0;
8948 	ctx->cb_fn = cb_fn;
8949 	ctx->cb_arg = cb_arg;
8950 
8951 	spdk_spin_lock(&bdev->internal.spinlock);
8952 	if (bdev->internal.histogram_in_progress) {
8953 		spdk_spin_unlock(&bdev->internal.spinlock);
8954 		free(ctx);
8955 		cb_fn(cb_arg, -EAGAIN);
8956 		return;
8957 	}
8958 
8959 	bdev->internal.histogram_in_progress = true;
8960 	spdk_spin_unlock(&bdev->internal.spinlock);
8961 
8962 	bdev->internal.histogram_enabled = enable;
8963 
8964 	if (enable) {
8965 		/* Allocate histogram for each channel */
8966 		spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
8967 					   bdev_histogram_enable_channel_cb);
8968 	} else {
8969 		spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
8970 					   bdev_histogram_disable_channel_cb);
8971 	}
8972 }
8973 
8974 struct spdk_bdev_histogram_data_ctx {
8975 	spdk_bdev_histogram_data_cb cb_fn;
8976 	void *cb_arg;
8977 	struct spdk_bdev *bdev;
8978 	/** merged histogram data from all channels */
8979 	struct spdk_histogram_data	*histogram;
8980 };
8981 
8982 static void
8983 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
8984 {
8985 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
8986 
8987 	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
8988 	free(ctx);
8989 }
8990 
8991 static void
8992 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8993 			   struct spdk_io_channel *_ch, void *_ctx)
8994 {
8995 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
8996 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
8997 	int status = 0;
8998 
8999 	if (ch->histogram == NULL) {
9000 		status = -EFAULT;
9001 	} else {
9002 		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
9003 	}
9004 
9005 	spdk_bdev_for_each_channel_continue(i, status);
9006 }
9007 
9008 void
9009 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
9010 			spdk_bdev_histogram_data_cb cb_fn,
9011 			void *cb_arg)
9012 {
9013 	struct spdk_bdev_histogram_data_ctx *ctx;
9014 
9015 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
9016 	if (ctx == NULL) {
9017 		cb_fn(cb_arg, -ENOMEM, NULL);
9018 		return;
9019 	}
9020 
9021 	ctx->bdev = bdev;
9022 	ctx->cb_fn = cb_fn;
9023 	ctx->cb_arg = cb_arg;
9024 
9025 	ctx->histogram = histogram;
9026 
9027 	spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
9028 				   bdev_histogram_get_channel_cb);
9029 }
9030 
9031 void
9032 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
9033 				void *cb_arg)
9034 {
9035 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9036 	int status = 0;
9037 
9038 	assert(cb_fn != NULL);
9039 
9040 	if (bdev_ch->histogram == NULL) {
9041 		status = -EFAULT;
9042 	}
9043 	cb_fn(cb_arg, status, bdev_ch->histogram);
9044 }
9045 
9046 size_t
9047 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
9048 			   size_t max_events)
9049 {
9050 	struct media_event_entry *entry;
9051 	size_t num_events = 0;
9052 
9053 	for (; num_events < max_events; ++num_events) {
9054 		entry = TAILQ_FIRST(&desc->pending_media_events);
9055 		if (entry == NULL) {
9056 			break;
9057 		}
9058 
9059 		events[num_events] = entry->event;
9060 		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
9061 		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
9062 	}
9063 
9064 	return num_events;
9065 }
9066 
9067 int
9068 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
9069 			    size_t num_events)
9070 {
9071 	struct spdk_bdev_desc *desc;
9072 	struct media_event_entry *entry;
9073 	size_t event_id;
9074 	int rc = 0;
9075 
9076 	assert(bdev->media_events);
9077 
9078 	spdk_spin_lock(&bdev->internal.spinlock);
9079 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9080 		if (desc->write) {
9081 			break;
9082 		}
9083 	}
9084 
9085 	if (desc == NULL || desc->media_events_buffer == NULL) {
9086 		rc = -ENODEV;
9087 		goto out;
9088 	}
9089 
9090 	for (event_id = 0; event_id < num_events; ++event_id) {
9091 		entry = TAILQ_FIRST(&desc->free_media_events);
9092 		if (entry == NULL) {
9093 			break;
9094 		}
9095 
9096 		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
9097 		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
9098 		entry->event = events[event_id];
9099 	}
9100 
9101 	rc = event_id;
9102 out:
9103 	spdk_spin_unlock(&bdev->internal.spinlock);
9104 	return rc;
9105 }
9106 
9107 static void
9108 _media_management_notify(void *arg)
9109 {
9110 	struct spdk_bdev_desc *desc = arg;
9111 
9112 	_event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
9113 }
9114 
9115 void
9116 spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
9117 {
9118 	struct spdk_bdev_desc *desc;
9119 
9120 	spdk_spin_lock(&bdev->internal.spinlock);
9121 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9122 		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
9123 			event_notify(desc, _media_management_notify);
9124 		}
9125 	}
9126 	spdk_spin_unlock(&bdev->internal.spinlock);
9127 }
9128 
9129 struct locked_lba_range_ctx {
9130 	struct lba_range		range;
9131 	struct lba_range		*current_range;
9132 	struct lba_range		*owner_range;
9133 	struct spdk_poller		*poller;
9134 	lock_range_cb			cb_fn;
9135 	void				*cb_arg;
9136 };
9137 
9138 static void
9139 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9140 {
9141 	struct locked_lba_range_ctx *ctx = _ctx;
9142 
9143 	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
9144 	free(ctx);
9145 }
9146 
9147 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
9148 		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
9149 
9150 static void
9151 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9152 {
9153 	struct locked_lba_range_ctx *ctx = _ctx;
9154 
9155 	if (status == -ENOMEM) {
9156 		/* One of the channels could not allocate a range object.
9157 		 * So we have to go back and clean up any ranges that were
9158 		 * allocated successfully before we return error status to
9159 		 * the caller.  We can reuse the unlock function to do that
9160 		 * clean up.
9161 		 */
9162 		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9163 					   bdev_lock_error_cleanup_cb);
9164 		return;
9165 	}
9166 
9167 	/* All channels have locked this range and no I/O overlapping the range
9168 	 * are outstanding!  Set the owner_ch for the range object for the
9169 	 * locking channel, so that this channel will know that it is allowed
9170 	 * to write to this range.
9171 	 */
9172 	if (ctx->owner_range != NULL) {
9173 		ctx->owner_range->owner_ch = ctx->range.owner_ch;
9174 	}
9175 
9176 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9177 
9178 	/* Don't free the ctx here.  Its range is in the bdev's global list of
9179 	 * locked ranges still, and will be removed and freed when this range
9180 	 * is later unlocked.
9181 	 */
9182 }
9183 
9184 static int
9185 bdev_lock_lba_range_check_io(void *_i)
9186 {
9187 	struct spdk_bdev_channel_iter *i = _i;
9188 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
9189 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9190 	struct locked_lba_range_ctx *ctx = i->ctx;
9191 	struct lba_range *range = ctx->current_range;
9192 	struct spdk_bdev_io *bdev_io;
9193 
9194 	spdk_poller_unregister(&ctx->poller);
9195 
9196 	/* The range is now in the locked_ranges, so no new IO can be submitted to this
9197 	 * range.  But we need to wait until any outstanding IO overlapping with this range
9198 	 * are completed.
9199 	 */
9200 	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
9201 		if (bdev_io_range_is_locked(bdev_io, range)) {
9202 			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
9203 			return SPDK_POLLER_BUSY;
9204 		}
9205 	}
9206 
9207 	spdk_bdev_for_each_channel_continue(i, 0);
9208 	return SPDK_POLLER_BUSY;
9209 }
9210 
9211 static void
9212 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9213 				struct spdk_io_channel *_ch, void *_ctx)
9214 {
9215 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9216 	struct locked_lba_range_ctx *ctx = _ctx;
9217 	struct lba_range *range;
9218 
9219 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9220 		if (range->length == ctx->range.length &&
9221 		    range->offset == ctx->range.offset &&
9222 		    range->locked_ctx == ctx->range.locked_ctx) {
9223 			/* This range already exists on this channel, so don't add
9224 			 * it again.  This can happen when a new channel is created
9225 			 * while the for_each_channel operation is in progress.
9226 			 * Do not check for outstanding I/O in that case, since the
9227 			 * range was locked before any I/O could be submitted to the
9228 			 * new channel.
9229 			 */
9230 			spdk_bdev_for_each_channel_continue(i, 0);
9231 			return;
9232 		}
9233 	}
9234 
9235 	range = calloc(1, sizeof(*range));
9236 	if (range == NULL) {
9237 		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
9238 		return;
9239 	}
9240 
9241 	range->length = ctx->range.length;
9242 	range->offset = ctx->range.offset;
9243 	range->locked_ctx = ctx->range.locked_ctx;
9244 	ctx->current_range = range;
9245 	if (ctx->range.owner_ch == ch) {
9246 		/* This is the range object for the channel that will hold
9247 		 * the lock.  Store it in the ctx object so that we can easily
9248 		 * set its owner_ch after the lock is finally acquired.
9249 		 */
9250 		ctx->owner_range = range;
9251 	}
9252 	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
9253 	bdev_lock_lba_range_check_io(i);
9254 }
9255 
9256 static void
9257 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
9258 {
9259 	assert(spdk_get_thread() == ctx->range.owner_thread);
9260 	assert(ctx->range.owner_ch == NULL ||
9261 	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);
9262 
9263 	/* We will add a copy of this range to each channel now. */
9264 	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
9265 				   bdev_lock_lba_range_cb);
9266 }
9267 
9268 static bool
9269 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
9270 {
9271 	struct lba_range *r;
9272 
9273 	TAILQ_FOREACH(r, tailq, tailq) {
9274 		if (bdev_lba_range_overlapped(range, r)) {
9275 			return true;
9276 		}
9277 	}
9278 	return false;
9279 }
9280 
9281 static int
9282 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
9283 		     uint64_t offset, uint64_t length,
9284 		     lock_range_cb cb_fn, void *cb_arg)
9285 {
9286 	struct locked_lba_range_ctx *ctx;
9287 
9288 	ctx = calloc(1, sizeof(*ctx));
9289 	if (ctx == NULL) {
9290 		return -ENOMEM;
9291 	}
9292 
9293 	ctx->range.offset = offset;
9294 	ctx->range.length = length;
9295 	ctx->range.owner_thread = spdk_get_thread();
9296 	ctx->range.owner_ch = ch;
9297 	ctx->range.locked_ctx = cb_arg;
9298 	ctx->range.bdev = bdev;
9299 	ctx->cb_fn = cb_fn;
9300 	ctx->cb_arg = cb_arg;
9301 
9302 	spdk_spin_lock(&bdev->internal.spinlock);
9303 	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
9304 		/* There is an active lock overlapping with this range.
9305 		 * Put it on the pending list until this range no
9306 		 * longer overlaps with another.
9307 		 */
9308 		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
9309 	} else {
9310 		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
9311 		bdev_lock_lba_range_ctx(bdev, ctx);
9312 	}
9313 	spdk_spin_unlock(&bdev->internal.spinlock);
9314 	return 0;
9315 }
9316 
9317 static int
9318 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
9319 		    uint64_t offset, uint64_t length,
9320 		    lock_range_cb cb_fn, void *cb_arg)
9321 {
9322 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9323 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9324 
9325 	if (cb_arg == NULL) {
9326 		SPDK_ERRLOG("cb_arg must not be NULL\n");
9327 		return -EINVAL;
9328 	}
9329 
9330 	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
9331 }
9332 
9333 static void
9334 bdev_lock_lba_range_ctx_msg(void *_ctx)
9335 {
9336 	struct locked_lba_range_ctx *ctx = _ctx;
9337 
9338 	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
9339 }
9340 
9341 static void
9342 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9343 {
9344 	struct locked_lba_range_ctx *ctx = _ctx;
9345 	struct locked_lba_range_ctx *pending_ctx;
9346 	struct lba_range *range, *tmp;
9347 
9348 	spdk_spin_lock(&bdev->internal.spinlock);
9349 	/* Check if there are any pending locked ranges that overlap with this range
9350 	 * that was just unlocked.  If there are, check that it doesn't overlap with any
9351 	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
9352 	 * the lock process.
9353 	 */
9354 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
9355 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
9356 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
9357 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
9358 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9359 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
9360 			spdk_thread_send_msg(pending_ctx->range.owner_thread,
9361 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
9362 		}
9363 	}
9364 	spdk_spin_unlock(&bdev->internal.spinlock);
9365 
9366 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9367 	free(ctx);
9368 }
9369 
9370 static void
9371 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9372 				  struct spdk_io_channel *_ch, void *_ctx)
9373 {
9374 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9375 	struct locked_lba_range_ctx *ctx = _ctx;
9376 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
9377 	struct spdk_bdev_io *bdev_io;
9378 	struct lba_range *range;
9379 
9380 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9381 		if (ctx->range.offset == range->offset &&
9382 		    ctx->range.length == range->length &&
9383 		    ctx->range.locked_ctx == range->locked_ctx) {
9384 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
9385 			free(range);
9386 			break;
9387 		}
9388 	}
9389 
9390 	/* Note: we should almost always be able to assert that the range specified
9391 	 * was found.  But there are some very rare corner cases where a new channel
9392 	 * gets created simultaneously with a range unlock, where this function
9393 	 * would execute on that new channel and wouldn't have the range.
9394 	 * We also use this to clean up range allocations when a later allocation
9395 	 * fails in the locking path.
9396 	 * So we can't actually assert() here.
9397 	 */
9398 
9399 	/* Swap the locked IO into a temporary list, and then try to submit them again.
9400 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
9401 	 * with the range that was just unlocked, but this isn't a performance path so
9402 	 * we go for simplicity here.
9403 	 */
9404 	TAILQ_INIT(&io_locked);
9405 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
9406 	while (!TAILQ_EMPTY(&io_locked)) {
9407 		bdev_io = TAILQ_FIRST(&io_locked);
9408 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
9409 		bdev_io_submit(bdev_io);
9410 	}
9411 
9412 	spdk_bdev_for_each_channel_continue(i, 0);
9413 }
9414 
9415 static int
9416 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
9417 		       lock_range_cb cb_fn, void *cb_arg)
9418 {
9419 	struct locked_lba_range_ctx *ctx;
9420 	struct lba_range *range;
9421 
9422 	spdk_spin_lock(&bdev->internal.spinlock);
9423 	/* To start the unlock the process, we find the range in the bdev's locked_ranges
9424 	 * and remove it. This ensures new channels don't inherit the locked range.
9425 	 * Then we will send a message to each channel to remove the range from its
9426 	 * per-channel list.
9427 	 */
9428 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
9429 		if (range->offset == offset && range->length == length &&
9430 		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
9431 			break;
9432 		}
9433 	}
9434 	if (range == NULL) {
9435 		assert(false);
9436 		spdk_spin_unlock(&bdev->internal.spinlock);
9437 		return -EINVAL;
9438 	}
9439 	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
9440 	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9441 	spdk_spin_unlock(&bdev->internal.spinlock);
9442 
9443 	ctx->cb_fn = cb_fn;
9444 	ctx->cb_arg = cb_arg;
9445 
9446 	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9447 				   bdev_unlock_lba_range_cb);
9448 	return 0;
9449 }
9450 
9451 static int
9452 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
9453 		      uint64_t offset, uint64_t length,
9454 		      lock_range_cb cb_fn, void *cb_arg)
9455 {
9456 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9457 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9458 	struct lba_range *range;
9459 	bool range_found = false;
9460 
9461 	/* Let's make sure the specified channel actually has a lock on
9462 	 * the specified range.  Note that the range must match exactly.
9463 	 */
9464 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9465 		if (range->offset == offset && range->length == length &&
9466 		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
9467 			range_found = true;
9468 			break;
9469 		}
9470 	}
9471 
9472 	if (!range_found) {
9473 		return -EINVAL;
9474 	}
9475 
9476 	return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
9477 }
9478 
9479 struct bdev_quiesce_ctx {
9480 	spdk_bdev_quiesce_cb cb_fn;
9481 	void *cb_arg;
9482 };
9483 
9484 static void
9485 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
9486 {
9487 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
9488 
9489 	if (quiesce_ctx->cb_fn != NULL) {
9490 		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9491 	}
9492 
9493 	free(quiesce_ctx);
9494 }
9495 
9496 static void
9497 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
9498 {
9499 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
9500 	struct spdk_bdev_module *module = range->bdev->module;
9501 
9502 	if (status != 0) {
9503 		if (quiesce_ctx->cb_fn != NULL) {
9504 			quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9505 		}
9506 		free(quiesce_ctx);
9507 		return;
9508 	}
9509 
9510 	spdk_spin_lock(&module->internal.spinlock);
9511 	TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
9512 	spdk_spin_unlock(&module->internal.spinlock);
9513 
9514 	if (quiesce_ctx->cb_fn != NULL) {
9515 		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9516 		quiesce_ctx->cb_fn = NULL;
9517 		quiesce_ctx->cb_arg = NULL;
9518 	}
9519 	/* quiesce_ctx will be freed on unquiesce */
9520 }
9521 
9522 static int
9523 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9524 		   uint64_t offset, uint64_t length,
9525 		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
9526 		   bool unquiesce)
9527 {
9528 	struct bdev_quiesce_ctx *quiesce_ctx;
9529 	int rc;
9530 
9531 	if (module != bdev->module) {
9532 		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
9533 		return -EINVAL;
9534 	}
9535 
9536 	if (!bdev_io_valid_blocks(bdev, offset, length)) {
9537 		return -EINVAL;
9538 	}
9539 
9540 	if (unquiesce) {
9541 		struct lba_range *range;
9542 
9543 		/* Make sure the specified range is actually quiesced in the specified module and
9544 		 * then remove it from the list. Note that the range must match exactly.
9545 		 */
9546 		spdk_spin_lock(&module->internal.spinlock);
9547 		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
9548 			if (range->bdev == bdev && range->offset == offset && range->length == length) {
9549 				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
9550 				break;
9551 			}
9552 		}
9553 		spdk_spin_unlock(&module->internal.spinlock);
9554 
9555 		if (range == NULL) {
9556 			SPDK_ERRLOG("The range to unquiesce was not found.\n");
9557 			return -EINVAL;
9558 		}
9559 
9560 		quiesce_ctx = range->locked_ctx;
9561 		quiesce_ctx->cb_fn = cb_fn;
9562 		quiesce_ctx->cb_arg = cb_arg;
9563 
9564 		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
9565 	} else {
9566 		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
9567 		if (quiesce_ctx == NULL) {
9568 			return -ENOMEM;
9569 		}
9570 
9571 		quiesce_ctx->cb_fn = cb_fn;
9572 		quiesce_ctx->cb_arg = cb_arg;
9573 
9574 		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
9575 		if (rc != 0) {
9576 			free(quiesce_ctx);
9577 		}
9578 	}
9579 
9580 	return rc;
9581 }
9582 
9583 int
9584 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9585 		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9586 {
9587 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
9588 }
9589 
9590 int
9591 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9592 		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9593 {
9594 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
9595 }
9596 
9597 int
9598 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9599 			uint64_t offset, uint64_t length,
9600 			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9601 {
9602 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
9603 }
9604 
9605 int
9606 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9607 			  uint64_t offset, uint64_t length,
9608 			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9609 {
9610 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
9611 }
9612 
9613 int
9614 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
9615 			     int array_size)
9616 {
9617 	if (!bdev) {
9618 		return -EINVAL;
9619 	}
9620 
9621 	if (bdev->fn_table->get_memory_domains) {
9622 		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
9623 	}
9624 
9625 	return 0;
9626 }
9627 
9628 struct spdk_bdev_for_each_io_ctx {
9629 	void *ctx;
9630 	spdk_bdev_io_fn fn;
9631 	spdk_bdev_for_each_io_cb cb;
9632 };
9633 
9634 static void
9635 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9636 			 struct spdk_io_channel *io_ch, void *_ctx)
9637 {
9638 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
9639 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
9640 	struct spdk_bdev_io *bdev_io;
9641 	int rc = 0;
9642 
9643 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
9644 		rc = ctx->fn(ctx->ctx, bdev_io);
9645 		if (rc != 0) {
9646 			break;
9647 		}
9648 	}
9649 
9650 	spdk_bdev_for_each_channel_continue(i, rc);
9651 }
9652 
9653 static void
9654 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
9655 {
9656 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
9657 
9658 	ctx->cb(ctx->ctx, status);
9659 
9660 	free(ctx);
9661 }
9662 
9663 void
9664 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
9665 			   spdk_bdev_for_each_io_cb cb)
9666 {
9667 	struct spdk_bdev_for_each_io_ctx *ctx;
9668 
9669 	assert(fn != NULL && cb != NULL);
9670 
9671 	ctx = calloc(1, sizeof(*ctx));
9672 	if (ctx == NULL) {
9673 		SPDK_ERRLOG("Failed to allocate context.\n");
9674 		cb(_ctx, -ENOMEM);
9675 		return;
9676 	}
9677 
9678 	ctx->ctx = _ctx;
9679 	ctx->fn = fn;
9680 	ctx->cb = cb;
9681 
9682 	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
9683 				   bdev_for_each_io_done);
9684 }
9685 
9686 void
9687 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
9688 {
9689 	spdk_for_each_channel_continue(iter->i, status);
9690 }
9691 
9692 static struct spdk_bdev *
9693 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
9694 {
9695 	void *io_device = spdk_io_channel_iter_get_io_device(i);
9696 
9697 	return __bdev_from_io_dev(io_device);
9698 }
9699 
9700 static void
9701 bdev_each_channel_msg(struct spdk_io_channel_iter *i)
9702 {
9703 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
9704 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
9705 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
9706 
9707 	iter->i = i;
9708 	iter->fn(iter, bdev, ch, iter->ctx);
9709 }
9710 
9711 static void
9712 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
9713 {
9714 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
9715 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
9716 
9717 	iter->i = i;
9718 	iter->cpl(bdev, iter->ctx, status);
9719 
9720 	free(iter);
9721 }
9722 
9723 void
9724 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
9725 			   void *ctx, spdk_bdev_for_each_channel_done cpl)
9726 {
9727 	struct spdk_bdev_channel_iter *iter;
9728 
9729 	assert(bdev != NULL && fn != NULL && ctx != NULL);
9730 
9731 	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
9732 	if (iter == NULL) {
9733 		SPDK_ERRLOG("Unable to allocate iterator\n");
9734 		assert(false);
9735 		return;
9736 	}
9737 
9738 	iter->fn = fn;
9739 	iter->cpl = cpl;
9740 	iter->ctx = ctx;
9741 
9742 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
9743 			      iter, bdev_each_channel_cpl);
9744 }
9745 
9746 static void
9747 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9748 {
9749 	struct spdk_bdev_io *parent_io = cb_arg;
9750 
9751 	spdk_bdev_free_io(bdev_io);
9752 
9753 	/* Check return status of write */
9754 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
9755 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
9756 }
9757 
9758 static void
9759 bdev_copy_do_write(void *_bdev_io)
9760 {
9761 	struct spdk_bdev_io *bdev_io = _bdev_io;
9762 	int rc;
9763 
9764 	/* Write blocks */
9765 	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
9766 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
9767 					    bdev_io->u.bdev.iovs[0].iov_base,
9768 					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
9769 					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);
9770 
9771 	if (rc == -ENOMEM) {
9772 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
9773 	} else if (rc != 0) {
9774 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
9775 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
9776 	}
9777 }
9778 
9779 static void
9780 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9781 {
9782 	struct spdk_bdev_io *parent_io = cb_arg;
9783 
9784 	spdk_bdev_free_io(bdev_io);
9785 
9786 	/* Check return status of read */
9787 	if (!success) {
9788 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
9789 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
9790 		return;
9791 	}
9792 
9793 	/* Do write */
9794 	bdev_copy_do_write(parent_io);
9795 }
9796 
9797 static void
9798 bdev_copy_do_read(void *_bdev_io)
9799 {
9800 	struct spdk_bdev_io *bdev_io = _bdev_io;
9801 	int rc;
9802 
9803 	/* Read blocks */
9804 	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
9805 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
9806 					   bdev_io->u.bdev.iovs[0].iov_base,
9807 					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
9808 					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);
9809 
9810 	if (rc == -ENOMEM) {
9811 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
9812 	} else if (rc != 0) {
9813 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
9814 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
9815 	}
9816 }
9817 
9818 static void
9819 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
9820 {
9821 	if (!success) {
9822 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
9823 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
9824 		return;
9825 	}
9826 
9827 	bdev_copy_do_read(bdev_io);
9828 }
9829 
9830 int
9831 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
9832 		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
9833 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
9834 {
9835 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9836 	struct spdk_bdev_io *bdev_io;
9837 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
9838 
9839 	if (!desc->write) {
9840 		return -EBADF;
9841 	}
9842 
9843 	if (num_blocks == 0) {
9844 		SPDK_ERRLOG("Can't copy 0 blocks\n");
9845 		return -EINVAL;
9846 	}
9847 
9848 	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
9849 	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
9850 		SPDK_DEBUGLOG(bdev,
9851 			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
9852 			      dst_offset_blocks, src_offset_blocks, num_blocks);
9853 		return -EINVAL;
9854 	}
9855 
9856 	bdev_io = bdev_channel_get_io(channel);
9857 	if (!bdev_io) {
9858 		return -ENOMEM;
9859 	}
9860 
9861 	bdev_io->internal.ch = channel;
9862 	bdev_io->internal.desc = desc;
9863 	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
9864 
9865 	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
9866 	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
9867 	bdev_io->u.bdev.num_blocks = num_blocks;
9868 	bdev_io->u.bdev.memory_domain = NULL;
9869 	bdev_io->u.bdev.memory_domain_ctx = NULL;
9870 	bdev_io->u.bdev.iovs = NULL;
9871 	bdev_io->u.bdev.iovcnt = 0;
9872 	bdev_io->u.bdev.md_buf = NULL;
9873 	bdev_io->u.bdev.accel_sequence = NULL;
9874 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
9875 
9876 	if (dst_offset_blocks == src_offset_blocks) {
9877 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
9878 		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
9879 
9880 		return 0;
9881 	}
9882 
9883 
9884 	/* If the copy size is large and should be split, use the generic split logic
9885 	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
9886 	 *
9887 	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
9888 	 * emulate it using regular read and write requests otherwise.
9889 	 */
9890 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
9891 	    bdev_io->internal.split) {
9892 		bdev_io_submit(bdev_io);
9893 		return 0;
9894 	}
9895 
9896 	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));
9897 
9898 	return 0;
9899 }
9900 
9901 SPDK_LOG_REGISTER_COMPONENT(bdev)
9902 
9903 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
9904 {
9905 	struct spdk_trace_tpoint_opts opts[] = {
9906 		{
9907 			"BDEV_IO_START", TRACE_BDEV_IO_START,
9908 			OWNER_BDEV, OBJECT_BDEV_IO, 1,
9909 			{
9910 				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
9911 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
9912 				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
9913 				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
9914 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
9915 			}
9916 		},
9917 		{
9918 			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
9919 			OWNER_BDEV, OBJECT_BDEV_IO, 0,
9920 			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9921 		},
9922 		{
9923 			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
9924 			OWNER_BDEV, OBJECT_NONE, 1,
9925 			{
9926 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
9927 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
9928 			}
9929 		},
9930 		{
9931 			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
9932 			OWNER_BDEV, OBJECT_NONE, 0,
9933 			{
9934 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
9935 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
9936 			}
9937 		},
9938 	};
9939 
9940 
9941 	spdk_trace_register_owner(OWNER_BDEV, 'b');
9942 	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
9943 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
9944 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
9945 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
9946 }
9947