xref: /spdk/lib/bdev/bdev.c (revision 26cac6bf15cd3e90bf6221c5c53672ec2befca30)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2016 Intel Corporation. All rights reserved.
3  *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4  *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/bdev.h"
10 
11 #include "spdk/accel.h"
12 #include "spdk/config.h"
13 #include "spdk/env.h"
14 #include "spdk/thread.h"
15 #include "spdk/likely.h"
16 #include "spdk/queue.h"
17 #include "spdk/nvme_spec.h"
18 #include "spdk/scsi_spec.h"
19 #include "spdk/notify.h"
20 #include "spdk/util.h"
21 #include "spdk/trace.h"
22 #include "spdk/dma.h"
23 
24 #include "spdk/bdev_module.h"
25 #include "spdk/log.h"
26 #include "spdk/string.h"
27 
28 #include "bdev_internal.h"
29 #include "spdk_internal/trace_defs.h"
30 #include "spdk_internal/assert.h"
31 
32 #ifdef SPDK_CONFIG_VTUNE
33 #include "ittnotify.h"
34 #include "ittnotify_types.h"
35 int __itt_init_ittlib(const char *, __itt_group_id);
36 #endif
37 
38 #define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
39 #define SPDK_BDEV_IO_CACHE_SIZE			256
40 #define SPDK_BDEV_AUTO_EXAMINE			true
41 #define BUF_SMALL_POOL_SIZE			8191
42 #define BUF_LARGE_POOL_SIZE			1023
43 #define BUF_SMALL_CACHE_SIZE			128
44 #define BUF_LARGE_CACHE_SIZE			16
45 #define NOMEM_THRESHOLD_COUNT			8
46 
47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
54 
55 /* The maximum number of child requests for an UNMAP or WRITE ZEROES command
56  * that are submitted at a time when the parent request is split.
57  */
58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
60 
61 /* The maximum number of child requests for a COPY command
62  * that are submitted at a time when the parent request is split.
63  */
64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)
65 
66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
67 	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
68 #ifdef DEBUG
69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
70 	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
71 #else
72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
73 #endif
74 
75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
76 				const char *detail, struct spdk_bdev *bdev);
77 
78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "v23.09", 0);
79 
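/* Human-readable names of the QoS rate limits, keyed by rate limit type; the order
 * must match the corresponding SPDK_BDEV_QOS_*_RATE_LIMIT values.
 */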
80 static const char *qos_rpc_type[] = {"rw_ios_per_sec",
81 				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
82 				    };
83 
84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
85 
86 RB_HEAD(bdev_name_tree, spdk_bdev_name);
87 
88 static int
89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
90 {
91 	return strcmp(name1->name, name2->name);
92 }
93 
94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);
95 
96 struct spdk_bdev_mgr {
97 	struct spdk_mempool *bdev_io_pool;
98 
99 	void *zero_buffer;
100 
101 	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
102 
103 	struct spdk_bdev_list bdevs;
104 	struct bdev_name_tree bdev_names;
105 
106 	bool init_complete;
107 	bool module_init_complete;
108 
109 	struct spdk_spinlock spinlock;
110 
111 	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;
112 
113 #ifdef SPDK_CONFIG_VTUNE
114 	__itt_domain	*domain;
115 #endif
116 };
117 
118 static struct spdk_bdev_mgr g_bdev_mgr = {
119 	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
120 	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
121 	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
122 	.init_complete = false,
123 	.module_init_complete = false,
124 	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
125 };
126 
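/* Runs at load time, before main(), so the global bdev manager spinlock is always
 * initialized before any bdev API can be called.
 */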
127 static void
128 __attribute__((constructor))
129 _bdev_init(void)
130 {
131 	spdk_spin_init(&g_bdev_mgr.spinlock);
132 }
133 
134 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);
135 
136 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);
137 
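/*
 * Describes a locked range of LBAs on a bdev.  While a range is held, writes that
 * overlap it are queued on the owning channel's io_locked list until the range is
 * unlocked (see bdev_lock_lba_range()/bdev_unlock_lba_range() below).
 */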
138 struct lba_range {
139 	struct spdk_bdev		*bdev;
140 	uint64_t			offset;
141 	uint64_t			length;
142 	void				*locked_ctx;
143 	struct spdk_thread		*owner_thread;
144 	struct spdk_bdev_channel	*owner_ch;
145 	TAILQ_ENTRY(lba_range)		tailq;
146 	TAILQ_ENTRY(lba_range)		tailq_module;
147 };
148 
149 static struct spdk_bdev_opts	g_bdev_opts = {
150 	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
151 	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
152 	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
153 };
154 
155 static spdk_bdev_init_cb	g_init_cb_fn = NULL;
156 static void			*g_init_cb_arg = NULL;
157 
158 static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
159 static void			*g_fini_cb_arg = NULL;
160 static struct spdk_thread	*g_fini_thread = NULL;
161 
162 struct spdk_bdev_qos_limit {
163 	/** IOs or bytes allowed per second (i.e., 1s). */
164 	uint64_t limit;
165 
166 	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
167 	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
168 	 *  some bytes are remaining, but the I/O is bigger than that amount. The
169 	 *  excess will be deducted from the next timeslice.
170 	 */
171 	int64_t remaining_this_timeslice;
172 
173 	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
174 	uint32_t min_per_timeslice;
175 
176 	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
177 	uint32_t max_per_timeslice;
178 
179 	/** Function to check whether to queue the IO. */
180 	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
181 
182 	/** Function to update for the submitted IO. */
183 	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
184 };
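
/*
 * Worked example for remaining_this_timeslice (illustrative numbers): with a byte
 * limit of 100 MiB/s and a 1 ms timeslice, max_per_timeslice is roughly 104858 bytes.
 * If only 4096 bytes remain in the current timeslice and a 64 KiB write is submitted,
 * the I/O is still issued and remaining_this_timeslice drops to 4096 - 65536 = -61440;
 * that deficit is then paid back out of the allowance of the following timeslice(s).
 */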
185 
186 struct spdk_bdev_qos {
187 	/** One rate limit per supported rate limit type. */
188 	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
189 
190 	/** The channel that all I/O are funneled through. */
191 	struct spdk_bdev_channel *ch;
192 
193 	/** The thread on which the poller is running. */
194 	struct spdk_thread *thread;
195 
196 	/** Queue of I/O waiting to be issued. */
197 	bdev_io_tailq_t queued;
198 
199 	/** Size of a timeslice in tsc ticks. */
200 	uint64_t timeslice_size;
201 
202 	/** Timestamp of start of last timeslice. */
203 	uint64_t last_timeslice;
204 
205 	/** Poller that processes queued I/O commands each time slice. */
206 	struct spdk_poller *poller;
207 };
208 
209 struct spdk_bdev_mgmt_channel {
210 	/*
211 	 * Each thread keeps a cache of bdev_io - this allows
212 	 *  bdev threads which are *not* DPDK threads to still
213 	 *  benefit from a per-thread bdev_io cache.  Without
214 	 *  this, non-DPDK threads fetching from the mempool
215 	 *  incur a cmpxchg on get and put.
216 	 */
217 	bdev_io_stailq_t per_thread_cache;
218 	uint32_t	per_thread_cache_count;
219 	uint32_t	bdev_io_cache_size;
220 
221 	struct spdk_iobuf_channel iobuf;
222 
223 	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
224 	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
225 };
226 
227 /*
228  * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
229  * queue their I/O awaiting retry here, which makes it possible to retry sending
230  * I/O to one bdev after I/O from another bdev completes.
231  */
232 struct spdk_bdev_shared_resource {
233 	/* The bdev management channel */
234 	struct spdk_bdev_mgmt_channel *mgmt_ch;
235 
236 	/*
237 	 * Count of I/O submitted to bdev module and waiting for completion.
238 	 * Incremented before submit_request() is called on an spdk_bdev_io.
239 	 */
240 	uint64_t		io_outstanding;
241 
242 	/*
243 	 * Queue of IO awaiting retry because of a previous NOMEM status returned
244 	 *  on this channel.
245 	 */
246 	bdev_io_tailq_t		nomem_io;
247 
248 	/*
249 	 * Threshold which io_outstanding must drop to before retrying nomem_io.
250 	 */
251 	uint64_t		nomem_threshold;
252 
253 	/* I/O channel allocated by a bdev module */
254 	struct spdk_io_channel	*shared_ch;
255 
256 	/* Refcount of bdev channels using this resource */
257 	uint32_t		ref;
258 
259 	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
260 };
261 
262 #define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
263 #define BDEV_CH_QOS_ENABLED		(1 << 1)
264 
265 struct spdk_bdev_channel {
266 	struct spdk_bdev	*bdev;
267 
268 	/* The channel for the underlying device */
269 	struct spdk_io_channel	*channel;
270 
271 	/* Accel channel */
272 	struct spdk_io_channel	*accel_channel;
273 
274 	/* Per io_device per thread data */
275 	struct spdk_bdev_shared_resource *shared_resource;
276 
277 	struct spdk_bdev_io_stat *stat;
278 
279 	/*
280 	 * Count of I/O submitted to the underlying dev module through this channel
281 	 * and waiting for completion.
282 	 */
283 	uint64_t		io_outstanding;
284 
285 	/*
286 	 * List of all submitted I/Os including I/O that are generated via splitting.
287 	 */
288 	bdev_io_tailq_t		io_submitted;
289 
290 	/*
291 	 * List of spdk_bdev_io that are currently queued because they write to a locked
292 	 * LBA range.
293 	 */
294 	bdev_io_tailq_t		io_locked;
295 
296 	/* List of I/Os with accel sequence being currently executed */
297 	bdev_io_tailq_t		io_accel_exec;
298 
299 	/* List of I/Os doing memory domain pull/push */
300 	bdev_io_tailq_t		io_memory_domain;
301 
302 	uint32_t		flags;
303 
304 	struct spdk_histogram_data *histogram;
305 
306 #ifdef SPDK_CONFIG_VTUNE
307 	uint64_t		start_tsc;
308 	uint64_t		interval_tsc;
309 	__itt_string_handle	*handle;
310 	struct spdk_bdev_io_stat *prev_stat;
311 #endif
312 
313 	bdev_io_tailq_t		queued_resets;
314 
315 	lba_range_tailq_t	locked_ranges;
316 };
317 
318 struct media_event_entry {
319 	struct spdk_bdev_media_event	event;
320 	TAILQ_ENTRY(media_event_entry)	tailq;
321 };
322 
323 #define MEDIA_EVENT_POOL_SIZE 64
324 
325 struct spdk_bdev_desc {
326 	struct spdk_bdev		*bdev;
327 	struct spdk_thread		*thread;
328 	struct {
329 		spdk_bdev_event_cb_t event_fn;
330 		void *ctx;
331 	}				callback;
332 	bool				closed;
333 	bool				write;
334 	bool				memory_domains_supported;
335 	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
336 	struct spdk_spinlock		spinlock;
337 	uint32_t			refs;
338 	TAILQ_HEAD(, media_event_entry)	pending_media_events;
339 	TAILQ_HEAD(, media_event_entry)	free_media_events;
340 	struct media_event_entry	*media_events_buffer;
341 	TAILQ_ENTRY(spdk_bdev_desc)	link;
342 
343 	uint64_t		timeout_in_sec;
344 	spdk_bdev_io_timeout_cb	cb_fn;
345 	void			*cb_arg;
346 	struct spdk_poller	*io_timeout_poller;
347 	struct spdk_bdev_module_claim	*claim;
348 };
349 
350 struct spdk_bdev_iostat_ctx {
351 	struct spdk_bdev_io_stat *stat;
352 	spdk_bdev_get_device_stat_cb cb;
353 	void *cb_arg;
354 };
355 
356 struct set_qos_limit_ctx {
357 	void (*cb_fn)(void *cb_arg, int status);
358 	void *cb_arg;
359 	struct spdk_bdev *bdev;
360 };
361 
362 struct spdk_bdev_channel_iter {
363 	spdk_bdev_for_each_channel_msg fn;
364 	spdk_bdev_for_each_channel_done cpl;
365 	struct spdk_io_channel_iter *i;
366 	void *ctx;
367 };
368 
369 struct spdk_bdev_io_error_stat {
370 	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
371 };
372 
373 enum bdev_io_retry_state {
374 	BDEV_IO_RETRY_STATE_INVALID,
375 	BDEV_IO_RETRY_STATE_PULL,
376 	BDEV_IO_RETRY_STATE_PULL_MD,
377 	BDEV_IO_RETRY_STATE_SUBMIT,
378 	BDEV_IO_RETRY_STATE_PUSH,
379 	BDEV_IO_RETRY_STATE_PUSH_MD,
380 };
381 
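/*
 * The bdev layer registers (bdev + 1), rather than the bdev pointer itself, as its
 * io_device handle, which guarantees that this registration can never collide with an
 * io_device keyed on the bdev pointer.  These macros convert between the two forms.
 */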
382 #define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
383 #define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
384 #define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
385 #define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
386 
387 static inline void bdev_io_complete(void *ctx);
388 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
389 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
390 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
391 
392 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
393 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
394 
395 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
396 				struct spdk_io_channel *ch, void *_ctx);
397 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);
398 
399 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
400 				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
401 				     uint64_t num_blocks,
402 				     struct spdk_memory_domain *domain, void *domain_ctx,
403 				     struct spdk_accel_sequence *seq,
404 				     spdk_bdev_io_completion_cb cb, void *cb_arg);
405 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
406 				      struct iovec *iov, int iovcnt, void *md_buf,
407 				      uint64_t offset_blocks, uint64_t num_blocks,
408 				      struct spdk_memory_domain *domain, void *domain_ctx,
409 				      struct spdk_accel_sequence *seq,
410 				      spdk_bdev_io_completion_cb cb, void *cb_arg);
411 
412 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
413 			       uint64_t offset, uint64_t length,
414 			       lock_range_cb cb_fn, void *cb_arg);
415 
416 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
417 				 uint64_t offset, uint64_t length,
418 				 lock_range_cb cb_fn, void *cb_arg);
419 
420 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
421 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);
422 
423 static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
424 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
425 static void claim_reset(struct spdk_bdev *bdev);
426 
427 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);
428 
429 #define bdev_get_ext_io_opt(opts, field, defval) \
430 	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
431 	 sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))
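
/*
 * bdev_get_ext_io_opt() reads a field from a caller-supplied spdk_bdev_ext_io_opts
 * only if the caller's opts->size shows that its copy of the structure is large enough
 * to contain that field; otherwise it returns defval.  Illustrative sketch below
 * (the helper name is hypothetical, not part of the upstream file):
 */
__attribute__((unused)) static struct spdk_memory_domain *
example_ext_io_opts_domain(const struct spdk_bdev_ext_io_opts *opts)
{
	/* Falls back to NULL for callers built against a struct layout without memory_domain. */
	return bdev_get_ext_io_opt(opts, memory_domain, NULL);
}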
432 
433 void
434 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
435 {
436 	if (!opts) {
437 		SPDK_ERRLOG("opts should not be NULL\n");
438 		return;
439 	}
440 
441 	if (!opts_size) {
442 		SPDK_ERRLOG("opts_size should not be zero\n");
443 		return;
444 	}
445 
446 	opts->opts_size = opts_size;
447 
448 #define SET_FIELD(field) \
449 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
450 		opts->field = g_bdev_opts.field; \
451 	} \
452 
453 	SET_FIELD(bdev_io_pool_size);
454 	SET_FIELD(bdev_io_cache_size);
455 	SET_FIELD(bdev_auto_examine);
456 
457 	/* Do not remove this statement. Always update it when adding a new field,
458 	 * and do not forget to add a SET_FIELD statement for the added field. */
459 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");
460 
461 #undef SET_FIELD
462 }
463 
464 int
465 spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
466 {
467 	uint32_t min_pool_size;
468 
469 	if (!opts) {
470 		SPDK_ERRLOG("opts cannot be NULL\n");
471 		return -1;
472 	}
473 
474 	if (!opts->opts_size) {
475 		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
476 		return -1;
477 	}
478 
479 	/*
480 	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
481 	 *  initialization.  A second mgmt_ch will be created on the same thread when the application starts
482 	 *  but before the deferred put_io_channel event is executed for the first mgmt_ch.
483 	 */
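	/* For example, with the default bdev_io_cache_size of 256 and 4 SPDK threads,
	 * bdev_io_pool_size must be at least 256 * (4 + 1) = 1280.
	 */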
484 	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
485 	if (opts->bdev_io_pool_size < min_pool_size) {
486 		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
487 			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
488 			    spdk_thread_get_count());
489 		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
490 		return -1;
491 	}
492 
493 #define SET_FIELD(field) \
494 	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
495 		g_bdev_opts.field = opts->field; \
496 	} \
497 
498 	SET_FIELD(bdev_io_pool_size);
499 	SET_FIELD(bdev_io_cache_size);
500 	SET_FIELD(bdev_auto_examine);
501 
502 	g_bdev_opts.opts_size = opts->opts_size;
503 
504 #undef SET_FIELD
505 
506 	return 0;
507 }
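
/*
 * Illustrative sketch (hypothetical helper, not part of the upstream file): the
 * intended calling pattern for the getter/setter pair above.  Fetching the current
 * options first fills in opts_size, so fields unknown to an older caller keep their
 * existing values when the options are written back.
 */
__attribute__((unused)) static int
example_double_bdev_io_pool_size(void)
{
	struct spdk_bdev_opts opts = {};

	/* Copies only the fields that fit within sizeof(opts) and records that size. */
	spdk_bdev_get_opts(&opts, sizeof(opts));

	opts.bdev_io_pool_size *= 2;

	/* Validates the pool/cache sizing and applies the fields covered by opts_size. */
	return spdk_bdev_set_opts(&opts);
}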
508 
509 static struct spdk_bdev *
510 bdev_get_by_name(const char *bdev_name)
511 {
512 	struct spdk_bdev_name find;
513 	struct spdk_bdev_name *res;
514 
515 	find.name = (char *)bdev_name;
516 	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
517 	if (res != NULL) {
518 		return res->bdev;
519 	}
520 
521 	return NULL;
522 }
523 
524 struct spdk_bdev *
525 spdk_bdev_get_by_name(const char *bdev_name)
526 {
527 	struct spdk_bdev *bdev;
528 
529 	spdk_spin_lock(&g_bdev_mgr.spinlock);
530 	bdev = bdev_get_by_name(bdev_name);
531 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
532 
533 	return bdev;
534 }
535 
536 struct bdev_io_status_string {
537 	enum spdk_bdev_io_status status;
538 	const char *str;
539 };
540 
541 static const struct bdev_io_status_string bdev_io_status_strings[] = {
542 	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
543 	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
544 	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
545 	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
546 	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
547 	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
548 	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
549 	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
550 	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
551 	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
552 };
553 
554 static const char *
555 bdev_io_status_get_string(enum spdk_bdev_io_status status)
556 {
557 	uint32_t i;
558 
559 	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
560 		if (bdev_io_status_strings[i].status == status) {
561 			return bdev_io_status_strings[i].str;
562 		}
563 	}
564 
565 	return "reserved";
566 }
567 
568 struct spdk_bdev_wait_for_examine_ctx {
569 	struct spdk_poller              *poller;
570 	spdk_bdev_wait_for_examine_cb	cb_fn;
571 	void				*cb_arg;
572 };
573 
574 static bool bdev_module_all_actions_completed(void);
575 
576 static int
577 bdev_wait_for_examine_cb(void *arg)
578 {
579 	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;
580 
581 	if (!bdev_module_all_actions_completed()) {
582 		return SPDK_POLLER_IDLE;
583 	}
584 
585 	spdk_poller_unregister(&ctx->poller);
586 	ctx->cb_fn(ctx->cb_arg);
587 	free(ctx);
588 
589 	return SPDK_POLLER_BUSY;
590 }
591 
592 int
593 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
594 {
595 	struct spdk_bdev_wait_for_examine_ctx *ctx;
596 
597 	ctx = calloc(1, sizeof(*ctx));
598 	if (ctx == NULL) {
599 		return -ENOMEM;
600 	}
601 	ctx->cb_fn = cb_fn;
602 	ctx->cb_arg = cb_arg;
603 	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);
604 
605 	return 0;
606 }
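
/*
 * Illustrative sketch (hypothetical helpers, not part of the upstream file): typical
 * usage of spdk_bdev_wait_for_examine().  The callback runs on the calling thread once
 * every outstanding examine action has completed.
 */
__attribute__((unused)) static void
example_examine_settled(void *cb_arg)
{
	SPDK_NOTICELOG("bdev examine has settled\n");
}

__attribute__((unused)) static int
example_wait_for_examine(void)
{
	return spdk_bdev_wait_for_examine(example_examine_settled, NULL);
}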
607 
608 struct spdk_bdev_examine_item {
609 	char *name;
610 	TAILQ_ENTRY(spdk_bdev_examine_item) link;
611 };
612 
613 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
614 
615 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
616 			g_bdev_examine_allowlist);
617 
618 static inline bool
619 bdev_examine_allowlist_check(const char *name)
620 {
621 	struct spdk_bdev_examine_item *item;
622 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
623 		if (strcmp(name, item->name) == 0) {
624 			return true;
625 		}
626 	}
627 	return false;
628 }
629 
630 static inline void
631 bdev_examine_allowlist_free(void)
632 {
633 	struct spdk_bdev_examine_item *item;
634 	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
635 		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
636 		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
637 		free(item->name);
638 		free(item);
639 	}
640 }
641 
642 static inline bool
643 bdev_in_examine_allowlist(struct spdk_bdev *bdev)
644 {
645 	struct spdk_bdev_alias *tmp;
646 	if (bdev_examine_allowlist_check(bdev->name)) {
647 		return true;
648 	}
649 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
650 		if (bdev_examine_allowlist_check(tmp->alias.name)) {
651 			return true;
652 		}
653 	}
654 	return false;
655 }
656 
657 static inline bool
658 bdev_ok_to_examine(struct spdk_bdev *bdev)
659 {
660 	if (g_bdev_opts.bdev_auto_examine) {
661 		return true;
662 	} else {
663 		return bdev_in_examine_allowlist(bdev);
664 	}
665 }
666 
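/*
 * Offer a bdev to the registered bdev modules for examination.  Each module's
 * examine_config callback is invoked first; examine_disk is then invoked only for the
 * modules permitted by the bdev's current claim state.
 */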
667 static void
668 bdev_examine(struct spdk_bdev *bdev)
669 {
670 	struct spdk_bdev_module *module;
671 	struct spdk_bdev_module_claim *claim, *tmpclaim;
672 	uint32_t action;
673 
674 	if (!bdev_ok_to_examine(bdev)) {
675 		return;
676 	}
677 
678 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
679 		if (module->examine_config) {
680 			spdk_spin_lock(&module->internal.spinlock);
681 			action = module->internal.action_in_progress;
682 			module->internal.action_in_progress++;
683 			spdk_spin_unlock(&module->internal.spinlock);
684 			module->examine_config(bdev);
685 			if (action != module->internal.action_in_progress) {
686 				SPDK_ERRLOG("examine_config for module %s did not call "
687 					    "spdk_bdev_module_examine_done()\n", module->name);
688 			}
689 		}
690 	}
691 
692 	spdk_spin_lock(&bdev->internal.spinlock);
693 
694 	switch (bdev->internal.claim_type) {
695 	case SPDK_BDEV_CLAIM_NONE:
696 		/* Examine by all bdev modules */
697 		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
698 			if (module->examine_disk) {
699 				spdk_spin_lock(&module->internal.spinlock);
700 				module->internal.action_in_progress++;
701 				spdk_spin_unlock(&module->internal.spinlock);
702 				spdk_spin_unlock(&bdev->internal.spinlock);
703 				module->examine_disk(bdev);
704 				spdk_spin_lock(&bdev->internal.spinlock);
705 			}
706 		}
707 		break;
708 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
709 		/* Examine by the one bdev module with a v1 claim */
710 		module = bdev->internal.claim.v1.module;
711 		if (module->examine_disk) {
712 			spdk_spin_lock(&module->internal.spinlock);
713 			module->internal.action_in_progress++;
714 			spdk_spin_unlock(&module->internal.spinlock);
715 			spdk_spin_unlock(&bdev->internal.spinlock);
716 			module->examine_disk(bdev);
717 			return;
718 		}
719 		break;
720 	default:
721 		/* Examine by all bdev modules with a v2 claim */
722 		assert(claim_type_is_v2(bdev->internal.claim_type));
723 		/*
724 		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
725 		 * list, perhaps accessing freed memory. Without protection, this could happen
726 		 * while the lock is dropped during the examine callback.
727 		 */
728 		bdev->internal.examine_in_progress++;
729 
730 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
731 			module = claim->module;
732 
733 			if (module == NULL) {
734 				/* This is a vestigial claim, held by examine_count */
735 				continue;
736 			}
737 
738 			if (module->examine_disk == NULL) {
739 				continue;
740 			}
741 
742 			spdk_spin_lock(&module->internal.spinlock);
743 			module->internal.action_in_progress++;
744 			spdk_spin_unlock(&module->internal.spinlock);
745 
746 			/* Call examine_disk without holding internal.spinlock. */
747 			spdk_spin_unlock(&bdev->internal.spinlock);
748 			module->examine_disk(bdev);
749 			spdk_spin_lock(&bdev->internal.spinlock);
750 		}
751 
752 		assert(bdev->internal.examine_in_progress > 0);
753 		bdev->internal.examine_in_progress--;
754 		if (bdev->internal.examine_in_progress == 0) {
755 			/* Remove any claims that were released during examine_disk */
756 			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
757 				if (claim->desc != NULL) {
758 					continue;
759 				}
760 
761 				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
762 				free(claim);
763 			}
764 			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
765 				claim_reset(bdev);
766 			}
767 		}
768 	}
769 
770 	spdk_spin_unlock(&bdev->internal.spinlock);
771 }
772 
773 int
774 spdk_bdev_examine(const char *name)
775 {
776 	struct spdk_bdev *bdev;
777 	struct spdk_bdev_examine_item *item;
778 	struct spdk_thread *thread = spdk_get_thread();
779 
780 	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
781 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
782 			    thread ? spdk_thread_get_name(thread) : "null");
783 		return -EINVAL;
784 	}
785 
786 	if (g_bdev_opts.bdev_auto_examine) {
787 		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
788 		return -EINVAL;
789 	}
790 
791 	if (bdev_examine_allowlist_check(name)) {
792 		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
793 		return -EEXIST;
794 	}
795 
796 	item = calloc(1, sizeof(*item));
797 	if (!item) {
798 		return -ENOMEM;
799 	}
800 	item->name = strdup(name);
801 	if (!item->name) {
802 		free(item);
803 		return -ENOMEM;
804 	}
805 	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);
806 
807 	bdev = spdk_bdev_get_by_name(name);
808 	if (bdev) {
809 		bdev_examine(bdev);
810 	}
811 	return 0;
812 }
813 
814 static inline void
815 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
816 {
817 	struct spdk_bdev_examine_item *item;
818 	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
819 		spdk_json_write_object_begin(w);
820 		spdk_json_write_named_string(w, "method", "bdev_examine");
821 		spdk_json_write_named_object_begin(w, "params");
822 		spdk_json_write_named_string(w, "name", item->name);
823 		spdk_json_write_object_end(w);
824 		spdk_json_write_object_end(w);
825 	}
826 }
827 
828 struct spdk_bdev *
829 spdk_bdev_first(void)
830 {
831 	struct spdk_bdev *bdev;
832 
833 	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
834 	if (bdev) {
835 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
836 	}
837 
838 	return bdev;
839 }
840 
841 struct spdk_bdev *
842 spdk_bdev_next(struct spdk_bdev *prev)
843 {
844 	struct spdk_bdev *bdev;
845 
846 	bdev = TAILQ_NEXT(prev, internal.link);
847 	if (bdev) {
848 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
849 	}
850 
851 	return bdev;
852 }
853 
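/*
 * A "leaf" bdev is one with no claim on it, typically meaning that no virtual bdev has
 * been built on top of it.  Return the first such bdev at or after the given position.
 */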
854 static struct spdk_bdev *
855 _bdev_next_leaf(struct spdk_bdev *bdev)
856 {
857 	while (bdev != NULL) {
858 		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
859 			return bdev;
860 		} else {
861 			bdev = TAILQ_NEXT(bdev, internal.link);
862 		}
863 	}
864 
865 	return bdev;
866 }
867 
868 struct spdk_bdev *
869 spdk_bdev_first_leaf(void)
870 {
871 	struct spdk_bdev *bdev;
872 
873 	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
874 
875 	if (bdev) {
876 		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
877 	}
878 
879 	return bdev;
880 }
881 
882 struct spdk_bdev *
883 spdk_bdev_next_leaf(struct spdk_bdev *prev)
884 {
885 	struct spdk_bdev *bdev;
886 
887 	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
888 
889 	if (bdev) {
890 		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
891 	}
892 
893 	return bdev;
894 }
895 
896 static inline bool
897 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
898 {
899 	return bdev_io->internal.memory_domain;
900 }
901 
902 static inline bool
903 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
904 {
905 	return bdev_io->internal.has_accel_sequence;
906 }
907 
908 static inline void
909 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
910 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
911 {
912 	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
913 	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
914 	 * channels we will instead wait for half to complete.
915 	 */
916 	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
917 					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
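	/* For example, 128 outstanding I/O yield a threshold of 120 (wait for 8 completions),
	 * while 8 outstanding I/O yield a threshold of 4 (wait for half of them).
	 */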
918 
919 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
920 	bdev_io->internal.retry_state = state;
921 	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
922 }
923 
924 static inline void
925 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
926 			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
927 {
928 	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
929 	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
930 	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));
931 
932 	assert(state != BDEV_IO_RETRY_STATE_INVALID);
933 	bdev_io->internal.retry_state = state;
934 	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
935 }
936 
937 void
938 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
939 {
940 	struct iovec *iovs;
941 
942 	if (bdev_io->u.bdev.iovs == NULL) {
943 		bdev_io->u.bdev.iovs = &bdev_io->iov;
944 		bdev_io->u.bdev.iovcnt = 1;
945 	}
946 
947 	iovs = bdev_io->u.bdev.iovs;
948 
949 	assert(iovs != NULL);
950 	assert(bdev_io->u.bdev.iovcnt >= 1);
951 
952 	iovs[0].iov_base = buf;
953 	iovs[0].iov_len = len;
954 }
955 
956 void
957 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
958 {
959 	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
960 	bdev_io->u.bdev.md_buf = md_buf;
961 }
962 
963 static bool
964 _is_buf_allocated(const struct iovec *iovs)
965 {
966 	if (iovs == NULL) {
967 		return false;
968 	}
969 
970 	return iovs[0].iov_base != NULL;
971 }
972 
973 static bool
974 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
975 {
976 	int i;
977 	uintptr_t iov_base;
978 
979 	if (spdk_likely(alignment == 1)) {
980 		return true;
981 	}
982 
983 	for (i = 0; i < iovcnt; i++) {
984 		iov_base = (uintptr_t)iovs[i].iov_base;
985 		if ((iov_base & (alignment - 1)) != 0) {
986 			return false;
987 		}
988 	}
989 
990 	return true;
991 }
992 
993 static inline bool
994 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
995 {
996 	if (!bdev_io->internal.accel_sequence) {
997 		return false;
998 	}
999 
1000 	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
1001 	 * the bdev module didn't support accel sequences */
1002 	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
1003 }
1004 
1005 static inline void
1006 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
1007 			      struct spdk_bdev_shared_resource *shared_resource)
1008 {
1009 	bdev_ch->io_outstanding++;
1010 	shared_resource->io_outstanding++;
1011 }
1012 
1013 static inline void
1014 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
1015 			      struct spdk_bdev_shared_resource *shared_resource)
1016 {
1017 	assert(bdev_ch->io_outstanding > 0);
1018 	assert(shared_resource->io_outstanding > 0);
1019 	bdev_ch->io_outstanding--;
1020 	shared_resource->io_outstanding--;
1021 }
1022 
1023 static void
1024 bdev_io_submit_sequence_cb(void *ctx, int status)
1025 {
1026 	struct spdk_bdev_io *bdev_io = ctx;
1027 
1028 	bdev_io->u.bdev.accel_sequence = NULL;
1029 	bdev_io->internal.accel_sequence = NULL;
1030 
1031 	if (spdk_unlikely(status != 0)) {
1032 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
1033 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1034 		bdev_io_complete_unsubmitted(bdev_io);
1035 		return;
1036 	}
1037 
1038 	bdev_io_submit(bdev_io);
1039 }
1040 
1041 static void
1042 bdev_io_exec_sequence_cb(void *ctx, int status)
1043 {
1044 	struct spdk_bdev_io *bdev_io = ctx;
1045 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1046 
1047 	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1048 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1049 
1050 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1051 		bdev_ch_retry_io(ch);
1052 	}
1053 
1054 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1055 }
1056 
1057 static void
1058 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
1059 {
1060 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1061 
1062 	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1063 	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1064 
1065 	/* Since the operations are appended during submission, for reads they end up in the
1066 	 * opposite order from the one we want to execute them in (i.e. the most recently added
1067 	 * operation needs to execute first), so reverse the sequence before executing it.
1068 	 */
1069 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1070 		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
1071 	}
1072 
1073 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
1074 	bdev_io_increment_outstanding(ch, ch->shared_resource);
1075 	bdev_io->internal.data_transfer_cpl = cb_fn;
1076 
1077 	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
1078 				   bdev_io_exec_sequence_cb, bdev_io);
1079 }
1080 
1081 static void
1082 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
1083 {
1084 	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
1085 	void *buf;
1086 
1087 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1088 		buf = bdev_io->internal.buf;
1089 		bdev_io->internal.buf = NULL;
1090 		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
1091 		bdev_io->internal.get_aux_buf_cb = NULL;
1092 	} else {
1093 		assert(bdev_io->internal.get_buf_cb != NULL);
1094 		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
1095 		bdev_io->internal.get_buf_cb = NULL;
1096 	}
1097 }
1098 
1099 static void
1100 _bdev_io_pull_buffer_cpl(void *ctx, int rc)
1101 {
1102 	struct spdk_bdev_io *bdev_io = ctx;
1103 
1104 	if (rc) {
1105 		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
1106 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1107 	}
1108 	bdev_io_get_buf_complete(bdev_io, !rc);
1109 }
1110 
1111 static void
1112 bdev_io_pull_md_buf_done(void *ctx, int status)
1113 {
1114 	struct spdk_bdev_io *bdev_io = ctx;
1115 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1116 
1117 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1118 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1119 
1120 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1121 		bdev_ch_retry_io(ch);
1122 	}
1123 
1124 	assert(bdev_io->internal.data_transfer_cpl);
1125 	bdev_io->internal.data_transfer_cpl(bdev_io, status);
1126 }
1127 
1128 static void
1129 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
1130 {
1131 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1132 	int rc = 0;
1133 
1134 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1135 		if (bdev_io_use_memory_domain(bdev_io)) {
1136 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1137 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1138 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1139 							  bdev_io->internal.memory_domain_ctx,
1140 							  &bdev_io->internal.orig_md_iov, 1,
1141 							  &bdev_io->internal.bounce_md_iov, 1,
1142 							  bdev_io_pull_md_buf_done, bdev_io);
1143 			if (rc == 0) {
1144 				/* Continue to submit IO in completion callback */
1145 				return;
1146 			}
1147 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1148 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1149 			if (rc != -ENOMEM) {
1150 				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
1151 					    spdk_memory_domain_get_dma_device_id(
1152 						    bdev_io->internal.memory_domain), rc);
1153 			}
1154 		} else {
1155 			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
1156 			       bdev_io->internal.orig_md_iov.iov_base,
1157 			       bdev_io->internal.orig_md_iov.iov_len);
1158 		}
1159 	}
1160 
1161 	if (spdk_unlikely(rc == -ENOMEM)) {
1162 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
1163 	} else {
1164 		assert(bdev_io->internal.data_transfer_cpl);
1165 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1166 	}
1167 }
1168 
1169 static void
1170 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
1171 {
1172 	/* save original md_buf */
1173 	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
1174 	bdev_io->internal.orig_md_iov.iov_len = len;
1175 	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
1176 	bdev_io->internal.bounce_md_iov.iov_len = len;
1177 	/* set bounce md_buf */
1178 	bdev_io->u.bdev.md_buf = md_buf;
1179 
1180 	bdev_io_pull_md_buf(bdev_io);
1181 }
1182 
1183 static void
1184 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
1185 {
1186 	struct spdk_bdev *bdev = bdev_io->bdev;
1187 	uint64_t md_len;
1188 	void *buf;
1189 
1190 	if (spdk_bdev_is_md_separate(bdev)) {
1191 		assert(!bdev_io_use_accel_sequence(bdev_io));
1192 
1193 		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
1194 		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
1195 
1196 		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);
1197 
1198 		if (bdev_io->u.bdev.md_buf != NULL) {
1199 			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
1200 			return;
1201 		} else {
1202 			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
1203 		}
1204 	}
1205 
1206 	bdev_io_get_buf_complete(bdev_io, true);
1207 }
1208 
1209 static inline void
1210 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
1211 {
1212 	if (rc) {
1213 		SPDK_ERRLOG("Failed to get data buffer\n");
1214 		assert(bdev_io->internal.data_transfer_cpl);
1215 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1216 		return;
1217 	}
1218 
1219 	_bdev_io_set_md_buf(bdev_io);
1220 }
1221 
1222 static void
1223 bdev_io_pull_data_done_and_track(void *ctx, int status)
1224 {
1225 	struct spdk_bdev_io *bdev_io = ctx;
1226 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1227 
1228 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1229 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1230 
1231 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1232 		bdev_ch_retry_io(ch);
1233 	}
1234 
1235 	bdev_io_pull_data_done(bdev_io, status);
1236 }
1237 
1238 static void
1239 bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
1240 {
1241 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1242 	int rc = 0;
1243 
1244 	/* If we need to execute an accel sequence, or the IO uses a memory domain buffer and has a
1245 	 * sequence, append a copy operation so that accel can change the src/dst buffers of the
1246 	 * previous operation */
1247 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
1248 	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
1249 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1250 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1251 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1252 						    NULL, NULL,
1253 						    bdev_io->internal.orig_iovs,
1254 						    bdev_io->internal.orig_iovcnt,
1255 						    bdev_io->internal.memory_domain,
1256 						    bdev_io->internal.memory_domain_ctx,
1257 						    0, NULL, NULL);
1258 		} else {
1259 			/* We need to reverse the src/dst for reads */
1260 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1261 			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
1262 						    bdev_io->internal.orig_iovs,
1263 						    bdev_io->internal.orig_iovcnt,
1264 						    bdev_io->internal.memory_domain,
1265 						    bdev_io->internal.memory_domain_ctx,
1266 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
1267 						    NULL, NULL, 0, NULL, NULL);
1268 		}
1269 
1270 		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
1271 			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
1272 				    bdev_io->internal.accel_sequence);
1273 		}
1274 	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1275 		/* If this is the write path, copy data from the original buffer to the bounce buffer */
1276 		if (bdev_io_use_memory_domain(bdev_io)) {
1277 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1278 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1279 			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
1280 							  bdev_io->internal.memory_domain_ctx,
1281 							  bdev_io->internal.orig_iovs,
1282 							  (uint32_t) bdev_io->internal.orig_iovcnt,
1283 							  bdev_io->u.bdev.iovs, 1,
1284 							  bdev_io_pull_data_done_and_track,
1285 							  bdev_io);
1286 			if (rc == 0) {
1287 				/* Continue to submit IO in completion callback */
1288 				return;
1289 			}
1290 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1291 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1292 			if (rc != -ENOMEM) {
1293 				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
1294 					    spdk_memory_domain_get_dma_device_id(
1295 						    bdev_io->internal.memory_domain));
1296 			}
1297 		} else {
1298 			assert(bdev_io->u.bdev.iovcnt == 1);
1299 			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
1300 					      bdev_io->u.bdev.iovs[0].iov_len,
1301 					      bdev_io->internal.orig_iovs,
1302 					      bdev_io->internal.orig_iovcnt);
1303 		}
1304 	}
1305 
1306 	if (spdk_unlikely(rc == -ENOMEM)) {
1307 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1308 	} else {
1309 		bdev_io_pull_data_done(bdev_io, rc);
1310 	}
1311 }
1312 
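/*
 * Bounce-buffer flow: on submission the original iovecs (and metadata buffer, if
 * separate) are saved and replaced with an internal bounce buffer; for writes the
 * payload is "pulled" into the bounce buffer first.  On completion of a read the
 * payload is "pushed" back from the bounce buffer to the original buffers before the
 * user's completion callback runs.
 */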
1313 static void
1314 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
1315 			      bdev_copy_bounce_buffer_cpl cpl_cb)
1316 {
1317 	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;
1318 
1319 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1320 	/* save original iovec */
1321 	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
1322 	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
1323 	/* set bounce iov */
1324 	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
1325 	bdev_io->u.bdev.iovcnt = 1;
1326 	/* set bounce buffer for this operation */
1327 	bdev_io->u.bdev.iovs[0].iov_base = buf;
1328 	bdev_io->u.bdev.iovs[0].iov_len = len;
1329 
1330 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1331 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
1332 	} else {
1333 		bdev_io_pull_data(bdev_io);
1334 	}
1335 }
1336 
1337 static void
1338 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
1339 {
1340 	struct spdk_bdev *bdev = bdev_io->bdev;
1341 	bool buf_allocated;
1342 	uint64_t alignment;
1343 	void *aligned_buf;
1344 
1345 	bdev_io->internal.buf = buf;
1346 
1347 	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
1348 		bdev_io_get_buf_complete(bdev_io, true);
1349 		return;
1350 	}
1351 
1352 	alignment = spdk_bdev_get_buf_align(bdev);
1353 	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
1354 	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
1355 
1356 	if (buf_allocated) {
1357 		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
1358 		/* Continue in completion callback */
1359 		return;
1360 	} else {
1361 		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
1362 	}
1363 
1364 	_bdev_io_set_md_buf(bdev_io);
1365 }
1366 
1367 static inline uint64_t
1368 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
1369 {
1370 	struct spdk_bdev *bdev = bdev_io->bdev;
1371 	uint64_t md_len, alignment;
1372 
1373 	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
1374 
1375 	/* 1-byte alignment needs 0 bytes of extra space, 64-byte alignment needs 63 bytes of extra space, etc. */
1376 	alignment = spdk_bdev_get_buf_align(bdev) - 1;
1377 
1378 	return len + alignment + md_len;
1379 }
1380 
1381 static void
1382 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
1383 {
1384 	struct spdk_bdev_mgmt_channel *ch;
1385 
1386 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1387 	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
1388 }
1389 
1390 static void
1391 bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
1392 {
1393 	assert(bdev_io->internal.buf != NULL);
1394 	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
1395 	bdev_io->internal.buf = NULL;
1396 }
1397 
1398 void
1399 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
1400 {
1401 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1402 
1403 	assert(buf != NULL);
1404 	_bdev_io_put_buf(bdev_io, buf, len);
1405 }
1406 
1407 static inline void
1408 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
1409 		    struct spdk_bdev_io *bdev_io)
1410 {
1411 	/* After a request is submitted to a bdev module, the ownership of an accel sequence
1412 	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
1413 	 * sequence pointer to make sure we won't touch it anymore. */
1414 	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
1415 	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
1416 		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
1417 		bdev_io->internal.accel_sequence = NULL;
1418 	}
1419 
1420 	bdev->fn_table->submit_request(ioch, bdev_io);
1421 }
1422 
1423 static inline void
1424 bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
1425 {
1426 	struct spdk_bdev *bdev = bdev_ch->bdev;
1427 
1428 	bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource);
1429 	bdev_io->internal.error.nvme.cdw0 = 0;
1430 	bdev_io->num_retries++;
1431 	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
1432 }
1433 
1434 static void
1435 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
1436 {
1437 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1438 	struct spdk_bdev_io *bdev_io;
1439 
1440 	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
1441 		/*
1442 		 * Allow some more I/O to complete before retrying the nomem_io queue.
1443 		 *  Some drivers (such as nvme) cannot immediately take a new I/O in
1444 		 *  the context of a completion, because the resources for the I/O are
1445 		 *  not released until control returns to the bdev poller.  Also, we
1446 		 *  may require several small I/O to complete before a larger I/O
1447 		 *  (that requires splitting) can be submitted.
1448 		 */
1449 		return;
1450 	}
1451 
1452 	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
1453 		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
1454 		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
1455 
1456 		switch (bdev_io->internal.retry_state) {
1457 		case BDEV_IO_RETRY_STATE_SUBMIT:
1458 			bdev_ch_resubmit_io(bdev_ch, bdev_io);
1459 			break;
1460 		case BDEV_IO_RETRY_STATE_PULL:
1461 			bdev_io_pull_data(bdev_io);
1462 			break;
1463 		case BDEV_IO_RETRY_STATE_PULL_MD:
1464 			bdev_io_pull_md_buf(bdev_io);
1465 			break;
1466 		case BDEV_IO_RETRY_STATE_PUSH:
1467 			bdev_io_push_bounce_data(bdev_io);
1468 			break;
1469 		case BDEV_IO_RETRY_STATE_PUSH_MD:
1470 			bdev_io_push_bounce_md_buf(bdev_io);
1471 			break;
1472 		default:
1473 			assert(0 && "invalid retry state");
1474 			break;
1475 		}
1476 
1477 		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
1478 			/* This IO completed again with NOMEM status, so break the loop and
1479 			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
1480 			 * always gets requeued at the front of the list, to maintain
1481 			 * ordering.
1482 			 */
1483 			break;
1484 		}
1485 	}
1486 }
1487 
1488 static inline bool
1489 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
1490 {
1491 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
1492 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
1493 
1494 	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
1495 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
1496 		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);
1497 
1498 		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
1499 		 * ownership of that sequence is transferred back to the bdev layer, so we need to
1500 		 * restore internal.accel_sequence to make sure that the sequence is handled
1501 		 * correctly in case the I/O is later aborted. */
1502 		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
1503 		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
1504 			assert(bdev_io->internal.accel_sequence == NULL);
1505 			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
1506 		}
1507 
1508 		return true;
1509 	}
1510 
1511 	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
1512 		bdev_ch_retry_io(bdev_ch);
1513 	}
1514 
1515 	return false;
1516 }
1517 
1518 static void
1519 _bdev_io_complete_push_bounce_done(void *ctx, int rc)
1520 {
1521 	struct spdk_bdev_io *bdev_io = ctx;
1522 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1523 
1524 	if (rc) {
1525 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1526 	}
1527 	/* We want to free the bounce buffer here since we know we're done with it (as opposed
1528 	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
1529 	 */
1530 	bdev_io_put_buf(bdev_io);
1531 
1532 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1533 		bdev_ch_retry_io(ch);
1534 	}
1535 
1536 	/* Continue with IO completion flow */
1537 	bdev_io_complete(bdev_io);
1538 }
1539 
1540 static void
1541 bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
1542 {
1543 	struct spdk_bdev_io *bdev_io = ctx;
1544 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1545 
1546 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1547 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1548 
1549 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1550 		bdev_ch_retry_io(ch);
1551 	}
1552 
1553 	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1554 }
1555 
1556 static inline void
1557 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
1558 {
1559 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1560 	int rc = 0;
1561 
1562 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1563 	/* Do the same for the metadata buffer: push it back to the original md buffer */
1564 	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
1565 		assert(spdk_bdev_is_md_separate(bdev_io->bdev));
1566 
1567 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1568 			if (bdev_io_use_memory_domain(bdev_io)) {
1569 				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1570 				bdev_io_increment_outstanding(ch, ch->shared_resource);
1571 				/* If a memory domain is used, we need to call the async push function */
1572 				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1573 								  bdev_io->internal.memory_domain_ctx,
1574 								  &bdev_io->internal.orig_md_iov,
1575 								  1, /* orig_md_iov is a single iovec */
1576 								  &bdev_io->internal.bounce_md_iov, 1,
1577 								  bdev_io_push_bounce_md_buf_done,
1578 								  bdev_io);
1579 				if (rc == 0) {
1580 					/* Continue IO completion in async callback */
1581 					return;
1582 				}
1583 				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1584 				bdev_io_decrement_outstanding(ch, ch->shared_resource);
1585 				if (rc != -ENOMEM) {
1586 					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
1587 						    spdk_memory_domain_get_dma_device_id(
1588 							    bdev_io->internal.memory_domain));
1589 				}
1590 			} else {
1591 				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
1592 				       bdev_io->internal.orig_md_iov.iov_len);
1593 			}
1594 		}
1595 	}
1596 
1597 	if (spdk_unlikely(rc == -ENOMEM)) {
1598 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
1599 	} else {
1600 		assert(bdev_io->internal.data_transfer_cpl);
1601 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1602 	}
1603 }
1604 
1605 static inline void
1606 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
1607 {
1608 	assert(bdev_io->internal.data_transfer_cpl);
1609 	if (rc) {
1610 		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
1611 		return;
1612 	}
1613 
1614 	/* set original buffer for this io */
1615 	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
1616 	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
1617 	/* disable bouncing buffer for this io */
1618 	bdev_io->internal.orig_iovcnt = 0;
1619 	bdev_io->internal.orig_iovs = NULL;
1620 
1621 	bdev_io_push_bounce_md_buf(bdev_io);
1622 }
1623 
1624 static void
1625 bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
1626 {
1627 	struct spdk_bdev_io *bdev_io = ctx;
1628 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1629 
1630 	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1631 	bdev_io_decrement_outstanding(ch, ch->shared_resource);
1632 
1633 	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
1634 		bdev_ch_retry_io(ch);
1635 	}
1636 
1637 	bdev_io_push_bounce_data_done(bdev_io, status);
1638 }
1639 
1640 static inline void
1641 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
1642 {
1643 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
1644 	int rc = 0;
1645 
1646 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
1647 	assert(!bdev_io_use_accel_sequence(bdev_io));
1648 
1649 	/* If this is the read path, copy data from the bounce buffer to the original buffer */
1650 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
1651 		if (bdev_io_use_memory_domain(bdev_io)) {
1652 			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
1653 			bdev_io_increment_outstanding(ch, ch->shared_resource);
1654 			/* If a memory domain is used, we need to call the async push function */
1655 			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
1656 							  bdev_io->internal.memory_domain_ctx,
1657 							  bdev_io->internal.orig_iovs,
1658 							  (uint32_t)bdev_io->internal.orig_iovcnt,
1659 							  &bdev_io->internal.bounce_iov, 1,
1660 							  bdev_io_push_bounce_data_done_and_track,
1661 							  bdev_io);
1662 			if (rc == 0) {
1663 				/* Continue IO completion in async callback */
1664 				return;
1665 			}
1666 
1667 			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
1668 			bdev_io_decrement_outstanding(ch, ch->shared_resource);
1669 			if (rc != -ENOMEM) {
1670 				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
1671 					    spdk_memory_domain_get_dma_device_id(
1672 						    bdev_io->internal.memory_domain));
1673 			}
1674 		} else {
1675 			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
1676 					      bdev_io->internal.orig_iovcnt,
1677 					      bdev_io->internal.bounce_iov.iov_base,
1678 					      bdev_io->internal.bounce_iov.iov_len);
1679 		}
1680 	}
1681 
1682 	if (spdk_unlikely(rc == -ENOMEM)) {
1683 		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
1684 	} else {
1685 		bdev_io_push_bounce_data_done(bdev_io, rc);
1686 	}
1687 }
1688 
1689 static inline void
1690 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
1691 {
1692 	bdev_io->internal.data_transfer_cpl = cpl_cb;
1693 	bdev_io_push_bounce_data(bdev_io);
1694 }
1695 
1696 static void
1697 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
1698 {
1699 	struct spdk_bdev_io *bdev_io;
1700 
1701 	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
1702 	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
1703 }
1704 
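/* Allocate a data buffer of the requested length (plus alignment and metadata padding)
 * from the per-thread iobuf channel.  If no buffer is immediately available, the request
 * is queued and completed later through bdev_io_get_iobuf_cb().
 */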
1705 static void
1706 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
1707 {
1708 	struct spdk_bdev_mgmt_channel *mgmt_ch;
1709 	uint64_t max_len;
1710 	void *buf;
1711 
1712 	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
1713 	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
1714 	max_len = bdev_io_get_max_buf_len(bdev_io, len);
1715 
1716 	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
1717 		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
1718 		bdev_io_get_buf_complete(bdev_io, false);
1719 		return;
1720 	}
1721 
1722 	bdev_io->internal.buf_len = len;
1723 	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
1724 			     bdev_io_get_iobuf_cb);
1725 	if (buf != NULL) {
1726 		_bdev_io_set_buf(bdev_io, buf, len);
1727 	}
1728 }
1729 
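/* Ensure the I/O has a data buffer before it is handed to the bdev module.  If the caller
 * already provided a buffer with sufficient alignment, the callback is invoked immediately;
 * otherwise a buffer is allocated from the iobuf pool first.
 */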
1730 void
1731 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
1732 {
1733 	struct spdk_bdev *bdev = bdev_io->bdev;
1734 	uint64_t alignment;
1735 
1736 	assert(cb != NULL);
1737 	bdev_io->internal.get_buf_cb = cb;
1738 
1739 	alignment = spdk_bdev_get_buf_align(bdev);
1740 
1741 	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
1742 	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
1743 		/* Buffer already present and aligned */
1744 		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
1745 		return;
1746 	}
1747 
1748 	bdev_io_get_buf(bdev_io, len);
1749 }
1750 
1751 static void
1752 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1753 			      bool success)
1754 {
1755 	if (!success) {
1756 		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
1757 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
1758 		bdev_io_complete_unsubmitted(bdev_io);
1759 		return;
1760 	}
1761 
1762 	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
1763 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
1764 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
1765 			return;
1766 		}
1767 		/* For reads we'll execute the sequence after the data is read, so, for now, just
1768 		 * clear the accel_sequence pointer and submit the IO. */
1769 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
1770 		bdev_io->u.bdev.accel_sequence = NULL;
1771 	}
1772 
1773 	bdev_io_submit(bdev_io);
1774 }
1775 
1776 static void
1777 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
1778 			       uint64_t len)
1779 {
1780 	assert(cb != NULL);
1781 	bdev_io->internal.get_buf_cb = cb;
1782 
1783 	bdev_io_get_buf(bdev_io, len);
1784 }
1785 
1786 void
1787 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
1788 {
1789 	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
1790 
1791 	assert(cb != NULL);
1792 	assert(bdev_io->internal.get_aux_buf_cb == NULL);
1793 	bdev_io->internal.get_aux_buf_cb = cb;
1794 	bdev_io_get_buf(bdev_io, len);
1795 }
1796 
1797 static int
1798 bdev_module_get_max_ctx_size(void)
1799 {
1800 	struct spdk_bdev_module *bdev_module;
1801 	int max_bdev_module_size = 0;
1802 
1803 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1804 		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
1805 			max_bdev_module_size = bdev_module->get_ctx_size();
1806 		}
1807 	}
1808 
1809 	return max_bdev_module_size;
1810 }
1811 
1812 static void
1813 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
1814 {
1815 	int i;
1816 	struct spdk_bdev_qos *qos = bdev->internal.qos;
1817 	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
1818 
1819 	if (!qos) {
1820 		return;
1821 	}
1822 
1823 	spdk_bdev_get_qos_rate_limits(bdev, limits);
1824 
1825 	spdk_json_write_object_begin(w);
1826 	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
1827 
1828 	spdk_json_write_named_object_begin(w, "params");
1829 	spdk_json_write_named_string(w, "name", bdev->name);
1830 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
1831 		if (limits[i] > 0) {
1832 			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
1833 		}
1834 	}
1835 	spdk_json_write_object_end(w);
1836 
1837 	spdk_json_write_object_end(w);
1838 }
1839 
1840 void
1841 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
1842 {
1843 	struct spdk_bdev_module *bdev_module;
1844 	struct spdk_bdev *bdev;
1845 
1846 	assert(w != NULL);
1847 
1848 	spdk_json_write_array_begin(w);
1849 
1850 	spdk_json_write_object_begin(w);
1851 	spdk_json_write_named_string(w, "method", "bdev_set_options");
1852 	spdk_json_write_named_object_begin(w, "params");
1853 	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
1854 	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
1855 	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
1856 	spdk_json_write_object_end(w);
1857 	spdk_json_write_object_end(w);
1858 
1859 	bdev_examine_allowlist_config_json(w);
1860 
1861 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
1862 		if (bdev_module->config_json) {
1863 			bdev_module->config_json(w);
1864 		}
1865 	}
1866 
1867 	spdk_spin_lock(&g_bdev_mgr.spinlock);
1868 
1869 	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
1870 		if (bdev->fn_table->write_config_json) {
1871 			bdev->fn_table->write_config_json(bdev, w);
1872 		}
1873 
1874 		bdev_qos_config_json(bdev, w);
1875 	}
1876 
1877 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
1878 
1879 	/* This has to be the last RPC in the array to ensure that all bdevs have finished being examined. */
1880 	spdk_json_write_object_begin(w);
1881 	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
1882 	spdk_json_write_object_end(w);
1883 
1884 	spdk_json_write_array_end(w);
1885 }
1886 
1887 static void
1888 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
1889 {
1890 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1891 	struct spdk_bdev_io *bdev_io;
1892 
1893 	spdk_iobuf_channel_fini(&ch->iobuf);
1894 
1895 	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
1896 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
1897 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
1898 		ch->per_thread_cache_count--;
1899 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
1900 	}
1901 
1902 	assert(ch->per_thread_cache_count == 0);
1903 }
1904 
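/* Per-thread management channel constructor: set up the iobuf channel and pre-populate
 * the per-thread spdk_bdev_io cache from the global pool so that this thread cannot be
 * starved of bdev_io objects.
 */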
1905 static int
1906 bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
1907 {
1908 	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
1909 	struct spdk_bdev_io *bdev_io;
1910 	uint32_t i;
1911 	int rc;
1912 
1913 	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
1914 	if (rc != 0) {
1915 		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
1916 		return -1;
1917 	}
1918 
1919 	STAILQ_INIT(&ch->per_thread_cache);
1920 	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
1921 
1922 	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
1923 	ch->per_thread_cache_count = 0;
1924 	for (i = 0; i < ch->bdev_io_cache_size; i++) {
1925 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
1926 		if (bdev_io == NULL) {
1927 			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
1928 			assert(false);
1929 			bdev_mgmt_channel_destroy(io_device, ctx_buf);
1930 			return -1;
1931 		}
1932 		ch->per_thread_cache_count++;
1933 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
1934 	}
1935 
1936 	TAILQ_INIT(&ch->shared_resources);
1937 	TAILQ_INIT(&ch->io_wait_queue);
1938 
1939 	return 0;
1940 }
1941 
1942 static void
1943 bdev_init_complete(int rc)
1944 {
1945 	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
1946 	void *cb_arg = g_init_cb_arg;
1947 	struct spdk_bdev_module *m;
1948 
1949 	g_bdev_mgr.init_complete = true;
1950 	g_init_cb_fn = NULL;
1951 	g_init_cb_arg = NULL;
1952 
1953 	/*
1954 	 * For modules that need to know when subsystem init is complete,
1955 	 * inform them now.
1956 	 */
1957 	if (rc == 0) {
1958 		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1959 			if (m->init_complete) {
1960 				m->init_complete();
1961 			}
1962 		}
1963 	}
1964 
1965 	cb_fn(cb_arg, rc);
1966 }
1967 
1968 static bool
1969 bdev_module_all_actions_completed(void)
1970 {
1971 	struct spdk_bdev_module *m;
1972 
1973 	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
1974 		if (m->internal.action_in_progress > 0) {
1975 			return false;
1976 		}
1977 	}
1978 	return true;
1979 }
1980 
1981 static void
1982 bdev_module_action_complete(void)
1983 {
1984 	/*
1985 	 * Don't finish bdev subsystem initialization if
1986 	 * module pre-initialization is still in progress, or
1987 	 * the subsystem has already been initialized.
1988 	 */
1989 	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
1990 		return;
1991 	}
1992 
1993 	/*
1994 	 * Check all bdev modules for inits/examinations in progress. If any
1995 	 * exist, return immediately since we cannot finish bdev subsystem
1996 	 * initialization until all are completed.
1997 	 */
1998 	if (!bdev_module_all_actions_completed()) {
1999 		return;
2000 	}
2001 
2002 	/*
2003 	 * Modules already finished initialization - now that all
2004 	 * the bdev modules have finished their asynchronous I/O
2005 	 * processing, the entire bdev layer can be marked as complete.
2006 	 */
2007 	bdev_init_complete(0);
2008 }
2009 
2010 static void
2011 bdev_module_action_done(struct spdk_bdev_module *module)
2012 {
2013 	spdk_spin_lock(&module->internal.spinlock);
2014 	assert(module->internal.action_in_progress > 0);
2015 	module->internal.action_in_progress--;
2016 	spdk_spin_unlock(&module->internal.spinlock);
2017 	bdev_module_action_complete();
2018 }
2019 
2020 void
2021 spdk_bdev_module_init_done(struct spdk_bdev_module *module)
2022 {
2023 	assert(module->async_init);
2024 	bdev_module_action_done(module);
2025 }
2026 
2027 void
2028 spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
2029 {
2030 	bdev_module_action_done(module);
2031 }
2032 
2033 /** The last initialized bdev module */
2034 static struct spdk_bdev_module *g_resume_bdev_module = NULL;
2035 
2036 static void
2037 bdev_init_failed(void *cb_arg)
2038 {
2039 	struct spdk_bdev_module *module = cb_arg;
2040 
2041 	spdk_spin_lock(&module->internal.spinlock);
2042 	assert(module->internal.action_in_progress > 0);
2043 	module->internal.action_in_progress--;
2044 	spdk_spin_unlock(&module->internal.spinlock);
2045 	bdev_init_complete(-1);
2046 }
2047 
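/* Call module_init() on every registered bdev module.  Modules that initialize
 * asynchronously mark an action in progress, which defers bdev_init_complete() until
 * they call spdk_bdev_module_init_done().
 */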
2048 static int
2049 bdev_modules_init(void)
2050 {
2051 	struct spdk_bdev_module *module;
2052 	int rc = 0;
2053 
2054 	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
2055 		g_resume_bdev_module = module;
2056 		if (module->async_init) {
2057 			spdk_spin_lock(&module->internal.spinlock);
2058 			module->internal.action_in_progress = 1;
2059 			spdk_spin_unlock(&module->internal.spinlock);
2060 		}
2061 		rc = module->module_init();
2062 		if (rc != 0) {
2063 			/* Bump action_in_progress to prevent other modules from completing modules_init.
2064 			 * Send a message to defer application shutdown until resources are cleaned up. */
2065 			spdk_spin_lock(&module->internal.spinlock);
2066 			module->internal.action_in_progress = 1;
2067 			spdk_spin_unlock(&module->internal.spinlock);
2068 			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
2069 			return rc;
2070 		}
2071 	}
2072 
2073 	g_resume_bdev_module = NULL;
2074 	return 0;
2075 }
2076 
2077 void
2078 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
2079 {
2080 	int rc = 0;
2081 	char mempool_name[32];
2082 
2083 	assert(cb_fn != NULL);
2084 
2085 	g_init_cb_fn = cb_fn;
2086 	g_init_cb_arg = cb_arg;
2087 
2088 	spdk_notify_type_register("bdev_register");
2089 	spdk_notify_type_register("bdev_unregister");
2090 
2091 	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
2092 
2093 	rc = spdk_iobuf_register_module("bdev");
2094 	if (rc != 0) {
2095 		SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
2096 		bdev_init_complete(-1);
2097 		return;
2098 	}
2099 
2100 	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
2101 				  g_bdev_opts.bdev_io_pool_size,
2102 				  sizeof(struct spdk_bdev_io) +
2103 				  bdev_module_get_max_ctx_size(),
2104 				  0,
2105 				  SPDK_ENV_SOCKET_ID_ANY);
2106 
2107 	if (g_bdev_mgr.bdev_io_pool == NULL) {
2108 		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
2109 		bdev_init_complete(-1);
2110 		return;
2111 	}
2112 
2113 	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
2114 					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
2115 	if (!g_bdev_mgr.zero_buffer) {
2116 		SPDK_ERRLOG("create bdev zero buffer failed\n");
2117 		bdev_init_complete(-1);
2118 		return;
2119 	}
2120 
2121 #ifdef SPDK_CONFIG_VTUNE
2122 	SPDK_LOG_DEPRECATED(vtune_support);
2123 	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
2124 #endif
2125 
2126 	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
2127 				bdev_mgmt_channel_destroy,
2128 				sizeof(struct spdk_bdev_mgmt_channel),
2129 				"bdev_mgr");
2130 
2131 	rc = bdev_modules_init();
2132 	g_bdev_mgr.module_init_complete = true;
2133 	if (rc != 0) {
2134 		SPDK_ERRLOG("bdev modules init failed\n");
2135 		return;
2136 	}
2137 
2138 	bdev_module_action_complete();
2139 }
2140 
2141 static void
2142 bdev_mgr_unregister_cb(void *io_device)
2143 {
2144 	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
2145 
2146 	if (g_bdev_mgr.bdev_io_pool) {
2147 		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
2148 			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
2149 				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
2150 				    g_bdev_opts.bdev_io_pool_size);
2151 		}
2152 
2153 		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
2154 	}
2155 
2156 	spdk_free(g_bdev_mgr.zero_buffer);
2157 
2158 	bdev_examine_allowlist_free();
2159 
2160 	cb_fn(g_fini_cb_arg);
2161 	g_fini_cb_fn = NULL;
2162 	g_fini_cb_arg = NULL;
2163 	g_bdev_mgr.init_complete = false;
2164 	g_bdev_mgr.module_init_complete = false;
2165 }
2166 
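/* Tear down bdev modules in reverse registration order, resuming from g_resume_bdev_module
 * for modules with asynchronous fini.  Once all modules are finished, the bdev_mgr
 * io_device is unregistered.
 */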
2167 static void
2168 bdev_module_fini_iter(void *arg)
2169 {
2170 	struct spdk_bdev_module *bdev_module;
2171 
2172 	/* FIXME: Handling initialization failures is broken now,
2173 	 * so we won't even try cleaning up after successfully
2174 	 * initialized modules.  If module_init_complete is false,
2175 	 * just call bdev_mgr_unregister_cb().
2176 	 */
2177 	if (!g_bdev_mgr.module_init_complete) {
2178 		bdev_mgr_unregister_cb(NULL);
2179 		return;
2180 	}
2181 
2182 	/* Start iterating from the last touched module */
2183 	if (!g_resume_bdev_module) {
2184 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2185 	} else {
2186 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
2187 					 internal.tailq);
2188 	}
2189 
2190 	while (bdev_module) {
2191 		if (bdev_module->async_fini) {
2192 			/* Save our place so we can resume later. We must
2193 			 * save the variable here, before calling module_fini()
2194 			 * below, because in some cases the module may immediately
2195 			 * call spdk_bdev_module_fini_done() and re-enter
2196 			 * this function to continue iterating. */
2197 			g_resume_bdev_module = bdev_module;
2198 		}
2199 
2200 		if (bdev_module->module_fini) {
2201 			bdev_module->module_fini();
2202 		}
2203 
2204 		if (bdev_module->async_fini) {
2205 			return;
2206 		}
2207 
2208 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
2209 					 internal.tailq);
2210 	}
2211 
2212 	g_resume_bdev_module = NULL;
2213 	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
2214 }
2215 
2216 void
2217 spdk_bdev_module_fini_done(void)
2218 {
2219 	if (spdk_get_thread() != g_fini_thread) {
2220 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
2221 	} else {
2222 		bdev_module_fini_iter(NULL);
2223 	}
2224 }
2225 
2226 static void
2227 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
2228 {
2229 	struct spdk_bdev *bdev = cb_arg;
2230 
2231 	if (bdeverrno && bdev) {
2232 		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
2233 			     bdev->name);
2234 
2235 		/*
2236 		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
2237 		 *  bdev; try to continue by manually removing this bdev from the list and moving
2238 		 *  on to the next bdev in the list.
2239 		 */
2240 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2241 	}
2242 
2243 	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2244 		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2245 		/*
2246 		 * Bdev module finish needs to be deferred as we might be in the middle of some context
2247 		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2248 		 * after returning.
2249 		 */
2250 		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2251 		return;
2252 	}
2253 
2254 	/*
2255 	 * Unregister the last unclaimed bdev in the list, to ensure that the bdev subsystem
2256 	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2257 	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
2258 	 * base bdevs.
2259 	 *
2260 	 * Also, walk the list in the reverse order.
2261 	 */
2262 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2263 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2264 		spdk_spin_lock(&bdev->internal.spinlock);
2265 		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2266 			LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2267 			spdk_spin_unlock(&bdev->internal.spinlock);
2268 			continue;
2269 		}
2270 		spdk_spin_unlock(&bdev->internal.spinlock);
2271 
2272 		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2273 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2274 		return;
2275 	}
2276 
2277 	/*
2278 	 * If any bdev fails to release the claim on its underlying bdev properly, we may
2279 	 * end up with a bdev list consisting of claimed bdevs only (if claims were managed
2280 	 * correctly, this would imply a loop in the claims graph, which is clearly
2281 	 * impossible). In that case, warn and unregister the last bdev in the list.
2282 	 */
2283 	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2284 	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2285 		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2286 		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2287 		return;
2288 	}
2289 }
2290 
2291 static void
2292 bdev_module_fini_start_iter(void *arg)
2293 {
2294 	struct spdk_bdev_module *bdev_module;
2295 
2296 	if (!g_resume_bdev_module) {
2297 		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2298 	} else {
2299 		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2300 	}
2301 
2302 	while (bdev_module) {
2303 		if (bdev_module->async_fini_start) {
2304 			/* Save our place so we can resume later. We must
2305 			 * save the variable here, before calling fini_start()
2306 			 * below, because in some cases the module may immediately
2307 			 * call spdk_bdev_module_fini_start_done() and re-enter
2308 			 * this function to continue iterating. */
2309 			g_resume_bdev_module = bdev_module;
2310 		}
2311 
2312 		if (bdev_module->fini_start) {
2313 			bdev_module->fini_start();
2314 		}
2315 
2316 		if (bdev_module->async_fini_start) {
2317 			return;
2318 		}
2319 
2320 		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq);
2321 	}
2322 
2323 	g_resume_bdev_module = NULL;
2324 
2325 	bdev_finish_unregister_bdevs_iter(NULL, 0);
2326 }
2327 
2328 void
2329 spdk_bdev_module_fini_start_done(void)
2330 {
2331 	if (spdk_get_thread() != g_fini_thread) {
2332 		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL);
2333 	} else {
2334 		bdev_module_fini_start_iter(NULL);
2335 	}
2336 }
2337 
2338 static void
2339 bdev_finish_wait_for_examine_done(void *cb_arg)
2340 {
2341 	bdev_module_fini_start_iter(NULL);
2342 }
2343 
2344 static void bdev_open_async_fini(void);
2345 
2346 void
2347 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
2348 {
2349 	int rc;
2350 
2351 	assert(cb_fn != NULL);
2352 
2353 	g_fini_thread = spdk_get_thread();
2354 
2355 	g_fini_cb_fn = cb_fn;
2356 	g_fini_cb_arg = cb_arg;
2357 
2358 	bdev_open_async_fini();
2359 
2360 	rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL);
2361 	if (rc != 0) {
2362 		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
2363 		bdev_finish_wait_for_examine_done(NULL);
2364 	}
2365 }
2366 
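/* Get a bdev_io object, preferring the per-thread cache over the global mempool.  Returns
 * NULL if no object is available, or if other callers are already waiting for bdev_io
 * objects, so that this caller does not jump ahead of them.
 */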
2367 struct spdk_bdev_io *
2368 bdev_channel_get_io(struct spdk_bdev_channel *channel)
2369 {
2370 	struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
2371 	struct spdk_bdev_io *bdev_io;
2372 
2373 	if (ch->per_thread_cache_count > 0) {
2374 		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
2375 		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
2376 		ch->per_thread_cache_count--;
2377 	} else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
2378 		/*
2379 		 * Don't try to look for bdev_ios in the global pool if there are
2380 		 * waiters on bdev_ios - we don't want this caller to jump the line.
2381 		 */
2382 		bdev_io = NULL;
2383 	} else {
2384 		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
2385 	}
2386 
2387 	return bdev_io;
2388 }
2389 
2390 void
2391 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
2392 {
2393 	struct spdk_bdev_mgmt_channel *ch;
2394 
2395 	assert(bdev_io != NULL);
2396 	assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
2397 
2398 	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
2399 
2400 	if (bdev_io->internal.buf != NULL) {
2401 		bdev_io_put_buf(bdev_io);
2402 	}
2403 
2404 	if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
2405 		ch->per_thread_cache_count++;
2406 		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
2407 		while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
2408 			struct spdk_bdev_io_wait_entry *entry;
2409 
2410 			entry = TAILQ_FIRST(&ch->io_wait_queue);
2411 			TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
2412 			entry->cb_fn(entry->cb_arg);
2413 		}
2414 	} else {
2415 		/* We should never have a full cache with entries on the io wait queue. */
2416 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
2417 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2418 	}
2419 }
2420 
2421 static bool
2422 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2423 {
2424 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2425 
2426 	switch (limit) {
2427 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2428 		return true;
2429 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2430 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2431 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2432 		return false;
2433 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2434 	default:
2435 		return false;
2436 	}
2437 }
2438 
2439 static bool
2440 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2441 {
2442 	switch (bdev_io->type) {
2443 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2444 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2445 	case SPDK_BDEV_IO_TYPE_READ:
2446 	case SPDK_BDEV_IO_TYPE_WRITE:
2447 		return true;
2448 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2449 		if (bdev_io->u.bdev.zcopy.start) {
2450 			return true;
2451 		} else {
2452 			return false;
2453 		}
2454 	default:
2455 		return false;
2456 	}
2457 }
2458 
2459 static bool
2460 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2461 {
2462 	switch (bdev_io->type) {
2463 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2464 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2465 		/* Bit 1 (0x2) set for read operation */
2466 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2467 			return true;
2468 		} else {
2469 			return false;
2470 		}
2471 	case SPDK_BDEV_IO_TYPE_READ:
2472 		return true;
2473 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2474 		/* Populate to read from disk */
2475 		if (bdev_io->u.bdev.zcopy.populate) {
2476 			return true;
2477 		} else {
2478 			return false;
2479 		}
2480 	default:
2481 		return false;
2482 	}
2483 }
2484 
2485 static uint64_t
2486 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2487 {
2488 	struct spdk_bdev	*bdev = bdev_io->bdev;
2489 
2490 	switch (bdev_io->type) {
2491 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2492 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2493 		return bdev_io->u.nvme_passthru.nbytes;
2494 	case SPDK_BDEV_IO_TYPE_READ:
2495 	case SPDK_BDEV_IO_TYPE_WRITE:
2496 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2497 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2498 		/* Track the data in the start phase only */
2499 		if (bdev_io->u.bdev.zcopy.start) {
2500 			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2501 		} else {
2502 			return 0;
2503 		}
2504 	default:
2505 		return 0;
2506 	}
2507 }
2508 
2509 static bool
2510 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2511 {
2512 	if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
2513 		return true;
2514 	} else {
2515 		return false;
2516 	}
2517 }
2518 
2519 static bool
2520 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2521 {
2522 	if (bdev_is_read_io(io) == false) {
2523 		return false;
2524 	}
2525 
2526 	return bdev_qos_rw_queue_io(limit, io);
2527 }
2528 
2529 static bool
2530 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2531 {
2532 	if (bdev_is_read_io(io) == true) {
2533 		return false;
2534 	}
2535 
2536 	return bdev_qos_rw_queue_io(limit, io);
2537 }
2538 
2539 static void
2540 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2541 {
2542 	limit->remaining_this_timeslice--;
2543 }
2544 
2545 static void
2546 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2547 {
2548 	limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
2549 }
2550 
2551 static void
2552 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2553 {
2554 	if (bdev_is_read_io(io) == false) {
2555 		return;
2556 	}
2557 
2558 	return bdev_qos_rw_bps_update_quota(limit, io);
2559 }
2560 
2561 static void
2562 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
2563 {
2564 	if (bdev_is_read_io(io) == true) {
2565 		return;
2566 	}
2567 
2568 	return bdev_qos_rw_bps_update_quota(limit, io);
2569 }
2570 
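/* Install the queue_io/update_quota callbacks for each configured QoS rate limit type.
 * Limits that are not defined get NULL callbacks and are skipped at submission time.
 */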
2571 static void
2572 bdev_qos_set_ops(struct spdk_bdev_qos *qos)
2573 {
2574 	int i;
2575 
2576 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2577 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
2578 			qos->rate_limits[i].queue_io = NULL;
2579 			qos->rate_limits[i].update_quota = NULL;
2580 			continue;
2581 		}
2582 
2583 		switch (i) {
2584 		case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2585 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2586 			qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
2587 			break;
2588 		case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2589 			qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
2590 			qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
2591 			break;
2592 		case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2593 			qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
2594 			qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
2595 			break;
2596 		case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2597 			qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
2598 			qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
2599 			break;
2600 		default:
2601 			break;
2602 		}
2603 	}
2604 }
2605 
2606 static void
2607 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
2608 			    struct spdk_bdev_io *bdev_io,
2609 			    enum spdk_bdev_io_status status)
2610 {
2611 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2612 
2613 	bdev_io->internal.in_submit_request = true;
2614 	bdev_ch->io_outstanding++;
2615 	shared_resource->io_outstanding++;
2616 	spdk_bdev_io_complete(bdev_io, status);
2617 	bdev_io->internal.in_submit_request = false;
2618 }
2619 
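/* Hand the I/O to the bdev module.  If earlier I/O is already queued due to -ENOMEM, the
 * new I/O is queued behind it instead.  ABORT requests whose target is still queued are
 * completed here, and writes that violate the write unit size are failed without reaching
 * the module.
 */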
2620 static inline void
2621 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
2622 {
2623 	struct spdk_bdev *bdev = bdev_io->bdev;
2624 	struct spdk_io_channel *ch = bdev_ch->channel;
2625 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
2626 
2627 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
2628 		struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
2629 		struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
2630 
2631 		if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
2632 		    bdev_abort_buf_io(mgmt_channel, bio_to_abort)) {
2633 			_bdev_io_complete_in_submit(bdev_ch, bdev_io,
2634 						    SPDK_BDEV_IO_STATUS_SUCCESS);
2635 			return;
2636 		}
2637 	}
2638 
2639 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE &&
2640 			  bdev_io->bdev->split_on_write_unit &&
2641 			  bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) {
2642 		SPDK_ERRLOG("IO num_blocks %" PRIu64 " is smaller than the write_unit_size %u\n",
2643 			    bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size);
2644 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
2645 		return;
2646 	}
2647 
2648 	if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
2649 		bdev_ch->io_outstanding++;
2650 		shared_resource->io_outstanding++;
2651 		bdev_io->internal.in_submit_request = true;
2652 		bdev_submit_request(bdev, ch, bdev_io);
2653 		bdev_io->internal.in_submit_request = false;
2654 	} else {
2655 		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT);
2656 	}
2657 }
2658 
2659 static bool
2660 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io)
2661 {
2662 	int i;
2663 
2664 	if (bdev_qos_io_to_limit(bdev_io) == true) {
2665 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2666 			if (!qos->rate_limits[i].queue_io) {
2667 				continue;
2668 			}
2669 
2670 			if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
2671 							 bdev_io) == true) {
2672 				return true;
2673 			}
2674 		}
2675 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
2676 			if (!qos->rate_limits[i].update_quota) {
2677 				continue;
2678 			}
2679 
2680 			qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
2681 		}
2682 	}
2683 
2684 	return false;
2685 }
2686 
2687 static inline void
2688 _bdev_io_do_submit(void *ctx)
2689 {
2690 	struct spdk_bdev_io *bdev_io = ctx;
2691 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
2692 
2693 	bdev_io_do_submit(ch, bdev_io);
2694 }
2695 
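/* Submit as many queued I/Os as the QoS rate limits allow in the current timeslice,
 * sending each I/O back to its original thread when needed.  Returns the number of I/Os
 * that were submitted.
 */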
2696 static int
2697 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
2698 {
2699 	struct spdk_bdev_io		*bdev_io = NULL, *tmp = NULL;
2700 	int				submitted_ios = 0;
2701 
2702 	TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
2703 		if (!bdev_qos_queue_io(qos, bdev_io)) {
2704 			TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
2705 
2706 			if (bdev_io->internal.io_submit_ch) {
2707 				/* Send back the IO to the original thread for the actual processing. */
2708 				bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
2709 				bdev_io->internal.io_submit_ch = NULL;
2710 				spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
2711 						     _bdev_io_do_submit, bdev_io);
2712 			} else {
2713 				bdev_io_do_submit(ch, bdev_io);
2714 			}
2715 
2716 			submitted_ios++;
2717 		}
2718 	}
2719 
2720 	return submitted_ios;
2721 }
2722 
2723 static void
2724 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
2725 {
2726 	int rc;
2727 
2728 	bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
2729 	bdev_io->internal.waitq_entry.cb_fn = cb_fn;
2730 	bdev_io->internal.waitq_entry.cb_arg = bdev_io;
2731 	rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
2732 				     &bdev_io->internal.waitq_entry);
2733 	if (rc != 0) {
2734 		SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
2735 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2736 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2737 	}
2738 }
2739 
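/* Decide whether a read/write must be split into child I/Os because it crosses an optimal
 * I/O boundary (or write unit boundary), exceeds max_num_segments, or contains an iovec
 * larger than max_segment_size.
 */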
2740 static bool
2741 bdev_rw_should_split(struct spdk_bdev_io *bdev_io)
2742 {
2743 	uint32_t io_boundary;
2744 	struct spdk_bdev *bdev = bdev_io->bdev;
2745 	uint32_t max_size = bdev->max_segment_size;
2746 	int max_segs = bdev->max_num_segments;
2747 
2748 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
2749 		io_boundary = bdev->write_unit_size;
2750 	} else if (bdev->split_on_optimal_io_boundary) {
2751 		io_boundary = bdev->optimal_io_boundary;
2752 	} else {
2753 		io_boundary = 0;
2754 	}
2755 
2756 	if (spdk_likely(!io_boundary && !max_segs && !max_size)) {
2757 		return false;
2758 	}
2759 
2760 	if (io_boundary) {
2761 		uint64_t start_stripe, end_stripe;
2762 
2763 		start_stripe = bdev_io->u.bdev.offset_blocks;
2764 		end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
2765 		/* Avoid expensive div operations if possible.  These spdk_u32 functions are very cheap. */
2766 		if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
2767 			start_stripe >>= spdk_u32log2(io_boundary);
2768 			end_stripe >>= spdk_u32log2(io_boundary);
2769 		} else {
2770 			start_stripe /= io_boundary;
2771 			end_stripe /= io_boundary;
2772 		}
2773 
2774 		if (start_stripe != end_stripe) {
2775 			return true;
2776 		}
2777 	}
2778 
2779 	if (max_segs) {
2780 		if (bdev_io->u.bdev.iovcnt > max_segs) {
2781 			return true;
2782 		}
2783 	}
2784 
2785 	if (max_size) {
2786 		for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) {
2787 			if (bdev_io->u.bdev.iovs[i].iov_len > max_size) {
2788 				return true;
2789 			}
2790 		}
2791 	}
2792 
2793 	return false;
2794 }
2795 
2796 static bool
2797 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io)
2798 {
2799 	uint32_t num_unmap_segments;
2800 
2801 	if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) {
2802 		return false;
2803 	}
2804 	num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap);
2805 	if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) {
2806 		return true;
2807 	}
2808 
2809 	return false;
2810 }
2811 
2812 static bool
2813 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io)
2814 {
2815 	if (!bdev_io->bdev->max_write_zeroes) {
2816 		return false;
2817 	}
2818 
2819 	if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) {
2820 		return true;
2821 	}
2822 
2823 	return false;
2824 }
2825 
2826 static bool
2827 bdev_copy_should_split(struct spdk_bdev_io *bdev_io)
2828 {
2829 	if (bdev_io->bdev->max_copy != 0 &&
2830 	    bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) {
2831 		return true;
2832 	}
2833 
2834 	return false;
2835 }
2836 
2837 static bool
2838 bdev_io_should_split(struct spdk_bdev_io *bdev_io)
2839 {
2840 	switch (bdev_io->type) {
2841 	case SPDK_BDEV_IO_TYPE_READ:
2842 	case SPDK_BDEV_IO_TYPE_WRITE:
2843 		return bdev_rw_should_split(bdev_io);
2844 	case SPDK_BDEV_IO_TYPE_UNMAP:
2845 		return bdev_unmap_should_split(bdev_io);
2846 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2847 		return bdev_write_zeroes_should_split(bdev_io);
2848 	case SPDK_BDEV_IO_TYPE_COPY:
2849 		return bdev_copy_should_split(bdev_io);
2850 	default:
2851 		return false;
2852 	}
2853 }
2854 
2855 static uint32_t
2856 _to_next_boundary(uint64_t offset, uint32_t boundary)
2857 {
2858 	return (boundary - (offset % boundary));
2859 }
2860 
2861 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
2862 
2863 static void _bdev_rw_split(void *_bdev_io);
2864 
2865 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io);
2866 
2867 static void
2868 _bdev_unmap_split(void *_bdev_io)
2869 {
2870 	return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io);
2871 }
2872 
2873 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io);
2874 
2875 static void
2876 _bdev_write_zeroes_split(void *_bdev_io)
2877 {
2878 	return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io);
2879 }
2880 
2881 static void bdev_copy_split(struct spdk_bdev_io *bdev_io);
2882 
2883 static void
2884 _bdev_copy_split(void *_bdev_io)
2885 {
2886 	return bdev_copy_split((struct spdk_bdev_io *)_bdev_io);
2887 }
2888 
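/* Submit one child I/O of a split parent.  On success the parent's split progress counters
 * are advanced; on -ENOMEM the parent waits for a free bdev_io if no children are
 * outstanding; on any other error the parent is failed once all outstanding children
 * have completed.
 */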
2889 static int
2890 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf,
2891 		     uint64_t num_blocks, uint64_t *offset, uint64_t *remaining)
2892 {
2893 	int rc;
2894 	uint64_t current_offset, current_remaining, current_src_offset;
2895 	spdk_bdev_io_wait_cb io_wait_fn;
2896 
2897 	current_offset = *offset;
2898 	current_remaining = *remaining;
2899 
2900 	bdev_io->u.bdev.split_outstanding++;
2901 
2902 	io_wait_fn = _bdev_rw_split;
2903 	switch (bdev_io->type) {
2904 	case SPDK_BDEV_IO_TYPE_READ:
2905 		assert(bdev_io->u.bdev.accel_sequence == NULL);
2906 		rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
2907 					       spdk_io_channel_from_ctx(bdev_io->internal.ch),
2908 					       iov, iovcnt, md_buf, current_offset,
2909 					       num_blocks, bdev_io->internal.memory_domain,
2910 					       bdev_io->internal.memory_domain_ctx, NULL,
2911 					       bdev_io_split_done, bdev_io);
2912 		break;
2913 	case SPDK_BDEV_IO_TYPE_WRITE:
2914 		assert(bdev_io->u.bdev.accel_sequence == NULL);
2915 		rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
2916 						spdk_io_channel_from_ctx(bdev_io->internal.ch),
2917 						iov, iovcnt, md_buf, current_offset,
2918 						num_blocks, bdev_io->internal.memory_domain,
2919 						bdev_io->internal.memory_domain_ctx, NULL,
2920 						bdev_io_split_done, bdev_io);
2921 		break;
2922 	case SPDK_BDEV_IO_TYPE_UNMAP:
2923 		io_wait_fn = _bdev_unmap_split;
2924 		rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc,
2925 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
2926 					    current_offset, num_blocks,
2927 					    bdev_io_split_done, bdev_io);
2928 		break;
2929 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
2930 		io_wait_fn = _bdev_write_zeroes_split;
2931 		rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc,
2932 						   spdk_io_channel_from_ctx(bdev_io->internal.ch),
2933 						   current_offset, num_blocks,
2934 						   bdev_io_split_done, bdev_io);
2935 		break;
2936 	case SPDK_BDEV_IO_TYPE_COPY:
2937 		io_wait_fn = _bdev_copy_split;
2938 		current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks +
2939 				     (current_offset - bdev_io->u.bdev.offset_blocks);
2940 		rc = spdk_bdev_copy_blocks(bdev_io->internal.desc,
2941 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
2942 					   current_offset, current_src_offset, num_blocks,
2943 					   bdev_io_split_done, bdev_io);
2944 		break;
2945 	default:
2946 		assert(false);
2947 		rc = -EINVAL;
2948 		break;
2949 	}
2950 
2951 	if (rc == 0) {
2952 		current_offset += num_blocks;
2953 		current_remaining -= num_blocks;
2954 		bdev_io->u.bdev.split_current_offset_blocks = current_offset;
2955 		bdev_io->u.bdev.split_remaining_num_blocks = current_remaining;
2956 		*offset = current_offset;
2957 		*remaining = current_remaining;
2958 	} else {
2959 		bdev_io->u.bdev.split_outstanding--;
2960 		if (rc == -ENOMEM) {
2961 			if (bdev_io->u.bdev.split_outstanding == 0) {
2962 				/* No I/O is outstanding. Hence we should wait here. */
2963 				bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn);
2964 			}
2965 		} else {
2966 			bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2967 			if (bdev_io->u.bdev.split_outstanding == 0) {
2968 				spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2969 				TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2970 				bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2971 			}
2972 		}
2973 	}
2974 
2975 	return rc;
2976 }
2977 
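/* Build and submit child read/write I/Os from the parent's iovecs, honoring the I/O
 * boundary, max_segment_size and max_num_segments constraints, and trimming the child
 * iovecs so that each child ends on a block boundary.
 */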
2978 static void
2979 _bdev_rw_split(void *_bdev_io)
2980 {
2981 	struct iovec *parent_iov, *iov;
2982 	struct spdk_bdev_io *bdev_io = _bdev_io;
2983 	struct spdk_bdev *bdev = bdev_io->bdev;
2984 	uint64_t parent_offset, current_offset, remaining;
2985 	uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt;
2986 	uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
2987 	uint32_t iovcnt, iov_len, child_iovsize;
2988 	uint32_t blocklen = bdev->blocklen;
2989 	uint32_t io_boundary;
2990 	uint32_t max_segment_size = bdev->max_segment_size;
2991 	uint32_t max_child_iovcnt = bdev->max_num_segments;
2992 	void *md_buf = NULL;
2993 	int rc;
2994 
2995 	max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX;
2996 	max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) :
2997 			   SPDK_BDEV_IO_NUM_CHILD_IOV;
2998 
2999 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) {
3000 		io_boundary = bdev->write_unit_size;
3001 	} else if (bdev->split_on_optimal_io_boundary) {
3002 		io_boundary = bdev->optimal_io_boundary;
3003 	} else {
3004 		io_boundary = UINT32_MAX;
3005 	}
3006 
3007 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3008 	current_offset = bdev_io->u.bdev.split_current_offset_blocks;
3009 	parent_offset = bdev_io->u.bdev.offset_blocks;
3010 	parent_iov_offset = (current_offset - parent_offset) * blocklen;
3011 	parent_iovcnt = bdev_io->u.bdev.iovcnt;
3012 
3013 	for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
3014 		parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3015 		if (parent_iov_offset < parent_iov->iov_len) {
3016 			break;
3017 		}
3018 		parent_iov_offset -= parent_iov->iov_len;
3019 	}
3020 
3021 	child_iovcnt = 0;
3022 	while (remaining > 0 && parent_iovpos < parent_iovcnt &&
3023 	       child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) {
3024 		to_next_boundary = _to_next_boundary(current_offset, io_boundary);
3025 		to_next_boundary = spdk_min(remaining, to_next_boundary);
3026 		to_next_boundary_bytes = to_next_boundary * blocklen;
3027 
3028 		iov = &bdev_io->child_iov[child_iovcnt];
3029 		iovcnt = 0;
3030 
3031 		if (bdev_io->u.bdev.md_buf) {
3032 			md_buf = (char *)bdev_io->u.bdev.md_buf +
3033 				 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev);
3034 		}
3035 
3036 		child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt);
3037 		while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
3038 		       iovcnt < child_iovsize) {
3039 			parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
3040 			iov_len = parent_iov->iov_len - parent_iov_offset;
3041 
3042 			iov_len = spdk_min(iov_len, max_segment_size);
3043 			iov_len = spdk_min(iov_len, to_next_boundary_bytes);
3044 			to_next_boundary_bytes -= iov_len;
3045 
3046 			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3047 			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3048 
3049 			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3050 				parent_iov_offset += iov_len;
3051 			} else {
3052 				parent_iovpos++;
3053 				parent_iov_offset = 0;
3054 			}
3055 			child_iovcnt++;
3056 			iovcnt++;
3057 		}
3058 
3059 		if (to_next_boundary_bytes > 0) {
3060 			/* We had to stop this child I/O early because we ran out of
3061 			 * child_iov space or were limited by max_num_segments.
3062 			 * Ensure the iovs are aligned to the block size and
3063 			 * then adjust to_next_boundary before starting the
3064 			 * child I/O.
3065 			 */
3066 			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3067 			       iovcnt == child_iovsize);
3068 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
3069 			if (to_last_block_bytes != 0) {
3070 				uint32_t child_iovpos = child_iovcnt - 1;
3071 				/* Don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
3072 				 * so the outer loop will naturally end.
3073 				 */
3074 
3075 				to_last_block_bytes = blocklen - to_last_block_bytes;
3076 				to_next_boundary_bytes += to_last_block_bytes;
3077 				while (to_last_block_bytes > 0 && iovcnt > 0) {
3078 					iov_len = spdk_min(to_last_block_bytes,
3079 							   bdev_io->child_iov[child_iovpos].iov_len);
3080 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3081 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3082 						child_iovpos--;
3083 						if (--iovcnt == 0) {
3084 							/* If the child IO covers less than a block, just return;
3085 							 * outstanding children will complete the parent. If this is
3086 							 * the first child IO of the split round, fail with an error.
3087 							 */
3088 							if (bdev_io->u.bdev.split_outstanding == 0) {
3089 								SPDK_ERRLOG("The first child io was less than a block size\n");
3090 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3091 								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
3092 								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
3093 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3094 							}
3095 
3096 							return;
3097 						}
3098 					}
3099 
3100 					to_last_block_bytes -= iov_len;
3101 
3102 					if (parent_iov_offset == 0) {
3103 						parent_iovpos--;
3104 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3105 					}
3106 					parent_iov_offset -= iov_len;
3107 				}
3108 
3109 				assert(to_last_block_bytes == 0);
3110 			}
3111 			to_next_boundary -= to_next_boundary_bytes / blocklen;
3112 		}
3113 
3114 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3115 					  &current_offset, &remaining);
3116 		if (spdk_unlikely(rc)) {
3117 			return;
3118 		}
3119 	}
3120 }
3121 
3122 static void
3123 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3124 {
3125 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3126 	uint32_t num_children_reqs = 0;
3127 	int rc;
3128 
3129 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3130 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3131 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3132 
3133 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3134 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3135 
3136 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3137 					  &offset, &remaining);
3138 		if (spdk_likely(rc == 0)) {
3139 			num_children_reqs++;
3140 		} else {
3141 			return;
3142 		}
3143 	}
3144 }
3145 
3146 static void
3147 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3148 {
3149 	uint64_t offset, write_zeroes_blocks, remaining;
3150 	uint32_t num_children_reqs = 0;
3151 	int rc;
3152 
3153 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3154 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3155 
3156 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3157 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3158 
3159 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3160 					  &offset, &remaining);
3161 		if (spdk_likely(rc == 0)) {
3162 			num_children_reqs++;
3163 		} else {
3164 			return;
3165 		}
3166 	}
3167 }
3168 
3169 static void
3170 bdev_copy_split(struct spdk_bdev_io *bdev_io)
3171 {
3172 	uint64_t offset, copy_blocks, remaining;
3173 	uint32_t num_children_reqs = 0;
3174 	int rc;
3175 
3176 	offset = bdev_io->u.bdev.split_current_offset_blocks;
3177 	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3178 
3179 	assert(bdev_io->bdev->max_copy != 0);
3180 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
3181 		copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
3182 
3183 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
3184 					  &offset, &remaining);
3185 		if (spdk_likely(rc == 0)) {
3186 			num_children_reqs++;
3187 		} else {
3188 			return;
3189 		}
3190 	}
3191 }
3192 
3193 static void
3194 parent_bdev_io_complete(void *ctx, int rc)
3195 {
3196 	struct spdk_bdev_io *parent_io = ctx;
3197 
3198 	if (rc) {
3199 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3200 	}
3201 
3202 	parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
3203 			       parent_io->internal.caller_ctx);
3204 }
3205 
3206 static void
3207 bdev_io_complete_parent_sequence_cb(void *ctx, int status)
3208 {
3209 	struct spdk_bdev_io *bdev_io = ctx;
3210 
3211 	/* u.bdev.accel_sequence should have already been cleared at this point */
3212 	assert(bdev_io->u.bdev.accel_sequence == NULL);
3213 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
3214 	bdev_io->internal.accel_sequence = NULL;
3215 
3216 	if (spdk_unlikely(status != 0)) {
3217 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
3218 	}
3219 
3220 	parent_bdev_io_complete(bdev_io, status);
3221 }
3222 
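/* Completion callback for a child I/O of a split parent.  When the last outstanding child
 * completes, either finish the parent (executing any accel sequence or pushing bounce data
 * first) or continue splitting the remaining blocks.
 */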
3223 static void
3224 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
3225 {
3226 	struct spdk_bdev_io *parent_io = cb_arg;
3227 
3228 	spdk_bdev_free_io(bdev_io);
3229 
3230 	if (!success) {
3231 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3232 		/* If any child I/O failed, stop any further splitting. */
3233 		parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
3234 		parent_io->u.bdev.split_remaining_num_blocks = 0;
3235 	}
3236 	parent_io->u.bdev.split_outstanding--;
3237 	if (parent_io->u.bdev.split_outstanding != 0) {
3238 		return;
3239 	}
3240 
3241 	/*
3242 	 * Parent I/O finishes when all blocks are consumed.
3243 	 */
3244 	if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
3245 		assert(parent_io->internal.cb != bdev_io_split_done);
3246 		spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx);
3247 		TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link);
3248 
3249 		if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
3250 			if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) {
3251 				bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb);
3252 				return;
3253 			} else if (parent_io->internal.orig_iovcnt != 0 &&
3254 				   !bdev_io_use_accel_sequence(bdev_io)) {
3255 				/* bdev IO will be completed in the callback */
3256 				_bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete);
3257 				return;
3258 			}
3259 		}
3260 
3261 		parent_bdev_io_complete(parent_io, 0);
3262 		return;
3263 	}
3264 
3265 	/*
3266 	 * Continue with the splitting process.  This function will complete the parent I/O if the
3267 	 * splitting is done.
3268 	 */
3269 	switch (parent_io->type) {
3270 	case SPDK_BDEV_IO_TYPE_READ:
3271 	case SPDK_BDEV_IO_TYPE_WRITE:
3272 		_bdev_rw_split(parent_io);
3273 		break;
3274 	case SPDK_BDEV_IO_TYPE_UNMAP:
3275 		bdev_unmap_split(parent_io);
3276 		break;
3277 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3278 		bdev_write_zeroes_split(parent_io);
3279 		break;
3280 	case SPDK_BDEV_IO_TYPE_COPY:
3281 		bdev_copy_split(parent_io);
3282 		break;
3283 	default:
3284 		assert(false);
3285 		break;
3286 	}
3287 }
3288 
3289 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3290 				     bool success);
3291 
3292 static void
3293 bdev_io_split(struct spdk_bdev_io *bdev_io)
3294 {
3295 	assert(bdev_io_should_split(bdev_io));
3296 
3297 	bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
3298 	bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
3299 	bdev_io->u.bdev.split_outstanding = 0;
3300 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
3301 
3302 	switch (bdev_io->type) {
3303 	case SPDK_BDEV_IO_TYPE_READ:
3304 	case SPDK_BDEV_IO_TYPE_WRITE:
3305 		if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
3306 			_bdev_rw_split(bdev_io);
3307 		} else {
3308 			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3309 			spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb,
3310 					     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3311 		}
3312 		break;
3313 	case SPDK_BDEV_IO_TYPE_UNMAP:
3314 		bdev_unmap_split(bdev_io);
3315 		break;
3316 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3317 		bdev_write_zeroes_split(bdev_io);
3318 		break;
3319 	case SPDK_BDEV_IO_TYPE_COPY:
3320 		bdev_copy_split(bdev_io);
3321 		break;
3322 	default:
3323 		assert(false);
3324 		break;
3325 	}
3326 }
3327 
3328 static void
3329 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
3330 {
3331 	if (!success) {
3332 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3333 		return;
3334 	}
3335 
3336 	_bdev_rw_split(bdev_io);
3337 }
3338 
3339 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't
3340  *  be inlined, at least on some compilers.
3341  */
3342 static inline void
3343 _bdev_io_submit(void *ctx)
3344 {
3345 	struct spdk_bdev_io *bdev_io = ctx;
3346 	struct spdk_bdev *bdev = bdev_io->bdev;
3347 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3348 
3349 	if (spdk_likely(bdev_ch->flags == 0)) {
3350 		bdev_io_do_submit(bdev_ch, bdev_io);
3351 		return;
3352 	}
3353 
3354 	if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
3355 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
3356 	} else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
3357 		if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
3358 		    bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) {
3359 			_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
3360 		} else {
3361 			TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
3362 			bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3363 		}
3364 	} else {
3365 		SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
3366 		_bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
3367 	}
3368 }
3369 
3370 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
3371 
3372 bool
3373 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
3374 {
3375 	if (range1->length == 0 || range2->length == 0) {
3376 		return false;
3377 	}
3378 
3379 	if (range1->offset + range1->length <= range2->offset) {
3380 		return false;
3381 	}
3382 
3383 	if (range2->offset + range2->length <= range1->offset) {
3384 		return false;
3385 	}
3386 
3387 	return true;
3388 }
3389 
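/* Check whether an I/O conflicts with a locked LBA range.  NVMe passthru commands are
 * conservatively treated as overlapping; I/Os issued by the owner of the lock on the
 * same channel are allowed through.
 */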
3390 static bool
3391 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
3392 {
3393 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3394 	struct lba_range r;
3395 
3396 	switch (bdev_io->type) {
3397 	case SPDK_BDEV_IO_TYPE_NVME_IO:
3398 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3399 		/* Don't try to decode the NVMe command - just assume worst-case and that
3400 		 * it overlaps a locked range.
3401 		 */
3402 		return true;
3403 	case SPDK_BDEV_IO_TYPE_WRITE:
3404 	case SPDK_BDEV_IO_TYPE_UNMAP:
3405 	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3406 	case SPDK_BDEV_IO_TYPE_ZCOPY:
3407 	case SPDK_BDEV_IO_TYPE_COPY:
3408 		r.offset = bdev_io->u.bdev.offset_blocks;
3409 		r.length = bdev_io->u.bdev.num_blocks;
3410 		if (!bdev_lba_range_overlapped(range, &r)) {
3411 			/* This I/O doesn't overlap the specified LBA range. */
3412 			return false;
3413 		} else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
3414 			/* This I/O overlaps, but the I/O is on the same channel that locked this
3415 			 * range, and the caller_ctx is the same as the locked_ctx.  This means
3416 			 * that this I/O is associated with the lock, and is allowed to execute.
3417 			 */
3418 			return false;
3419 		} else {
3420 			return true;
3421 		}
3422 	default:
3423 		return false;
3424 	}
3425 }
3426 
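/* Main submission path for an already initialized bdev_io.  If the I/O overlaps an LBA range
 * locked by another context, it is parked on the channel's io_locked list until the range is
 * unlocked.  Otherwise it is traced, split if necessary, and either submitted directly or
 * forwarded to the QoS thread when rate limiting is enabled on the channel.
 */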
3427 void
3428 bdev_io_submit(struct spdk_bdev_io *bdev_io)
3429 {
3430 	struct spdk_bdev *bdev = bdev_io->bdev;
3431 	struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io);
3432 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3433 
3434 	assert(thread != NULL);
3435 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3436 
3437 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3438 		struct lba_range *range;
3439 
3440 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3441 			if (bdev_io_range_is_locked(bdev_io, range)) {
3442 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3443 				return;
3444 			}
3445 		}
3446 	}
3447 
3448 	TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link);
3449 
3450 	bdev_io->internal.submit_tsc = spdk_get_ticks();
3451 	spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0,
3452 			      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3453 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
3454 			      spdk_bdev_get_name(bdev));
3455 
3456 	if (bdev_io->internal.split) {
3457 		bdev_io_split(bdev_io);
3458 		return;
3459 	}
3460 
3461 	if (ch->flags & BDEV_CH_QOS_ENABLED) {
3462 		if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
3463 			_bdev_io_submit(bdev_io);
3464 		} else {
3465 			bdev_io->internal.io_submit_ch = ch;
3466 			bdev_io->internal.ch = bdev->internal.qos->ch;
3467 			spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io);
3468 		}
3469 	} else {
3470 		_bdev_io_submit(bdev_io);
3471 	}
3472 }
3473 
3474 static inline void
3475 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3476 {
3477 	/* The bdev doesn't support memory domains, so the buffers in this I/O request can't
3478 	 * be accessed directly and bounce buffers must be allocated before issuing the I/O.
3479 	 * For a write operation we need to pull the data from the memory domain before submitting
3480 	 * the I/O.  Once a read operation completes, we need to use the memory domain push
3481 	 * functionality to update the data in the original memory domain I/O buffer.
3482 	 * This I/O request will go through the regular I/O flow, so clear the memory domain pointers. */
3483 	bdev_io->u.bdev.memory_domain = NULL;
3484 	bdev_io->u.bdev.memory_domain_ctx = NULL;
3485 	_bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3486 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3487 }
3488 
3489 static inline void
3490 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3491 {
3492 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3493 	bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io);
3494 
3495 	if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3496 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3497 		bdev_io_complete_unsubmitted(bdev_io);
3498 		return;
3499 	}
3500 
3501 	/* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does
3502 	 * support them, but we need to execute an accel sequence and the data buffer is from accel
3503 	 * memory domain (to avoid doing a push/pull from that domain).
3504 	 */
3505 	if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) ||
3506 	    (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) {
3507 		_bdev_io_ext_use_bounce_buffer(bdev_io);
3508 		return;
3509 	}
3510 
3511 	if (needs_exec) {
3512 		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3513 			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3514 			return;
3515 		}
3516 		/* For reads we'll execute the sequence after the data is read, so for now we only
3517 		 * clear out the accel_sequence pointer and submit the I/O. */
3518 		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3519 		bdev_io->u.bdev.accel_sequence = NULL;
3520 	}
3521 
3522 	bdev_io_submit(bdev_io);
3523 }
3524 
3525 static void
3526 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
3527 {
3528 	struct spdk_bdev *bdev = bdev_io->bdev;
3529 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
3530 	struct spdk_io_channel *ch = bdev_ch->channel;
3531 
3532 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3533 
3534 	bdev_io->internal.in_submit_request = true;
3535 	bdev_submit_request(bdev, ch, bdev_io);
3536 	bdev_io->internal.in_submit_request = false;
3537 }
3538 
3539 void
3540 bdev_io_init(struct spdk_bdev_io *bdev_io,
3541 	     struct spdk_bdev *bdev, void *cb_arg,
3542 	     spdk_bdev_io_completion_cb cb)
3543 {
3544 	bdev_io->bdev = bdev;
3545 	bdev_io->internal.caller_ctx = cb_arg;
3546 	bdev_io->internal.cb = cb;
3547 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
3548 	bdev_io->internal.in_submit_request = false;
3549 	bdev_io->internal.buf = NULL;
3550 	bdev_io->internal.io_submit_ch = NULL;
3551 	bdev_io->internal.orig_iovs = NULL;
3552 	bdev_io->internal.orig_iovcnt = 0;
3553 	bdev_io->internal.orig_md_iov.iov_base = NULL;
3554 	bdev_io->internal.error.nvme.cdw0 = 0;
3555 	bdev_io->num_retries = 0;
3556 	bdev_io->internal.get_buf_cb = NULL;
3557 	bdev_io->internal.get_aux_buf_cb = NULL;
3558 	bdev_io->internal.memory_domain = NULL;
3559 	bdev_io->internal.memory_domain_ctx = NULL;
3560 	bdev_io->internal.data_transfer_cpl = NULL;
3561 	bdev_io->internal.split = bdev_io_should_split(bdev_io);
3562 	bdev_io->internal.accel_sequence = NULL;
3563 	bdev_io->internal.has_accel_sequence = false;
3564 }
3565 
3566 static bool
3567 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3568 {
3569 	return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
3570 }
3571 
3572 bool
3573 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
3574 {
3575 	bool supported;
3576 
3577 	supported = bdev_io_type_supported(bdev, io_type);
3578 
3579 	if (!supported) {
3580 		switch (io_type) {
3581 		case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3582 			/* The bdev layer will emulate write zeroes as long as write is supported. */
3583 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3584 			break;
3585 		default:
3586 			break;
3587 		}
3588 	}
3589 
3590 	return supported;
3591 }
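
/* A minimal usage sketch (desc, io_ch, offset, num_blocks, cb and cb_arg are hypothetical):
 *
 *     if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
 *             rc = spdk_bdev_unmap_blocks(desc, io_ch, offset, num_blocks, cb, cb_arg);
 *     }
 */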
3592 
3593 uint64_t
3594 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3595 {
3596 	return bdev_io->internal.submit_tsc;
3597 }
3598 
3599 int
3600 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3601 {
3602 	if (bdev->fn_table->dump_info_json) {
3603 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3604 	}
3605 
3606 	return 0;
3607 }
3608 
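/* Recompute how much of each enabled rate limit may be consumed during one QoS timeslice.
 * For example, with the default 1000 usec timeslice, an rw_ios_per_sec limit of 10000
 * yields 10000 * 1000 / 1000000 = 10 I/Os per timeslice, raised to at least
 * min_per_timeslice.
 */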
3609 static void
3610 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3611 {
3612 	uint32_t max_per_timeslice = 0;
3613 	int i;
3614 
3615 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3616 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3617 			qos->rate_limits[i].max_per_timeslice = 0;
3618 			continue;
3619 		}
3620 
3621 		max_per_timeslice = qos->rate_limits[i].limit *
3622 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3623 
3624 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3625 							qos->rate_limits[i].min_per_timeslice);
3626 
3627 		qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice;
3628 	}
3629 
3630 	bdev_qos_set_ops(qos);
3631 }
3632 
3633 static int
3634 bdev_channel_poll_qos(void *arg)
3635 {
3636 	struct spdk_bdev_qos *qos = arg;
3637 	uint64_t now = spdk_get_ticks();
3638 	int i;
3639 
3640 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3641 		/* We received our callback earlier than expected - return
3642 		 *  immediately and wait to do accounting until at least one
3643 		 *  timeslice has actually expired.  This should never happen
3644 		 *  with a well-behaved timer implementation.
3645 		 */
3646 		return SPDK_POLLER_IDLE;
3647 	}
3648 
3649 	/* Reset for next round of rate limiting */
3650 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3651 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3652 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3653 		 * here, we'll account for the overrun so that the next timeslice will
3654 		 * be appropriately reduced.
3655 		 */
3656 		if (qos->rate_limits[i].remaining_this_timeslice > 0) {
3657 			qos->rate_limits[i].remaining_this_timeslice = 0;
3658 		}
3659 	}
3660 
3661 	while (now >= (qos->last_timeslice + qos->timeslice_size)) {
3662 		qos->last_timeslice += qos->timeslice_size;
3663 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3664 			qos->rate_limits[i].remaining_this_timeslice +=
3665 				qos->rate_limits[i].max_per_timeslice;
3666 		}
3667 	}
3668 
3669 	return bdev_qos_io_submit(qos->ch, qos);
3670 }
3671 
3672 static void
3673 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
3674 {
3675 	struct spdk_bdev_shared_resource *shared_resource;
3676 	struct lba_range *range;
3677 
3678 	bdev_free_io_stat(ch->stat);
3679 #ifdef SPDK_CONFIG_VTUNE
3680 	bdev_free_io_stat(ch->prev_stat);
3681 #endif
3682 
3683 	while (!TAILQ_EMPTY(&ch->locked_ranges)) {
3684 		range = TAILQ_FIRST(&ch->locked_ranges);
3685 		TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
3686 		free(range);
3687 	}
3688 
3689 	spdk_put_io_channel(ch->channel);
3690 	spdk_put_io_channel(ch->accel_channel);
3691 
3692 	shared_resource = ch->shared_resource;
3693 
3694 	assert(TAILQ_EMPTY(&ch->io_locked));
3695 	assert(TAILQ_EMPTY(&ch->io_submitted));
3696 	assert(TAILQ_EMPTY(&ch->io_accel_exec));
3697 	assert(TAILQ_EMPTY(&ch->io_memory_domain));
3698 	assert(ch->io_outstanding == 0);
3699 	assert(shared_resource->ref > 0);
3700 	shared_resource->ref--;
3701 	if (shared_resource->ref == 0) {
3702 		assert(shared_resource->io_outstanding == 0);
3703 		TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
3704 		spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
3705 		free(shared_resource);
3706 	}
3707 }
3708 
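/* Called with bdev->internal.spinlock held.  The first channel to observe a configured QoS
 * becomes the QoS channel: it takes an extra reference on the bdev's I/O channel and its
 * thread runs the QoS poller, so all rate-limited I/O is funneled through a single thread
 * (see the QoS branch in bdev_io_submit()).
 */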
3709 static void
3710 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
3711 {
3712 	struct spdk_bdev_qos	*qos = bdev->internal.qos;
3713 	int			i;
3714 
3715 	assert(spdk_spin_held(&bdev->internal.spinlock));
3716 
3717 	/* Rate limiting is enabled on this bdev */
3718 	if (qos) {
3719 		if (qos->ch == NULL) {
3720 			struct spdk_io_channel *io_ch;
3721 
3722 			SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
3723 				      bdev->name, spdk_get_thread());
3724 
3725 			/* No qos channel has been selected, so set one up */
3726 
3727 			/* Take another reference to ch */
3728 			io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
3729 			assert(io_ch != NULL);
3730 			qos->ch = ch;
3731 
3732 			qos->thread = spdk_io_channel_get_thread(io_ch);
3733 
3734 			TAILQ_INIT(&qos->queued);
3735 
3736 			for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3737 				if (bdev_qos_is_iops_rate_limit(i) == true) {
3738 					qos->rate_limits[i].min_per_timeslice =
3739 						SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
3740 				} else {
3741 					qos->rate_limits[i].min_per_timeslice =
3742 						SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
3743 				}
3744 
3745 				if (qos->rate_limits[i].limit == 0) {
3746 					qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
3747 				}
3748 			}
3749 			bdev_qos_update_max_quota_per_timeslice(qos);
3750 			qos->timeslice_size =
3751 				SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
3752 			qos->last_timeslice = spdk_get_ticks();
3753 			qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
3754 							   qos,
3755 							   SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
3756 		}
3757 
3758 		ch->flags |= BDEV_CH_QOS_ENABLED;
3759 	}
3760 }
3761 
3762 struct poll_timeout_ctx {
3763 	struct spdk_bdev_desc	*desc;
3764 	uint64_t		timeout_in_sec;
3765 	spdk_bdev_io_timeout_cb	cb_fn;
3766 	void			*cb_arg;
3767 };
3768 
3769 static void
3770 bdev_desc_free(struct spdk_bdev_desc *desc)
3771 {
3772 	spdk_spin_destroy(&desc->spinlock);
3773 	free(desc->media_events_buffer);
3774 	free(desc);
3775 }
3776 
3777 static void
3778 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
3779 {
3780 	struct poll_timeout_ctx *ctx  = _ctx;
3781 	struct spdk_bdev_desc *desc = ctx->desc;
3782 
3783 	free(ctx);
3784 
3785 	spdk_spin_lock(&desc->spinlock);
3786 	desc->refs--;
3787 	if (desc->closed == true && desc->refs == 0) {
3788 		spdk_spin_unlock(&desc->spinlock);
3789 		bdev_desc_free(desc);
3790 		return;
3791 	}
3792 	spdk_spin_unlock(&desc->spinlock);
3793 }
3794 
3795 static void
3796 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3797 			     struct spdk_io_channel *io_ch, void *_ctx)
3798 {
3799 	struct poll_timeout_ctx *ctx  = _ctx;
3800 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3801 	struct spdk_bdev_desc *desc = ctx->desc;
3802 	struct spdk_bdev_io *bdev_io;
3803 	uint64_t now;
3804 
3805 	spdk_spin_lock(&desc->spinlock);
3806 	if (desc->closed == true) {
3807 		spdk_spin_unlock(&desc->spinlock);
3808 		spdk_bdev_for_each_channel_continue(i, -1);
3809 		return;
3810 	}
3811 	spdk_spin_unlock(&desc->spinlock);
3812 
3813 	now = spdk_get_ticks();
3814 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
3815 		/* Exclude any I/O that are generated via splitting. */
3816 		if (bdev_io->internal.cb == bdev_io_split_done) {
3817 			continue;
3818 		}
3819 
3820 		/* Once we find an I/O that has not timed out, we can immediately
3821 		 * exit the loop.
3822 		 */
3823 		if (now < (bdev_io->internal.submit_tsc +
3824 			   ctx->timeout_in_sec * spdk_get_ticks_hz())) {
3825 			goto end;
3826 		}
3827 
3828 		if (bdev_io->internal.desc == desc) {
3829 			ctx->cb_fn(ctx->cb_arg, bdev_io);
3830 		}
3831 	}
3832 
3833 end:
3834 	spdk_bdev_for_each_channel_continue(i, 0);
3835 }
3836 
3837 static int
3838 bdev_poll_timeout_io(void *arg)
3839 {
3840 	struct spdk_bdev_desc *desc = arg;
3841 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
3842 	struct poll_timeout_ctx *ctx;
3843 
3844 	ctx = calloc(1, sizeof(struct poll_timeout_ctx));
3845 	if (!ctx) {
3846 		SPDK_ERRLOG("failed to allocate memory\n");
3847 		return SPDK_POLLER_BUSY;
3848 	}
3849 	ctx->desc = desc;
3850 	ctx->cb_arg = desc->cb_arg;
3851 	ctx->cb_fn = desc->cb_fn;
3852 	ctx->timeout_in_sec = desc->timeout_in_sec;
3853 
3854 	/* Take a ref on the descriptor in case it gets closed while we are checking
3855 	 * all of the channels.
3856 	 */
3857 	spdk_spin_lock(&desc->spinlock);
3858 	desc->refs++;
3859 	spdk_spin_unlock(&desc->spinlock);
3860 
3861 	spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx,
3862 				   bdev_channel_poll_timeout_io_done);
3863 
3864 	return SPDK_POLLER_BUSY;
3865 }
3866 
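/* Register (or, with timeout_in_sec == 0, unregister) a per-descriptor poller that invokes
 * cb_fn for every I/O submitted through this descriptor that has been outstanding longer
 * than timeout_in_sec.  A minimal usage sketch (io_timeout_cb and its body are hypothetical):
 *
 *     static void
 *     io_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
 *     {
 *             SPDK_ERRLOG("I/O to bdev %s timed out\n",
 *                         spdk_bdev_get_name(spdk_bdev_io_get_bdev(bdev_io)));
 *     }
 *
 *     ...
 *     rc = spdk_bdev_set_timeout(desc, 30, io_timeout_cb, NULL);
 */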
3867 int
3868 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
3869 		      spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
3870 {
3871 	assert(desc->thread == spdk_get_thread());
3872 
3873 	spdk_poller_unregister(&desc->io_timeout_poller);
3874 
3875 	if (timeout_in_sec) {
3876 		assert(cb_fn != NULL);
3877 		desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
3878 					  desc,
3879 					  SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
3880 					  1000);
3881 		if (desc->io_timeout_poller == NULL) {
3882 			SPDK_ERRLOG("cannot register the desc timeout IO poller\n");
3883 			return -1;
3884 		}
3885 	}
3886 
3887 	desc->cb_fn = cb_fn;
3888 	desc->cb_arg = cb_arg;
3889 	desc->timeout_in_sec = timeout_in_sec;
3890 
3891 	return 0;
3892 }
3893 
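/* I/O channel constructor for the bdev's io_device.  Besides the module's own channel and an
 * accel channel, each bdev channel attaches to a spdk_bdev_shared_resource keyed by the
 * underlying module channel, so bdevs that share a module channel also share the nomem queue
 * and outstanding-I/O accounting.  QoS state and any currently locked LBA ranges are copied
 * in under the bdev spinlock.
 */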
3894 static int
3895 bdev_channel_create(void *io_device, void *ctx_buf)
3896 {
3897 	struct spdk_bdev		*bdev = __bdev_from_io_dev(io_device);
3898 	struct spdk_bdev_channel	*ch = ctx_buf;
3899 	struct spdk_io_channel		*mgmt_io_ch;
3900 	struct spdk_bdev_mgmt_channel	*mgmt_ch;
3901 	struct spdk_bdev_shared_resource *shared_resource;
3902 	struct lba_range		*range;
3903 
3904 	ch->bdev = bdev;
3905 	ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
3906 	if (!ch->channel) {
3907 		return -1;
3908 	}
3909 
3910 	ch->accel_channel = spdk_accel_get_io_channel();
3911 	if (!ch->accel_channel) {
3912 		spdk_put_io_channel(ch->channel);
3913 		return -1;
3914 	}
3915 
3916 	spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name,
3917 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
3918 
3919 	assert(ch->histogram == NULL);
3920 	if (bdev->internal.histogram_enabled) {
3921 		ch->histogram = spdk_histogram_data_alloc();
3922 		if (ch->histogram == NULL) {
3923 			SPDK_ERRLOG("Could not allocate histogram\n");
3924 		}
3925 	}
3926 
3927 	mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
3928 	if (!mgmt_io_ch) {
3929 		spdk_put_io_channel(ch->channel);
3930 		spdk_put_io_channel(ch->accel_channel);
3931 		return -1;
3932 	}
3933 
3934 	mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch);
3935 	TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
3936 		if (shared_resource->shared_ch == ch->channel) {
3937 			spdk_put_io_channel(mgmt_io_ch);
3938 			shared_resource->ref++;
3939 			break;
3940 		}
3941 	}
3942 
3943 	if (shared_resource == NULL) {
3944 		shared_resource = calloc(1, sizeof(*shared_resource));
3945 		if (shared_resource == NULL) {
3946 			spdk_put_io_channel(ch->channel);
3947 			spdk_put_io_channel(ch->accel_channel);
3948 			spdk_put_io_channel(mgmt_io_ch);
3949 			return -1;
3950 		}
3951 
3952 		shared_resource->mgmt_ch = mgmt_ch;
3953 		shared_resource->io_outstanding = 0;
3954 		TAILQ_INIT(&shared_resource->nomem_io);
3955 		shared_resource->nomem_threshold = 0;
3956 		shared_resource->shared_ch = ch->channel;
3957 		shared_resource->ref = 1;
3958 		TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
3959 	}
3960 
3961 	ch->io_outstanding = 0;
3962 	TAILQ_INIT(&ch->queued_resets);
3963 	TAILQ_INIT(&ch->locked_ranges);
3964 	ch->flags = 0;
3965 	ch->shared_resource = shared_resource;
3966 
3967 	TAILQ_INIT(&ch->io_submitted);
3968 	TAILQ_INIT(&ch->io_locked);
3969 	TAILQ_INIT(&ch->io_accel_exec);
3970 	TAILQ_INIT(&ch->io_memory_domain);
3971 
3972 	ch->stat = bdev_alloc_io_stat(false);
3973 	if (ch->stat == NULL) {
3974 		bdev_channel_destroy_resource(ch);
3975 		return -1;
3976 	}
3977 
3978 	ch->stat->ticks_rate = spdk_get_ticks_hz();
3979 
3980 #ifdef SPDK_CONFIG_VTUNE
3981 	{
3982 		char *name;
3983 		__itt_init_ittlib(NULL, 0);
3984 		name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
3985 		if (!name) {
3986 			bdev_channel_destroy_resource(ch);
3987 			return -1;
3988 		}
3989 		ch->handle = __itt_string_handle_create(name);
3990 		free(name);
3991 		ch->start_tsc = spdk_get_ticks();
3992 		ch->interval_tsc = spdk_get_ticks_hz() / 100;
3993 		ch->prev_stat = bdev_alloc_io_stat(false);
3994 		if (ch->prev_stat == NULL) {
3995 			bdev_channel_destroy_resource(ch);
3996 			return -1;
3997 		}
3998 	}
3999 #endif
4000 
4001 	spdk_spin_lock(&bdev->internal.spinlock);
4002 	bdev_enable_qos(bdev, ch);
4003 
4004 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
4005 		struct lba_range *new_range;
4006 
4007 		new_range = calloc(1, sizeof(*new_range));
4008 		if (new_range == NULL) {
4009 			spdk_spin_unlock(&bdev->internal.spinlock);
4010 			bdev_channel_destroy_resource(ch);
4011 			return -1;
4012 		}
4013 		new_range->length = range->length;
4014 		new_range->offset = range->offset;
4015 		new_range->locked_ctx = range->locked_ctx;
4016 		TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
4017 	}
4018 
4019 	spdk_spin_unlock(&bdev->internal.spinlock);
4020 
4021 	return 0;
4022 }
4023 
4024 static int
4025 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
4026 			 void *cb_ctx)
4027 {
4028 	struct spdk_bdev_channel *bdev_ch = cb_ctx;
4029 	struct spdk_bdev_io *bdev_io;
4030 	uint64_t buf_len;
4031 
4032 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4033 	if (bdev_io->internal.ch == bdev_ch) {
4034 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
4035 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4036 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4037 	}
4038 
4039 	return 0;
4040 }
4041 
4042 /*
4043  * Abort I/O that are waiting on a data buffer.
4044  */
4045 static void
4046 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch)
4047 {
4048 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4049 				  bdev_abort_all_buf_io_cb, ch);
4050 	spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4051 				  bdev_abort_all_buf_io_cb, ch);
4052 }
4053 
4054 /*
4055  * Abort I/O that are queued waiting for submission.  These types of I/O are
4056  *  linked using the spdk_bdev_io link TAILQ_ENTRY.
4057  */
4058 static void
4059 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
4060 {
4061 	struct spdk_bdev_io *bdev_io, *tmp;
4062 
4063 	TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
4064 		if (bdev_io->internal.ch == ch) {
4065 			TAILQ_REMOVE(queue, bdev_io, internal.link);
4066 			/*
4067 			 * spdk_bdev_io_complete() assumes that the completed I/O had
4068 			 *  been submitted to the bdev module.  Since in this case it
4069 			 *  hadn't, bump io_outstanding to account for the decrement
4070 			 *  that spdk_bdev_io_complete() will do.
4071 			 */
4072 			if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
4073 				ch->io_outstanding++;
4074 				ch->shared_resource->io_outstanding++;
4075 			}
4076 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
4077 		}
4078 	}
4079 }
4080 
4081 static bool
4082 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
4083 {
4084 	struct spdk_bdev_io *bdev_io;
4085 
4086 	TAILQ_FOREACH(bdev_io, queue, internal.link) {
4087 		if (bdev_io == bio_to_abort) {
4088 			TAILQ_REMOVE(queue, bio_to_abort, internal.link);
4089 			spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4090 			return true;
4091 		}
4092 	}
4093 
4094 	return false;
4095 }
4096 
4097 static int
4098 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx)
4099 {
4100 	struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx;
4101 	uint64_t buf_len;
4102 
4103 	bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf);
4104 	if (bdev_io == bio_to_abort) {
4105 		buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len);
4106 		spdk_iobuf_entry_abort(ch, entry, buf_len);
4107 		spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
4108 		return 1;
4109 	}
4110 
4111 	return 0;
4112 }
4113 
4114 static bool
4115 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort)
4116 {
4117 	int rc;
4118 
4119 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small,
4120 				       bdev_abort_buf_io_cb, bio_to_abort);
4121 	if (rc == 1) {
4122 		return true;
4123 	}
4124 
4125 	rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large,
4126 				       bdev_abort_buf_io_cb, bio_to_abort);
4127 	return rc == 1;
4128 }
4129 
4130 static void
4131 bdev_qos_channel_destroy(void *cb_arg)
4132 {
4133 	struct spdk_bdev_qos *qos = cb_arg;
4134 
4135 	spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
4136 	spdk_poller_unregister(&qos->poller);
4137 
4138 	SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos);
4139 
4140 	free(qos);
4141 }
4142 
4143 static int
4144 bdev_qos_destroy(struct spdk_bdev *bdev)
4145 {
4146 	int i;
4147 
4148 	/*
4149 	 * Cleanly shutting down the QoS poller is tricky, because
4150 	 * during the asynchronous operation the user could open
4151 	 * a new descriptor and create a new channel, spawning
4152 	 * a new QoS poller.
4153 	 *
4154 	 * The strategy is to create a new QoS structure here and swap it
4155 	 * in. The shutdown path then continues to refer to the old one
4156 	 * until it completes and then releases it.
4157 	 */
4158 	struct spdk_bdev_qos *new_qos, *old_qos;
4159 
4160 	old_qos = bdev->internal.qos;
4161 
4162 	new_qos = calloc(1, sizeof(*new_qos));
4163 	if (!new_qos) {
4164 		SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
4165 		return -ENOMEM;
4166 	}
4167 
4168 	/* Copy the old QoS data into the newly allocated structure */
4169 	memcpy(new_qos, old_qos, sizeof(*new_qos));
4170 
4171 	/* Zero out the key parts of the QoS structure */
4172 	new_qos->ch = NULL;
4173 	new_qos->thread = NULL;
4174 	new_qos->poller = NULL;
4175 	TAILQ_INIT(&new_qos->queued);
4176 	/*
4177 	 * The limit member of the spdk_bdev_qos_limit structure is intentionally not zeroed;
4178 	 * it is carried over into the new QoS structure.
4179 	 */
4180 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4181 		new_qos->rate_limits[i].remaining_this_timeslice = 0;
4182 		new_qos->rate_limits[i].min_per_timeslice = 0;
4183 		new_qos->rate_limits[i].max_per_timeslice = 0;
4184 	}
4185 
4186 	bdev->internal.qos = new_qos;
4187 
4188 	if (old_qos->thread == NULL) {
4189 		free(old_qos);
4190 	} else {
4191 		spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
4192 	}
4193 
4194 	/* It is safe to continue with destroying the bdev even though the QoS channel hasn't
4195 	 * been destroyed yet. The destruction path will end up waiting for the final
4196 	 * channel to be put before it releases resources. */
4197 
4198 	return 0;
4199 }
4200 
4201 void
4202 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
4203 {
4204 	total->bytes_read += add->bytes_read;
4205 	total->num_read_ops += add->num_read_ops;
4206 	total->bytes_written += add->bytes_written;
4207 	total->num_write_ops += add->num_write_ops;
4208 	total->bytes_unmapped += add->bytes_unmapped;
4209 	total->num_unmap_ops += add->num_unmap_ops;
4210 	total->bytes_copied += add->bytes_copied;
4211 	total->num_copy_ops += add->num_copy_ops;
4212 	total->read_latency_ticks += add->read_latency_ticks;
4213 	total->write_latency_ticks += add->write_latency_ticks;
4214 	total->unmap_latency_ticks += add->unmap_latency_ticks;
4215 	total->copy_latency_ticks += add->copy_latency_ticks;
4216 	if (total->max_read_latency_ticks < add->max_read_latency_ticks) {
4217 		total->max_read_latency_ticks = add->max_read_latency_ticks;
4218 	}
4219 	if (total->min_read_latency_ticks > add->min_read_latency_ticks) {
4220 		total->min_read_latency_ticks = add->min_read_latency_ticks;
4221 	}
4222 	if (total->max_write_latency_ticks < add->max_write_latency_ticks) {
4223 		total->max_write_latency_ticks = add->max_write_latency_ticks;
4224 	}
4225 	if (total->min_write_latency_ticks > add->min_write_latency_ticks) {
4226 		total->min_write_latency_ticks = add->min_write_latency_ticks;
4227 	}
4228 	if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) {
4229 		total->max_unmap_latency_ticks = add->max_unmap_latency_ticks;
4230 	}
4231 	if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) {
4232 		total->min_unmap_latency_ticks = add->min_unmap_latency_ticks;
4233 	}
4234 	if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) {
4235 		total->max_copy_latency_ticks = add->max_copy_latency_ticks;
4236 	}
4237 	if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) {
4238 		total->min_copy_latency_ticks = add->min_copy_latency_ticks;
4239 	}
4240 }
4241 
4242 static void
4243 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat)
4244 {
4245 	memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error));
4246 
4247 	if (to_stat->io_error != NULL && from_stat->io_error != NULL) {
4248 		memcpy(to_stat->io_error, from_stat->io_error,
4249 		       sizeof(struct spdk_bdev_io_error_stat));
4250 	}
4251 }
4252 
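/* Max/min latency fields are always reset; the byte/op counters, accumulated latency totals
 * and per-status error counts are cleared only for SPDK_BDEV_RESET_STAT_ALL.
 */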
4253 void
4254 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode)
4255 {
4256 	stat->max_read_latency_ticks = 0;
4257 	stat->min_read_latency_ticks = UINT64_MAX;
4258 	stat->max_write_latency_ticks = 0;
4259 	stat->min_write_latency_ticks = UINT64_MAX;
4260 	stat->max_unmap_latency_ticks = 0;
4261 	stat->min_unmap_latency_ticks = UINT64_MAX;
4262 	stat->max_copy_latency_ticks = 0;
4263 	stat->min_copy_latency_ticks = UINT64_MAX;
4264 
4265 	if (mode != SPDK_BDEV_RESET_STAT_ALL) {
4266 		return;
4267 	}
4268 
4269 	stat->bytes_read = 0;
4270 	stat->num_read_ops = 0;
4271 	stat->bytes_written = 0;
4272 	stat->num_write_ops = 0;
4273 	stat->bytes_unmapped = 0;
4274 	stat->num_unmap_ops = 0;
4275 	stat->bytes_copied = 0;
4276 	stat->num_copy_ops = 0;
4277 	stat->read_latency_ticks = 0;
4278 	stat->write_latency_ticks = 0;
4279 	stat->unmap_latency_ticks = 0;
4280 	stat->copy_latency_ticks = 0;
4281 
4282 	if (stat->io_error != NULL) {
4283 		memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat));
4284 	}
4285 }
4286 
4287 struct spdk_bdev_io_stat *
4288 bdev_alloc_io_stat(bool io_error_stat)
4289 {
4290 	struct spdk_bdev_io_stat *stat;
4291 
4292 	stat = malloc(sizeof(struct spdk_bdev_io_stat));
4293 	if (stat == NULL) {
4294 		return NULL;
4295 	}
4296 
4297 	if (io_error_stat) {
4298 		stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat));
4299 		if (stat->io_error == NULL) {
4300 			free(stat);
4301 			return NULL;
4302 		}
4303 	} else {
4304 		stat->io_error = NULL;
4305 	}
4306 
4307 	spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL);
4308 
4309 	return stat;
4310 }
4311 
4312 void
4313 bdev_free_io_stat(struct spdk_bdev_io_stat *stat)
4314 {
4315 	if (stat != NULL) {
4316 		free(stat->io_error);
4317 		free(stat);
4318 	}
4319 }
4320 
4321 void
4322 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w)
4323 {
4324 	int i;
4325 
4326 	spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
4327 	spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
4328 	spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
4329 	spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
4330 	spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
4331 	spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
4332 	spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied);
4333 	spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops);
4334 	spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
4335 	spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks);
4336 	spdk_json_write_named_uint64(w, "min_read_latency_ticks",
4337 				     stat->min_read_latency_ticks != UINT64_MAX ?
4338 				     stat->min_read_latency_ticks : 0);
4339 	spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
4340 	spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks);
4341 	spdk_json_write_named_uint64(w, "min_write_latency_ticks",
4342 				     stat->min_write_latency_ticks != UINT64_MAX ?
4343 				     stat->min_write_latency_ticks : 0);
4344 	spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
4345 	spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks);
4346 	spdk_json_write_named_uint64(w, "min_unmap_latency_ticks",
4347 				     stat->min_unmap_latency_ticks != UINT64_MAX ?
4348 				     stat->min_unmap_latency_ticks : 0);
4349 	spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks);
4350 	spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks);
4351 	spdk_json_write_named_uint64(w, "min_copy_latency_ticks",
4352 				     stat->min_copy_latency_ticks != UINT64_MAX ?
4353 				     stat->min_copy_latency_ticks : 0);
4354 
4355 	if (stat->io_error != NULL) {
4356 		spdk_json_write_named_object_begin(w, "io_error");
4357 		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
4358 			if (stat->io_error->error_status[i] != 0) {
4359 				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
4360 							     stat->io_error->error_status[i]);
4361 			}
4362 		}
4363 		spdk_json_write_object_end(w);
4364 	}
4365 }
4366 
4367 static void
4368 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
4369 {
4370 	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
4371 	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
4372 
4373 	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
4374 	bdev_abort_all_buf_io(mgmt_ch, ch);
4375 }
4376 
4377 static void
4378 bdev_channel_destroy(void *io_device, void *ctx_buf)
4379 {
4380 	struct spdk_bdev_channel *ch = ctx_buf;
4381 
4382 	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
4383 		      spdk_get_thread());
4384 
4385 	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
4386 			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
4387 
4388 	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
4389 	spdk_spin_lock(&ch->bdev->internal.spinlock);
4390 	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
4391 	spdk_spin_unlock(&ch->bdev->internal.spinlock);
4392 
4393 	bdev_abort_all_queued_io(&ch->queued_resets, ch);
4394 
4395 	bdev_channel_abort_queued_ios(ch);
4396 
4397 	if (ch->histogram) {
4398 		spdk_histogram_data_free(ch->histogram);
4399 	}
4400 
4401 	bdev_channel_destroy_resource(ch);
4402 }
4403 
4404 /*
4405  * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
4406  * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4407  */
4408 static int
4409 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name)
4410 {
4411 	struct spdk_bdev_name *tmp;
4412 
4413 	bdev_name->name = strdup(name);
4414 	if (bdev_name->name == NULL) {
4415 		SPDK_ERRLOG("Unable to allocate bdev name\n");
4416 		return -ENOMEM;
4417 	}
4418 
4419 	bdev_name->bdev = bdev;
4420 
4421 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4422 	tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4423 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4424 
4425 	if (tmp != NULL) {
4426 		SPDK_ERRLOG("Bdev name %s already exists\n", name);
4427 		free(bdev_name->name);
4428 		return -EEXIST;
4429 	}
4430 
4431 	return 0;
4432 }
4433 
4434 static void
4435 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name)
4436 {
4437 	RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name);
4438 	free(bdev_name->name);
4439 }
4440 
4441 static void
4442 bdev_name_del(struct spdk_bdev_name *bdev_name)
4443 {
4444 	spdk_spin_lock(&g_bdev_mgr.spinlock);
4445 	bdev_name_del_unsafe(bdev_name);
4446 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
4447 }
4448 
4449 int
4450 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
4451 {
4452 	struct spdk_bdev_alias *tmp;
4453 	int ret;
4454 
4455 	if (alias == NULL) {
4456 		SPDK_ERRLOG("Empty alias passed\n");
4457 		return -EINVAL;
4458 	}
4459 
4460 	tmp = calloc(1, sizeof(*tmp));
4461 	if (tmp == NULL) {
4462 		SPDK_ERRLOG("Unable to allocate alias\n");
4463 		return -ENOMEM;
4464 	}
4465 
4466 	ret = bdev_name_add(&tmp->alias, bdev, alias);
4467 	if (ret != 0) {
4468 		free(tmp);
4469 		return ret;
4470 	}
4471 
4472 	TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
4473 
4474 	return 0;
4475 }
4476 
4477 static int
4478 bdev_alias_del(struct spdk_bdev *bdev, const char *alias,
4479 	       void (*alias_del_fn)(struct spdk_bdev_name *n))
4480 {
4481 	struct spdk_bdev_alias *tmp;
4482 
4483 	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
4484 		if (strcmp(alias, tmp->alias.name) == 0) {
4485 			TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
4486 			alias_del_fn(&tmp->alias);
4487 			free(tmp);
4488 			return 0;
4489 		}
4490 	}
4491 
4492 	return -ENOENT;
4493 }
4494 
4495 int
4496 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
4497 {
4498 	int rc;
4499 
4500 	rc = bdev_alias_del(bdev, alias, bdev_name_del);
4501 	if (rc == -ENOENT) {
4502 		SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias);
4503 	}
4504 
4505 	return rc;
4506 }
4507 
4508 void
4509 spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
4510 {
4511 	struct spdk_bdev_alias *p, *tmp;
4512 
4513 	TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
4514 		TAILQ_REMOVE(&bdev->aliases, p, tailq);
4515 		bdev_name_del(&p->alias);
4516 		free(p);
4517 	}
4518 }
4519 
4520 struct spdk_io_channel *
4521 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
4522 {
4523 	return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
4524 }
4525 
4526 void *
4527 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc)
4528 {
4529 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
4530 	void *ctx = NULL;
4531 
4532 	if (bdev->fn_table->get_module_ctx) {
4533 		ctx = bdev->fn_table->get_module_ctx(bdev->ctxt);
4534 	}
4535 
4536 	return ctx;
4537 }
4538 
4539 const char *
4540 spdk_bdev_get_module_name(const struct spdk_bdev *bdev)
4541 {
4542 	return bdev->module->name;
4543 }
4544 
4545 const char *
4546 spdk_bdev_get_name(const struct spdk_bdev *bdev)
4547 {
4548 	return bdev->name;
4549 }
4550 
4551 const char *
4552 spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
4553 {
4554 	return bdev->product_name;
4555 }
4556 
4557 const struct spdk_bdev_aliases_list *
4558 spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
4559 {
4560 	return &bdev->aliases;
4561 }
4562 
4563 uint32_t
4564 spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
4565 {
4566 	return bdev->blocklen;
4567 }
4568 
4569 uint32_t
4570 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
4571 {
4572 	return bdev->write_unit_size;
4573 }
4574 
4575 uint64_t
4576 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
4577 {
4578 	return bdev->blockcnt;
4579 }
4580 
4581 const char *
4582 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
4583 {
4584 	return qos_rpc_type[type];
4585 }
4586 
4587 void
4588 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
4589 {
4590 	int i;
4591 
4592 	memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
4593 
4594 	spdk_spin_lock(&bdev->internal.spinlock);
4595 	if (bdev->internal.qos) {
4596 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4597 			if (bdev->internal.qos->rate_limits[i].limit !=
4598 			    SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4599 				limits[i] = bdev->internal.qos->rate_limits[i].limit;
4600 				if (bdev_qos_is_iops_rate_limit(i) == false) {
4601 					/* Convert from bytes to megabytes, which is what is user visible. */
4602 					limits[i] = limits[i] / 1024 / 1024;
4603 				}
4604 			}
4605 		}
4606 	}
4607 	spdk_spin_unlock(&bdev->internal.spinlock);
4608 }
4609 
4610 size_t
4611 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
4612 {
4613 	return 1 << bdev->required_alignment;
4614 }
4615 
4616 uint32_t
4617 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
4618 {
4619 	return bdev->optimal_io_boundary;
4620 }
4621 
4622 bool
4623 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
4624 {
4625 	return bdev->write_cache;
4626 }
4627 
4628 const struct spdk_uuid *
4629 spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
4630 {
4631 	return &bdev->uuid;
4632 }
4633 
4634 uint16_t
4635 spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
4636 {
4637 	return bdev->acwu;
4638 }
4639 
4640 uint32_t
4641 spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
4642 {
4643 	return bdev->md_len;
4644 }
4645 
4646 bool
4647 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
4648 {
4649 	return (bdev->md_len != 0) && bdev->md_interleave;
4650 }
4651 
4652 bool
4653 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
4654 {
4655 	return (bdev->md_len != 0) && !bdev->md_interleave;
4656 }
4657 
4658 bool
4659 spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
4660 {
4661 	return bdev->zoned;
4662 }
4663 
4664 uint32_t
4665 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
4666 {
4667 	if (spdk_bdev_is_md_interleaved(bdev)) {
4668 		return bdev->blocklen - bdev->md_len;
4669 	} else {
4670 		return bdev->blocklen;
4671 	}
4672 }
4673 
4674 uint32_t
4675 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev)
4676 {
4677 	return bdev->phys_blocklen;
4678 }
4679 
4680 static uint32_t
4681 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
4682 {
4683 	if (!spdk_bdev_is_md_interleaved(bdev)) {
4684 		return bdev->blocklen + bdev->md_len;
4685 	} else {
4686 		return bdev->blocklen;
4687 	}
4688 }
4689 
4690 /* We have to use the typedef in the function declaration to appease astyle. */
4691 typedef enum spdk_dif_type spdk_dif_type_t;
4692 
4693 spdk_dif_type_t
4694 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
4695 {
4696 	if (bdev->md_len != 0) {
4697 		return bdev->dif_type;
4698 	} else {
4699 		return SPDK_DIF_DISABLE;
4700 	}
4701 }
4702 
4703 bool
4704 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
4705 {
4706 	if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
4707 		return bdev->dif_is_head_of_md;
4708 	} else {
4709 		return false;
4710 	}
4711 }
4712 
4713 bool
4714 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
4715 			       enum spdk_dif_check_type check_type)
4716 {
4717 	if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
4718 		return false;
4719 	}
4720 
4721 	switch (check_type) {
4722 	case SPDK_DIF_CHECK_TYPE_REFTAG:
4723 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
4724 	case SPDK_DIF_CHECK_TYPE_APPTAG:
4725 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
4726 	case SPDK_DIF_CHECK_TYPE_GUARD:
4727 		return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
4728 	default:
4729 		return false;
4730 	}
4731 }
4732 
4733 static uint32_t
4734 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes)
4735 {
4736 	uint64_t aligned_length, max_write_blocks;
4737 
4738 	aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1);
4739 	max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev);
4740 	max_write_blocks -= max_write_blocks % bdev->write_unit_size;
4741 
4742 	return max_write_blocks;
4743 }
4744 
4745 uint32_t
4746 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev)
4747 {
4748 	return bdev->max_copy;
4749 }
4750 
4751 uint64_t
4752 spdk_bdev_get_qd(const struct spdk_bdev *bdev)
4753 {
4754 	return bdev->internal.measured_queue_depth;
4755 }
4756 
4757 uint64_t
4758 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
4759 {
4760 	return bdev->internal.period;
4761 }
4762 
4763 uint64_t
4764 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
4765 {
4766 	return bdev->internal.weighted_io_time;
4767 }
4768 
4769 uint64_t
4770 spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
4771 {
4772 	return bdev->internal.io_time;
4773 }
4774 
4775 static void bdev_update_qd_sampling_period(void *ctx);
4776 
4777 static void
4778 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status)
4779 {
4780 	bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
4781 
4782 	if (bdev->internal.measured_queue_depth) {
4783 		bdev->internal.io_time += bdev->internal.period;
4784 		bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
4785 	}
4786 
4787 	bdev->internal.qd_poll_in_progress = false;
4788 
4789 	bdev_update_qd_sampling_period(bdev);
4790 }
4791 
4792 static void
4793 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4794 		       struct spdk_io_channel *io_ch, void *_ctx)
4795 {
4796 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
4797 
4798 	bdev->internal.temporary_queue_depth += ch->io_outstanding;
4799 	spdk_bdev_for_each_channel_continue(i, 0);
4800 }
4801 
4802 static int
4803 bdev_calculate_measured_queue_depth(void *ctx)
4804 {
4805 	struct spdk_bdev *bdev = ctx;
4806 
4807 	bdev->internal.qd_poll_in_progress = true;
4808 	bdev->internal.temporary_queue_depth = 0;
4809 	spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl);
4810 	return SPDK_POLLER_BUSY;
4811 }
4812 
4813 static void
4814 bdev_update_qd_sampling_period(void *ctx)
4815 {
4816 	struct spdk_bdev *bdev = ctx;
4817 
4818 	if (bdev->internal.period == bdev->internal.new_period) {
4819 		return;
4820 	}
4821 
4822 	if (bdev->internal.qd_poll_in_progress) {
4823 		return;
4824 	}
4825 
4826 	bdev->internal.period = bdev->internal.new_period;
4827 
4828 	spdk_poller_unregister(&bdev->internal.qd_poller);
4829 	if (bdev->internal.period != 0) {
4830 		bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4831 					   bdev, bdev->internal.period);
4832 	} else {
4833 		spdk_bdev_close(bdev->internal.qd_desc);
4834 		bdev->internal.qd_desc = NULL;
4835 	}
4836 }
4837 
4838 static void
4839 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
4840 {
4841 	SPDK_NOTICELOG("Unexpected event type: %d\n", type);
4842 }
4843 
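/* A non-zero period (in microseconds) opens an internal descriptor and registers a poller
 * that periodically sums io_outstanding across all channels into measured_queue_depth;
 * period == 0 tears the poller and descriptor back down.  If sampling is already active,
 * the change is applied from the descriptor's thread once any in-flight poll finishes.
 */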
4844 void
4845 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
4846 {
4847 	int rc;
4848 
4849 	if (bdev->internal.new_period == period) {
4850 		return;
4851 	}
4852 
4853 	bdev->internal.new_period = period;
4854 
4855 	if (bdev->internal.qd_desc != NULL) {
4856 		assert(bdev->internal.period != 0);
4857 
4858 		spdk_thread_send_msg(bdev->internal.qd_desc->thread,
4859 				     bdev_update_qd_sampling_period, bdev);
4860 		return;
4861 	}
4862 
4863 	assert(bdev->internal.period == 0);
4864 
4865 	rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb,
4866 				NULL, &bdev->internal.qd_desc);
4867 	if (rc != 0) {
4868 		return;
4869 	}
4870 
4871 	bdev->internal.period = period;
4872 	bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth,
4873 				   bdev, period);
4874 }
4875 
4876 struct bdev_get_current_qd_ctx {
4877 	uint64_t current_qd;
4878 	spdk_bdev_get_current_qd_cb cb_fn;
4879 	void *cb_arg;
4880 };
4881 
4882 static void
4883 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status)
4884 {
4885 	struct bdev_get_current_qd_ctx *ctx = _ctx;
4886 
4887 	ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0);
4888 
4889 	free(ctx);
4890 }
4891 
4892 static void
4893 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4894 		    struct spdk_io_channel *io_ch, void *_ctx)
4895 {
4896 	struct bdev_get_current_qd_ctx *ctx = _ctx;
4897 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4898 
4899 	ctx->current_qd += bdev_ch->io_outstanding;
4900 
4901 	spdk_bdev_for_each_channel_continue(i, 0);
4902 }
4903 
4904 void
4905 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn,
4906 			 void *cb_arg)
4907 {
4908 	struct bdev_get_current_qd_ctx *ctx;
4909 
4910 	assert(cb_fn != NULL);
4911 
4912 	ctx = calloc(1, sizeof(*ctx));
4913 	if (ctx == NULL) {
4914 		cb_fn(bdev, 0, cb_arg, -ENOMEM);
4915 		return;
4916 	}
4917 
4918 	ctx->cb_fn = cb_fn;
4919 	ctx->cb_arg = cb_arg;
4920 
4921 	spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done);
4922 }
4923 
4924 static void
4925 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type)
4926 {
4927 	assert(desc->thread == spdk_get_thread());
4928 
4929 	spdk_spin_lock(&desc->spinlock);
4930 	desc->refs--;
4931 	if (!desc->closed) {
4932 		spdk_spin_unlock(&desc->spinlock);
4933 		desc->callback.event_fn(type,
4934 					desc->bdev,
4935 					desc->callback.ctx);
4936 		return;
4937 	} else if (desc->refs == 0) {
4938 		/* This descriptor was closed after this event_notify message was sent.
4939 		 * spdk_bdev_close() could not free the descriptor since this message was
4940 		 * in flight, so we free it now using bdev_desc_free().
4941 		 */
4942 		spdk_spin_unlock(&desc->spinlock);
4943 		bdev_desc_free(desc);
4944 		return;
4945 	}
4946 	spdk_spin_unlock(&desc->spinlock);
4947 }
4948 
4949 static void
4950 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn)
4951 {
4952 	spdk_spin_lock(&desc->spinlock);
4953 	desc->refs++;
4954 	spdk_thread_send_msg(desc->thread, event_notify_fn, desc);
4955 	spdk_spin_unlock(&desc->spinlock);
4956 }
4957 
4958 static void
4959 _resize_notify(void *ctx)
4960 {
4961 	struct spdk_bdev_desc *desc = ctx;
4962 
4963 	_event_notify(desc, SPDK_BDEV_EVENT_RESIZE);
4964 }
4965 
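/* Shrinking a bdev that still has open descriptors is rejected with -EBUSY; otherwise the
 * new block count is recorded and every open descriptor is notified with
 * SPDK_BDEV_EVENT_RESIZE on its own thread.
 */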
4966 int
4967 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
4968 {
4969 	struct spdk_bdev_desc *desc;
4970 	int ret;
4971 
4972 	if (size == bdev->blockcnt) {
4973 		return 0;
4974 	}
4975 
4976 	spdk_spin_lock(&bdev->internal.spinlock);
4977 
4978 	/* bdev has open descriptors */
4979 	if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
4980 	    bdev->blockcnt > size) {
4981 		ret = -EBUSY;
4982 	} else {
4983 		bdev->blockcnt = size;
4984 		TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
4985 			event_notify(desc, _resize_notify);
4986 		}
4987 		ret = 0;
4988 	}
4989 
4990 	spdk_spin_unlock(&bdev->internal.spinlock);
4991 
4992 	return ret;
4993 }
4994 
4995 /*
4996  * Convert I/O offset and length from bytes to blocks.
4997  *
4998  * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
4999  */
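/* For example, with a 512-byte block size, offset_bytes = 4096 and num_bytes = 8192 yield
 * *offset_blocks = 8, *num_blocks = 16 and a return value of 0, while num_bytes = 8200
 * returns a non-zero remainder and the caller fails the request with -EINVAL.
 */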
5000 static uint64_t
5001 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
5002 		     uint64_t num_bytes, uint64_t *num_blocks)
5003 {
5004 	uint32_t block_size = bdev->blocklen;
5005 	uint8_t shift_cnt;
5006 
5007 	/* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
5008 	if (spdk_likely(spdk_u32_is_pow2(block_size))) {
5009 		shift_cnt = spdk_u32log2(block_size);
5010 		*offset_blocks = offset_bytes >> shift_cnt;
5011 		*num_blocks = num_bytes >> shift_cnt;
5012 		return (offset_bytes - (*offset_blocks << shift_cnt)) |
5013 		       (num_bytes - (*num_blocks << shift_cnt));
5014 	} else {
5015 		*offset_blocks = offset_bytes / block_size;
5016 		*num_blocks = num_bytes / block_size;
5017 		return (offset_bytes % block_size) | (num_bytes % block_size);
5018 	}
5019 }
5020 
5021 static bool
5022 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
5023 {
5024 	/* Return failure if offset_blocks + num_blocks is less than offset_blocks; that indicates
5025 	 * the addition overflowed and the offset wrapped around. */
5026 	if (offset_blocks + num_blocks < offset_blocks) {
5027 		return false;
5028 	}
5029 
5030 	/* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
5031 	if (offset_blocks + num_blocks > bdev->blockcnt) {
5032 		return false;
5033 	}
5034 
5035 	return true;
5036 }
5037 
5038 static void
5039 bdev_seek_complete_cb(void *ctx)
5040 {
5041 	struct spdk_bdev_io *bdev_io = ctx;
5042 
5043 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5044 	bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
5045 }
5046 
5047 static int
5048 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5049 	  uint64_t offset_blocks, enum spdk_bdev_io_type io_type,
5050 	  spdk_bdev_io_completion_cb cb, void *cb_arg)
5051 {
5052 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5053 	struct spdk_bdev_io *bdev_io;
5054 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5055 
5056 	assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE);
5057 
5058 	/* Check that offset_blocks is valid by validating a single block at that offset */
5059 	if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) {
5060 		return -EINVAL;
5061 	}
5062 
5063 	bdev_io = bdev_channel_get_io(channel);
5064 	if (!bdev_io) {
5065 		return -ENOMEM;
5066 	}
5067 
5068 	bdev_io->internal.ch = channel;
5069 	bdev_io->internal.desc = desc;
5070 	bdev_io->type = io_type;
5071 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5072 	bdev_io->u.bdev.memory_domain = NULL;
5073 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5074 	bdev_io->u.bdev.accel_sequence = NULL;
5075 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5076 
5077 	if (!spdk_bdev_io_type_supported(bdev, io_type)) {
5078 		/* If the bdev doesn't support seeking to the next data/hole offset,
5079 		 * assume that only data and no holes are present. */
5080 		if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) {
5081 			bdev_io->u.bdev.seek.offset = offset_blocks;
5082 		} else {
5083 			bdev_io->u.bdev.seek.offset = UINT64_MAX;
5084 		}
5085 
5086 		spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io);
5087 		return 0;
5088 	}
5089 
5090 	bdev_io_submit(bdev_io);
5091 	return 0;
5092 }
5093 
5094 int
5095 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5096 		    uint64_t offset_blocks,
5097 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5098 {
5099 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg);
5100 }
5101 
5102 int
5103 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5104 		    uint64_t offset_blocks,
5105 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5106 {
5107 	return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg);
5108 }
5109 
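/* The result of a completed seek I/O is read in the completion callback via
 * spdk_bdev_io_get_seek_offset() below; as in the emulated path in bdev_seek(), an offset
 * of UINT64_MAX reports that no matching offset was found.
 */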
5110 uint64_t
5111 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io)
5112 {
5113 	return bdev_io->u.bdev.seek.offset;
5114 }
5115 
5116 static int
5117 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
5118 			 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5119 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5120 {
5121 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5122 	struct spdk_bdev_io *bdev_io;
5123 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5124 
5125 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5126 		return -EINVAL;
5127 	}
5128 
5129 	bdev_io = bdev_channel_get_io(channel);
5130 	if (!bdev_io) {
5131 		return -ENOMEM;
5132 	}
5133 
5134 	bdev_io->internal.ch = channel;
5135 	bdev_io->internal.desc = desc;
5136 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5137 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5138 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5139 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5140 	bdev_io->u.bdev.iovcnt = 1;
5141 	bdev_io->u.bdev.md_buf = md_buf;
5142 	bdev_io->u.bdev.num_blocks = num_blocks;
5143 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5144 	bdev_io->u.bdev.memory_domain = NULL;
5145 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5146 	bdev_io->u.bdev.accel_sequence = NULL;
5147 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5148 
5149 	bdev_io_submit(bdev_io);
5150 	return 0;
5151 }
5152 
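/* A minimal usage sketch for the byte-based read API, assuming a 512-byte block size
 * (desc, io_ch, buf, read_done and ctx are hypothetical):
 *
 *     rc = spdk_bdev_read(desc, io_ch, buf, 0, 4096, read_done, ctx);
 *     if (rc == -ENOMEM) {
 *             // No spdk_bdev_io was available - register a spdk_bdev_io_wait_entry with
 *             // spdk_bdev_queue_io_wait() and retry from its callback.
 *     }
 */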
5153 int
5154 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5155 	       void *buf, uint64_t offset, uint64_t nbytes,
5156 	       spdk_bdev_io_completion_cb cb, void *cb_arg)
5157 {
5158 	uint64_t offset_blocks, num_blocks;
5159 
5160 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5161 				 nbytes, &num_blocks) != 0) {
5162 		return -EINVAL;
5163 	}
5164 
5165 	return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5166 }
5167 
5168 int
5169 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5170 		      void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5171 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5172 {
5173 	return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
5174 }
5175 
5176 int
5177 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5178 			      void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5179 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5180 {
5181 	struct iovec iov = {
5182 		.iov_base = buf,
5183 	};
5184 
5185 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5186 		return -EINVAL;
5187 	}
5188 
5189 	if (md_buf && !_is_buf_allocated(&iov)) {
5190 		return -EINVAL;
5191 	}
5192 
5193 	return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5194 					cb, cb_arg);
5195 }
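
/*
 * Illustrative usage sketch (not part of the library): a single-buffer read.
 * The buffer is assumed to come from spdk_dma_zmalloc() with the bdev's
 * required alignment, and "desc"/"ch" are assumed to have been obtained with
 * spdk_bdev_open_ext() and spdk_bdev_get_io_channel() on this thread.
 * read_done() is a hypothetical user callback.
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *	uint64_t num_blocks = 8;
 *	void *buf = spdk_dma_zmalloc(num_blocks * spdk_bdev_get_block_size(bdev),
 *				     spdk_bdev_get_buf_align(bdev), NULL);
 *
 *	if (buf != NULL &&
 *	    spdk_bdev_read_blocks(desc, ch, buf, 0, num_blocks, read_done, buf) != 0) {
 *		spdk_dma_free(buf);
 *	}
 *	// read_done() must call spdk_bdev_free_io() on the bdev_io it receives
 *	// and may then consume or free "buf".
 */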
5196 
5197 int
5198 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5199 		struct iovec *iov, int iovcnt,
5200 		uint64_t offset, uint64_t nbytes,
5201 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5202 {
5203 	uint64_t offset_blocks, num_blocks;
5204 
5205 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5206 				 nbytes, &num_blocks) != 0) {
5207 		return -EINVAL;
5208 	}
5209 
5210 	return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5211 }
5212 
5213 static int
5214 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5215 			  struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
5216 			  uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5217 			  struct spdk_accel_sequence *seq,
5218 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5219 {
5220 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5221 	struct spdk_bdev_io *bdev_io;
5222 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5223 
5224 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5225 		return -EINVAL;
5226 	}
5227 
5228 	bdev_io = bdev_channel_get_io(channel);
5229 	if (!bdev_io) {
5230 		return -ENOMEM;
5231 	}
5232 
5233 	bdev_io->internal.ch = channel;
5234 	bdev_io->internal.desc = desc;
5235 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5236 	bdev_io->u.bdev.iovs = iov;
5237 	bdev_io->u.bdev.iovcnt = iovcnt;
5238 	bdev_io->u.bdev.md_buf = md_buf;
5239 	bdev_io->u.bdev.num_blocks = num_blocks;
5240 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5241 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5242 	bdev_io->internal.memory_domain = domain;
5243 	bdev_io->internal.memory_domain_ctx = domain_ctx;
5244 	bdev_io->internal.accel_sequence = seq;
5245 	bdev_io->internal.has_accel_sequence = seq != NULL;
5246 	bdev_io->u.bdev.memory_domain = domain;
5247 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5248 	bdev_io->u.bdev.accel_sequence = seq;
5249 
5250 	_bdev_io_submit_ext(desc, bdev_io);
5251 
5252 	return 0;
5253 }
5254 
5255 int
5256 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5257 		       struct iovec *iov, int iovcnt,
5258 		       uint64_t offset_blocks, uint64_t num_blocks,
5259 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5260 {
5261 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5262 					 num_blocks, NULL, NULL, NULL, cb, cb_arg);
5263 }
5264 
5265 int
5266 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5267 			       struct iovec *iov, int iovcnt, void *md_buf,
5268 			       uint64_t offset_blocks, uint64_t num_blocks,
5269 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5270 {
5271 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5272 		return -EINVAL;
5273 	}
5274 
5275 	if (md_buf && !_is_buf_allocated(iov)) {
5276 		return -EINVAL;
5277 	}
5278 
5279 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5280 					 num_blocks, NULL, NULL, NULL, cb, cb_arg);
5281 }
5282 
5283 static inline bool
5284 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5285 {
5286 	/*
5287 	 * Check that the opts size covers at least the members that existed when
5288 	 * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since access
5289 	 * to those members is not checked internally.
5290 	 */
5291 	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
5292 	       sizeof(opts->metadata) &&
5293 	       opts->size <= sizeof(*opts) &&
5294 	       /* When memory domain is used, the user must provide data buffers */
5295 	       (!opts->memory_domain || (iov && iov[0].iov_base));
5296 }
5297 
5298 int
5299 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5300 			   struct iovec *iov, int iovcnt,
5301 			   uint64_t offset_blocks, uint64_t num_blocks,
5302 			   spdk_bdev_io_completion_cb cb, void *cb_arg,
5303 			   struct spdk_bdev_ext_io_opts *opts)
5304 {
5305 	void *md = NULL;
5306 
5307 	if (opts) {
5308 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5309 			return -EINVAL;
5310 		}
5311 		md = opts->metadata;
5312 	}
5313 
5314 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5315 		return -EINVAL;
5316 	}
5317 
5318 	if (md && !_is_buf_allocated(iov)) {
5319 		return -EINVAL;
5320 	}
5321 
5322 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
5323 					 num_blocks,
5324 					 bdev_get_ext_io_opt(opts, memory_domain, NULL),
5325 					 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL),
5326 					 bdev_get_ext_io_opt(opts, accel_sequence, NULL),
5327 					 cb, cb_arg);
5328 }
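
/*
 * Illustrative sketch of the extended read path (not part of the library).
 * spdk_bdev_ext_io_opts must have its "size" member set so _bdev_io_check_opts()
 * above can tell which members are valid; unused members should be zeroed.
 * The buffer names, read_done() callback and "desc"/"ch" are assumptions.
 *
 *	struct spdk_bdev_ext_io_opts opts = {};
 *	struct iovec iov = { .iov_base = data_buf, .iov_len = data_len };
 *	int rc;
 *
 *	opts.size = sizeof(opts);
 *	opts.metadata = md_buf;   // only valid for bdevs with separate metadata
 *	// opts.memory_domain, opts.memory_domain_ctx and opts.accel_sequence stay
 *	// NULL unless the application actually uses those features.
 *
 *	rc = spdk_bdev_readv_blocks_ext(desc, ch, &iov, 1, offset_blocks,
 *					num_blocks, read_done, NULL, &opts);
 */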
5329 
5330 static int
5331 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5332 			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5333 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5334 {
5335 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5336 	struct spdk_bdev_io *bdev_io;
5337 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5338 
5339 	if (!desc->write) {
5340 		return -EBADF;
5341 	}
5342 
5343 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5344 		return -EINVAL;
5345 	}
5346 
5347 	bdev_io = bdev_channel_get_io(channel);
5348 	if (!bdev_io) {
5349 		return -ENOMEM;
5350 	}
5351 
5352 	bdev_io->internal.ch = channel;
5353 	bdev_io->internal.desc = desc;
5354 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5355 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5356 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5357 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5358 	bdev_io->u.bdev.iovcnt = 1;
5359 	bdev_io->u.bdev.md_buf = md_buf;
5360 	bdev_io->u.bdev.num_blocks = num_blocks;
5361 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5362 	bdev_io->u.bdev.memory_domain = NULL;
5363 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5364 	bdev_io->u.bdev.accel_sequence = NULL;
5365 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5366 
5367 	bdev_io_submit(bdev_io);
5368 	return 0;
5369 }
5370 
5371 int
5372 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5373 		void *buf, uint64_t offset, uint64_t nbytes,
5374 		spdk_bdev_io_completion_cb cb, void *cb_arg)
5375 {
5376 	uint64_t offset_blocks, num_blocks;
5377 
5378 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5379 				 nbytes, &num_blocks) != 0) {
5380 		return -EINVAL;
5381 	}
5382 
5383 	return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
5384 }
5385 
5386 int
5387 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5388 		       void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5389 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5390 {
5391 	return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5392 					 cb, cb_arg);
5393 }
5394 
5395 int
5396 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5397 			       void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5398 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5399 {
5400 	struct iovec iov = {
5401 		.iov_base = buf,
5402 	};
5403 
5404 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5405 		return -EINVAL;
5406 	}
5407 
5408 	if (md_buf && !_is_buf_allocated(&iov)) {
5409 		return -EINVAL;
5410 	}
5411 
5412 	return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5413 					 cb, cb_arg);
5414 }
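
/*
 * Illustrative sketch (not part of the library): writing with a separate
 * metadata buffer.  This only applies when the bdev exposes separate metadata;
 * with interleaved metadata, md_buf must be NULL and the data buffer already
 * contains the metadata.  Buffer names and write_done() are assumptions.
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *	int rc;
 *
 *	if (spdk_bdev_is_md_separate(bdev)) {
 *		// md_buf must hold num_blocks * spdk_bdev_get_md_size(bdev) bytes
 *		// and stay valid until the completion callback runs.
 *		rc = spdk_bdev_write_blocks_with_md(desc, ch, data_buf, md_buf,
 *						    offset_blocks, num_blocks,
 *						    write_done, NULL);
 *	} else {
 *		rc = spdk_bdev_write_blocks(desc, ch, data_buf, offset_blocks,
 *					    num_blocks, write_done, NULL);
 *	}
 */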
5415 
5416 static int
5417 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5418 			   struct iovec *iov, int iovcnt, void *md_buf,
5419 			   uint64_t offset_blocks, uint64_t num_blocks,
5420 			   struct spdk_memory_domain *domain, void *domain_ctx,
5421 			   struct spdk_accel_sequence *seq,
5422 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
5423 {
5424 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5425 	struct spdk_bdev_io *bdev_io;
5426 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5427 
5428 	if (!desc->write) {
5429 		return -EBADF;
5430 	}
5431 
5432 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5433 		return -EINVAL;
5434 	}
5435 
5436 	bdev_io = bdev_channel_get_io(channel);
5437 	if (!bdev_io) {
5438 		return -ENOMEM;
5439 	}
5440 
5441 	bdev_io->internal.ch = channel;
5442 	bdev_io->internal.desc = desc;
5443 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
5444 	bdev_io->u.bdev.iovs = iov;
5445 	bdev_io->u.bdev.iovcnt = iovcnt;
5446 	bdev_io->u.bdev.md_buf = md_buf;
5447 	bdev_io->u.bdev.num_blocks = num_blocks;
5448 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5449 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5450 	bdev_io->internal.memory_domain = domain;
5451 	bdev_io->internal.memory_domain_ctx = domain_ctx;
5452 	bdev_io->internal.accel_sequence = seq;
5453 	bdev_io->internal.has_accel_sequence = seq != NULL;
5454 	bdev_io->u.bdev.memory_domain = domain;
5455 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5456 	bdev_io->u.bdev.accel_sequence = seq;
5457 
5458 	_bdev_io_submit_ext(desc, bdev_io);
5459 
5460 	return 0;
5461 }
5462 
5463 int
5464 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5465 		 struct iovec *iov, int iovcnt,
5466 		 uint64_t offset, uint64_t len,
5467 		 spdk_bdev_io_completion_cb cb, void *cb_arg)
5468 {
5469 	uint64_t offset_blocks, num_blocks;
5470 
5471 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5472 				 len, &num_blocks) != 0) {
5473 		return -EINVAL;
5474 	}
5475 
5476 	return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
5477 }
5478 
5479 int
5480 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5481 			struct iovec *iov, int iovcnt,
5482 			uint64_t offset_blocks, uint64_t num_blocks,
5483 			spdk_bdev_io_completion_cb cb, void *cb_arg)
5484 {
5485 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5486 					  num_blocks, NULL, NULL, NULL, cb, cb_arg);
5487 }
5488 
5489 int
5490 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5491 				struct iovec *iov, int iovcnt, void *md_buf,
5492 				uint64_t offset_blocks, uint64_t num_blocks,
5493 				spdk_bdev_io_completion_cb cb, void *cb_arg)
5494 {
5495 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5496 		return -EINVAL;
5497 	}
5498 
5499 	if (md_buf && !_is_buf_allocated(iov)) {
5500 		return -EINVAL;
5501 	}
5502 
5503 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5504 					  num_blocks, NULL, NULL, NULL, cb, cb_arg);
5505 }
5506 
5507 int
5508 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5509 			    struct iovec *iov, int iovcnt,
5510 			    uint64_t offset_blocks, uint64_t num_blocks,
5511 			    spdk_bdev_io_completion_cb cb, void *cb_arg,
5512 			    struct spdk_bdev_ext_io_opts *opts)
5513 {
5514 	void *md = NULL;
5515 
5516 	if (opts) {
5517 		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
5518 			return -EINVAL;
5519 		}
5520 		md = opts->metadata;
5521 	}
5522 
5523 	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5524 		return -EINVAL;
5525 	}
5526 
5527 	if (md && !_is_buf_allocated(iov)) {
5528 		return -EINVAL;
5529 	}
5530 
5531 	return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks,
5532 					  bdev_get_ext_io_opt(opts, memory_domain, NULL),
5533 					  bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL),
5534 					  bdev_get_ext_io_opt(opts, accel_sequence, NULL),
5535 					  cb, cb_arg);
5536 }
5537 
5538 static void
5539 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5540 {
5541 	struct spdk_bdev_io *parent_io = cb_arg;
5542 	struct spdk_bdev *bdev = parent_io->bdev;
5543 	uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
5544 	int i, rc = 0;
5545 
5546 	if (!success) {
5547 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5548 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5549 		spdk_bdev_free_io(bdev_io);
5550 		return;
5551 	}
5552 
5553 	for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
5554 		rc = memcmp(read_buf,
5555 			    parent_io->u.bdev.iovs[i].iov_base,
5556 			    parent_io->u.bdev.iovs[i].iov_len);
5557 		if (rc) {
5558 			break;
5559 		}
5560 		read_buf += parent_io->u.bdev.iovs[i].iov_len;
5561 	}
5562 
5563 	if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) {
5564 		rc = memcmp(bdev_io->u.bdev.md_buf,
5565 			    parent_io->u.bdev.md_buf,
5566 			    spdk_bdev_get_md_size(bdev));
5567 	}
5568 
5569 	spdk_bdev_free_io(bdev_io);
5570 
5571 	if (rc == 0) {
5572 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
5573 		parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
5574 	} else {
5575 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
5576 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
5577 	}
5578 }
5579 
5580 static void
5581 bdev_compare_do_read(void *_bdev_io)
5582 {
5583 	struct spdk_bdev_io *bdev_io = _bdev_io;
5584 	int rc;
5585 
5586 	rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
5587 				   spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
5588 				   bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5589 				   bdev_compare_do_read_done, bdev_io);
5590 
5591 	if (rc == -ENOMEM) {
5592 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
5593 	} else if (rc != 0) {
5594 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
5595 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5596 	}
5597 }
5598 
5599 static int
5600 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5601 			     struct iovec *iov, int iovcnt, void *md_buf,
5602 			     uint64_t offset_blocks, uint64_t num_blocks,
5603 			     spdk_bdev_io_completion_cb cb, void *cb_arg)
5604 {
5605 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5606 	struct spdk_bdev_io *bdev_io;
5607 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5608 
5609 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5610 		return -EINVAL;
5611 	}
5612 
5613 	bdev_io = bdev_channel_get_io(channel);
5614 	if (!bdev_io) {
5615 		return -ENOMEM;
5616 	}
5617 
5618 	bdev_io->internal.ch = channel;
5619 	bdev_io->internal.desc = desc;
5620 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5621 	bdev_io->u.bdev.iovs = iov;
5622 	bdev_io->u.bdev.iovcnt = iovcnt;
5623 	bdev_io->u.bdev.md_buf = md_buf;
5624 	bdev_io->u.bdev.num_blocks = num_blocks;
5625 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5626 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5627 	bdev_io->u.bdev.memory_domain = NULL;
5628 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5629 	bdev_io->u.bdev.accel_sequence = NULL;
5630 
5631 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5632 		bdev_io_submit(bdev_io);
5633 		return 0;
5634 	}
5635 
5636 	bdev_compare_do_read(bdev_io);
5637 
5638 	return 0;
5639 }
5640 
5641 int
5642 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5643 			  struct iovec *iov, int iovcnt,
5644 			  uint64_t offset_blocks, uint64_t num_blocks,
5645 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5646 {
5647 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5648 					    num_blocks, cb, cb_arg);
5649 }
5650 
5651 int
5652 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5653 				  struct iovec *iov, int iovcnt, void *md_buf,
5654 				  uint64_t offset_blocks, uint64_t num_blocks,
5655 				  spdk_bdev_io_completion_cb cb, void *cb_arg)
5656 {
5657 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5658 		return -EINVAL;
5659 	}
5660 
5661 	if (md_buf && !_is_buf_allocated(iov)) {
5662 		return -EINVAL;
5663 	}
5664 
5665 	return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5666 					    num_blocks, cb, cb_arg);
5667 }
5668 
5669 static int
5670 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5671 			    void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5672 			    spdk_bdev_io_completion_cb cb, void *cb_arg)
5673 {
5674 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5675 	struct spdk_bdev_io *bdev_io;
5676 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5677 
5678 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5679 		return -EINVAL;
5680 	}
5681 
5682 	bdev_io = bdev_channel_get_io(channel);
5683 	if (!bdev_io) {
5684 		return -ENOMEM;
5685 	}
5686 
5687 	bdev_io->internal.ch = channel;
5688 	bdev_io->internal.desc = desc;
5689 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
5690 	bdev_io->u.bdev.iovs = &bdev_io->iov;
5691 	bdev_io->u.bdev.iovs[0].iov_base = buf;
5692 	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
5693 	bdev_io->u.bdev.iovcnt = 1;
5694 	bdev_io->u.bdev.md_buf = md_buf;
5695 	bdev_io->u.bdev.num_blocks = num_blocks;
5696 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5697 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5698 	bdev_io->u.bdev.memory_domain = NULL;
5699 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5700 	bdev_io->u.bdev.accel_sequence = NULL;
5701 
5702 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
5703 		bdev_io_submit(bdev_io);
5704 		return 0;
5705 	}
5706 
5707 	bdev_compare_do_read(bdev_io);
5708 
5709 	return 0;
5710 }
5711 
5712 int
5713 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5714 			 void *buf, uint64_t offset_blocks, uint64_t num_blocks,
5715 			 spdk_bdev_io_completion_cb cb, void *cb_arg)
5716 {
5717 	return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
5718 					   cb, cb_arg);
5719 }
5720 
5721 int
5722 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5723 				 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
5724 				 spdk_bdev_io_completion_cb cb, void *cb_arg)
5725 {
5726 	struct iovec iov = {
5727 		.iov_base = buf,
5728 	};
5729 
5730 	if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
5731 		return -EINVAL;
5732 	}
5733 
5734 	if (md_buf && !_is_buf_allocated(&iov)) {
5735 		return -EINVAL;
5736 	}
5737 
5738 	return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
5739 					   cb, cb_arg);
5740 }
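
/*
 * Illustrative sketch (not part of the library): comparing a buffer against
 * on-disk data.  When the backing module does not support COMPARE natively,
 * the code above emulates it by reading the range and memcmp()-ing it.  A
 * miscompare is reported to the completion callback as an unsuccessful I/O.
 * The buffer name and compare_done() callback are assumptions.
 *
 *	static void
 *	compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		// success == false covers both a miscompare and other I/O errors.
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_compare_blocks(desc, ch, expected_buf, offset_blocks,
 *				      num_blocks, compare_done, NULL);
 */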
5741 
5742 static void
5743 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status)
5744 {
5745 	struct spdk_bdev_io *bdev_io = ctx;
5746 
5747 	if (unlock_status) {
5748 		SPDK_ERRLOG("LBA range unlock failed\n");
5749 	}
5750 
5751 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true :
5752 			     false, bdev_io->internal.caller_ctx);
5753 }
5754 
5755 static void
5756 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
5757 {
5758 	bdev_io->internal.status = status;
5759 
5760 	bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
5761 			      bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5762 			      bdev_comparev_and_writev_blocks_unlocked, bdev_io);
5763 }
5764 
5765 static void
5766 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5767 {
5768 	struct spdk_bdev_io *parent_io = cb_arg;
5769 
5770 	if (!success) {
5771 		SPDK_ERRLOG("Compare and write operation failed\n");
5772 	}
5773 
5774 	spdk_bdev_free_io(bdev_io);
5775 
5776 	bdev_comparev_and_writev_blocks_unlock(parent_io,
5777 					       success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
5778 }
5779 
5780 static void
5781 bdev_compare_and_write_do_write(void *_bdev_io)
5782 {
5783 	struct spdk_bdev_io *bdev_io = _bdev_io;
5784 	int rc;
5785 
5786 	rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
5787 				     spdk_io_channel_from_ctx(bdev_io->internal.ch),
5788 				     bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
5789 				     bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5790 				     bdev_compare_and_write_do_write_done, bdev_io);
5791 
5792 
5793 	if (rc == -ENOMEM) {
5794 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
5795 	} else if (rc != 0) {
5796 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
5797 	}
5798 }
5799 
5800 static void
5801 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
5802 {
5803 	struct spdk_bdev_io *parent_io = cb_arg;
5804 
5805 	spdk_bdev_free_io(bdev_io);
5806 
5807 	if (!success) {
5808 		bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
5809 		return;
5810 	}
5811 
5812 	bdev_compare_and_write_do_write(parent_io);
5813 }
5814 
5815 static void
5816 bdev_compare_and_write_do_compare(void *_bdev_io)
5817 {
5818 	struct spdk_bdev_io *bdev_io = _bdev_io;
5819 	int rc;
5820 
5821 	rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
5822 				       spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
5823 				       bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
5824 				       bdev_compare_and_write_do_compare_done, bdev_io);
5825 
5826 	if (rc == -ENOMEM) {
5827 		bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
5828 	} else if (rc != 0) {
5829 		bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
5830 	}
5831 }
5832 
5833 static void
5834 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status)
5835 {
5836 	struct spdk_bdev_io *bdev_io = ctx;
5837 
5838 	if (status) {
5839 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
5840 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
5841 		return;
5842 	}
5843 
5844 	bdev_compare_and_write_do_compare(bdev_io);
5845 }
5846 
5847 int
5848 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5849 				     struct iovec *compare_iov, int compare_iovcnt,
5850 				     struct iovec *write_iov, int write_iovcnt,
5851 				     uint64_t offset_blocks, uint64_t num_blocks,
5852 				     spdk_bdev_io_completion_cb cb, void *cb_arg)
5853 {
5854 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5855 	struct spdk_bdev_io *bdev_io;
5856 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5857 
5858 	if (!desc->write) {
5859 		return -EBADF;
5860 	}
5861 
5862 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5863 		return -EINVAL;
5864 	}
5865 
5866 	if (num_blocks > bdev->acwu) {
5867 		return -EINVAL;
5868 	}
5869 
5870 	bdev_io = bdev_channel_get_io(channel);
5871 	if (!bdev_io) {
5872 		return -ENOMEM;
5873 	}
5874 
5875 	bdev_io->internal.ch = channel;
5876 	bdev_io->internal.desc = desc;
5877 	bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
5878 	bdev_io->u.bdev.iovs = compare_iov;
5879 	bdev_io->u.bdev.iovcnt = compare_iovcnt;
5880 	bdev_io->u.bdev.fused_iovs = write_iov;
5881 	bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
5882 	bdev_io->u.bdev.md_buf = NULL;
5883 	bdev_io->u.bdev.num_blocks = num_blocks;
5884 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5885 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5886 	bdev_io->u.bdev.memory_domain = NULL;
5887 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5888 	bdev_io->u.bdev.accel_sequence = NULL;
5889 
5890 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
5891 		bdev_io_submit(bdev_io);
5892 		return 0;
5893 	}
5894 
5895 	return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
5896 				   bdev_comparev_and_writev_blocks_locked, bdev_io);
5897 }
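
/*
 * Illustrative sketch (not part of the library): an atomic compare-and-write.
 * The request must not exceed the bdev's atomic compare-and-write unit
 * (spdk_bdev_get_acwu()); when the module lacks native support, the code above
 * emulates the operation under an LBA range lock.  Buffer names and the
 * caw_done() callback are assumptions.
 *
 *	struct iovec cmp_iov = { .iov_base = expected_buf, .iov_len = len };
 *	struct iovec wr_iov  = { .iov_base = new_buf, .iov_len = len };
 *	int rc;
 *
 *	if (num_blocks <= spdk_bdev_get_acwu(spdk_bdev_desc_get_bdev(desc))) {
 *		rc = spdk_bdev_comparev_and_writev_blocks(desc, ch, &cmp_iov, 1,
 *							  &wr_iov, 1, offset_blocks,
 *							  num_blocks, caw_done, NULL);
 *	}
 */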
5898 
5899 int
5900 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5901 		      struct iovec *iov, int iovcnt,
5902 		      uint64_t offset_blocks, uint64_t num_blocks,
5903 		      bool populate,
5904 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
5905 {
5906 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5907 	struct spdk_bdev_io *bdev_io;
5908 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5909 
5910 	if (!desc->write) {
5911 		return -EBADF;
5912 	}
5913 
5914 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5915 		return -EINVAL;
5916 	}
5917 
5918 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
5919 		return -ENOTSUP;
5920 	}
5921 
5922 	bdev_io = bdev_channel_get_io(channel);
5923 	if (!bdev_io) {
5924 		return -ENOMEM;
5925 	}
5926 
5927 	bdev_io->internal.ch = channel;
5928 	bdev_io->internal.desc = desc;
5929 	bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
5930 	bdev_io->u.bdev.num_blocks = num_blocks;
5931 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5932 	bdev_io->u.bdev.iovs = iov;
5933 	bdev_io->u.bdev.iovcnt = iovcnt;
5934 	bdev_io->u.bdev.md_buf = NULL;
5935 	bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
5936 	bdev_io->u.bdev.zcopy.commit = 0;
5937 	bdev_io->u.bdev.zcopy.start = 1;
5938 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5939 	bdev_io->u.bdev.memory_domain = NULL;
5940 	bdev_io->u.bdev.memory_domain_ctx = NULL;
5941 	bdev_io->u.bdev.accel_sequence = NULL;
5942 
5943 	bdev_io_submit(bdev_io);
5944 
5945 	return 0;
5946 }
5947 
5948 int
5949 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
5950 		    spdk_bdev_io_completion_cb cb, void *cb_arg)
5951 {
5952 	if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
5953 		return -EINVAL;
5954 	}
5955 
5956 	bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
5957 	bdev_io->u.bdev.zcopy.start = 0;
5958 	bdev_io->internal.caller_ctx = cb_arg;
5959 	bdev_io->internal.cb = cb;
5960 	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
5961 
5962 	bdev_io_submit(bdev_io);
5963 
5964 	return 0;
5965 }
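
/*
 * Illustrative sketch (not part of the library) of the two-phase zero-copy
 * flow: spdk_bdev_zcopy_start() asks the bdev module to describe its own
 * buffers, spdk_bdev_io_get_iovec() exposes them, and spdk_bdev_zcopy_end()
 * commits (or releases) them.  The bdev_io from the start phase is reused by
 * the end phase and is only freed after the end completion.  The iovec array,
 * zcopy_end_done() and "desc"/"ch" are assumptions of this example.
 *
 *	static void
 *	zcopy_start_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		struct iovec *iovs;
 *		int iovcnt;
 *
 *		if (!success) {
 *			spdk_bdev_free_io(bdev_io);
 *			return;
 *		}
 *		spdk_bdev_io_get_iovec(bdev_io, &iovs, &iovcnt);
 *		// ... fill (or consume) the buffers described by iovs ...
 *		spdk_bdev_zcopy_end(bdev_io, true, zcopy_end_done, NULL);
 *	}
 *
 *	struct iovec iovs[4] = {};   // must stay valid until the zcopy I/O completes
 *	rc = spdk_bdev_zcopy_start(desc, ch, iovs, SPDK_COUNTOF(iovs), offset_blocks,
 *				   num_blocks, false, zcopy_start_done, NULL);
 */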
5966 
5967 int
5968 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5969 		       uint64_t offset, uint64_t len,
5970 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5971 {
5972 	uint64_t offset_blocks, num_blocks;
5973 
5974 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
5975 				 len, &num_blocks) != 0) {
5976 		return -EINVAL;
5977 	}
5978 
5979 	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
5980 }
5981 
5982 int
5983 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5984 			      uint64_t offset_blocks, uint64_t num_blocks,
5985 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
5986 {
5987 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5988 	struct spdk_bdev_io *bdev_io;
5989 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5990 
5991 	if (!desc->write) {
5992 		return -EBADF;
5993 	}
5994 
5995 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
5996 		return -EINVAL;
5997 	}
5998 
5999 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6000 	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6001 		return -ENOTSUP;
6002 	}
6003 
6004 	bdev_io = bdev_channel_get_io(channel);
6005 
6006 	if (!bdev_io) {
6007 		return -ENOMEM;
6008 	}
6009 
6010 	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6011 	bdev_io->internal.ch = channel;
6012 	bdev_io->internal.desc = desc;
6013 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6014 	bdev_io->u.bdev.num_blocks = num_blocks;
6015 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6016 	bdev_io->u.bdev.memory_domain = NULL;
6017 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6018 	bdev_io->u.bdev.accel_sequence = NULL;
6019 
6020 	/* If the write_zeroes request is large and should be split, use the generic
6021 	 * split logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported.
6022 	 *
6023 	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is
6024 	 * supported, or emulate it using regular write requests otherwise.
6025 	 */
6026 	if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) ||
6027 	    bdev_io->internal.split) {
6028 		bdev_io_submit(bdev_io);
6029 		return 0;
6030 	}
6031 
6032 	assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
6033 
6034 	return bdev_write_zero_buffer(bdev_io);
6035 }
6036 
6037 int
6038 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6039 		uint64_t offset, uint64_t nbytes,
6040 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6041 {
6042 	uint64_t offset_blocks, num_blocks;
6043 
6044 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6045 				 nbytes, &num_blocks) != 0) {
6046 		return -EINVAL;
6047 	}
6048 
6049 	return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6050 }
6051 
6052 int
6053 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6054 		       uint64_t offset_blocks, uint64_t num_blocks,
6055 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6056 {
6057 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6058 	struct spdk_bdev_io *bdev_io;
6059 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6060 
6061 	if (!desc->write) {
6062 		return -EBADF;
6063 	}
6064 
6065 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6066 		return -EINVAL;
6067 	}
6068 
6069 	if (num_blocks == 0) {
6070 		SPDK_ERRLOG("Can't unmap 0 blocks\n");
6071 		return -EINVAL;
6072 	}
6073 
6074 	bdev_io = bdev_channel_get_io(channel);
6075 	if (!bdev_io) {
6076 		return -ENOMEM;
6077 	}
6078 
6079 	bdev_io->internal.ch = channel;
6080 	bdev_io->internal.desc = desc;
6081 	bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
6082 
6083 	bdev_io->u.bdev.iovs = &bdev_io->iov;
6084 	bdev_io->u.bdev.iovs[0].iov_base = NULL;
6085 	bdev_io->u.bdev.iovs[0].iov_len = 0;
6086 	bdev_io->u.bdev.iovcnt = 1;
6087 
6088 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6089 	bdev_io->u.bdev.num_blocks = num_blocks;
6090 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6091 	bdev_io->u.bdev.memory_domain = NULL;
6092 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6093 	bdev_io->u.bdev.accel_sequence = NULL;
6094 
6095 	bdev_io_submit(bdev_io);
6096 	return 0;
6097 }
6098 
6099 int
6100 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6101 		uint64_t offset, uint64_t length,
6102 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6103 {
6104 	uint64_t offset_blocks, num_blocks;
6105 
6106 	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6107 				 length, &num_blocks) != 0) {
6108 		return -EINVAL;
6109 	}
6110 
6111 	return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6112 }
6113 
6114 int
6115 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6116 		       uint64_t offset_blocks, uint64_t num_blocks,
6117 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
6118 {
6119 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6120 	struct spdk_bdev_io *bdev_io;
6121 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6122 
6123 	if (!desc->write) {
6124 		return -EBADF;
6125 	}
6126 
6127 	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6128 		return -EINVAL;
6129 	}
6130 
6131 	bdev_io = bdev_channel_get_io(channel);
6132 	if (!bdev_io) {
6133 		return -ENOMEM;
6134 	}
6135 
6136 	bdev_io->internal.ch = channel;
6137 	bdev_io->internal.desc = desc;
6138 	bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6139 	bdev_io->u.bdev.iovs = NULL;
6140 	bdev_io->u.bdev.iovcnt = 0;
6141 	bdev_io->u.bdev.offset_blocks = offset_blocks;
6142 	bdev_io->u.bdev.num_blocks = num_blocks;
6143 	bdev_io->u.bdev.memory_domain = NULL;
6144 	bdev_io->u.bdev.memory_domain_ctx = NULL;
6145 	bdev_io->u.bdev.accel_sequence = NULL;
6146 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6147 
6148 	bdev_io_submit(bdev_io);
6149 	return 0;
6150 }
6151 
6152 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6153 
6154 static void
6155 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6156 {
6157 	struct spdk_bdev_channel *ch = _ctx;
6158 	struct spdk_bdev_io *bdev_io;
6159 
6160 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6161 
6162 	if (status == -EBUSY) {
6163 		if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6164 			bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6165 							      ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6166 		} else {
6167 			TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6168 
6169 			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6170 				/* If outstanding I/Os are still present and reset_io_drain_timeout
6171 				 * seconds have passed, start the reset. */
6172 				bdev_io_submit_reset(bdev_io);
6173 			} else {
6174 				/* We still have in-progress memory domain pulls/pushes or we are
6175 				 * executing an accel sequence.  Since we cannot abort either of those
6176 				 * operations, fail the reset request. */
6177 				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6178 			}
6179 		}
6180 	} else {
6181 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6182 		SPDK_DEBUGLOG(bdev,
6183 			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6184 			      ch->bdev->name);
6185 		/* Mark the completion status as a SUCCESS and complete the reset. */
6186 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6187 	}
6188 }
6189 
6190 static void
6191 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6192 				struct spdk_io_channel *io_ch, void *_ctx)
6193 {
6194 	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6195 	int status = 0;
6196 
6197 	if (cur_ch->io_outstanding > 0 ||
6198 	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6199 	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6200 		/* If a channel has outstanding I/O, set the status to -EBUSY. This stops
6201 		 * further iteration over the rest of the channels and passes the non-zero
6202 		 * status to the callback function. */
6203 		status = -EBUSY;
6204 	}
6205 	spdk_bdev_for_each_channel_continue(i, status);
6206 }
6207 
6208 static int
6209 bdev_reset_poll_for_outstanding_io(void *ctx)
6210 {
6211 	struct spdk_bdev_channel *ch = ctx;
6212 	struct spdk_bdev_io *bdev_io;
6213 
6214 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6215 
6216 	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
6217 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6218 				   bdev_reset_check_outstanding_io_done);
6219 
6220 	return SPDK_POLLER_BUSY;
6221 }
6222 
6223 static void
6224 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
6225 {
6226 	struct spdk_bdev_channel *ch = _ctx;
6227 	struct spdk_bdev_io *bdev_io;
6228 
6229 	bdev_io = TAILQ_FIRST(&ch->queued_resets);
6230 
6231 	if (bdev->reset_io_drain_timeout == 0) {
6232 		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6233 
6234 		bdev_io_submit_reset(bdev_io);
6235 		return;
6236 	}
6237 
6238 	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
6239 			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
6240 
6241 	/* When bdev->reset_io_drain_timeout is non-zero, submit the reset to the
6242 	 * underlying module only if outstanding I/Os still remain after
6243 	 * reset_io_drain_timeout seconds have passed. */
6244 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
6245 				   bdev_reset_check_outstanding_io_done);
6246 }
6247 
6248 static void
6249 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6250 			  struct spdk_io_channel *ch, void *_ctx)
6251 {
6252 	struct spdk_bdev_channel	*channel;
6253 	struct spdk_bdev_mgmt_channel	*mgmt_channel;
6254 	struct spdk_bdev_shared_resource *shared_resource;
6255 	bdev_io_tailq_t			tmp_queued;
6256 
6257 	TAILQ_INIT(&tmp_queued);
6258 
6259 	channel = __io_ch_to_bdev_ch(ch);
6260 	shared_resource = channel->shared_resource;
6261 	mgmt_channel = shared_resource->mgmt_ch;
6262 
6263 	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
6264 
6265 	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
6266 		/* The QoS object is always valid and readable while
6267 		 * the channel flag is set, so the lock here should not
6268 		 * be necessary. We're not in the fast path though, so
6269 		 * just take it anyway. */
6270 		spdk_spin_lock(&channel->bdev->internal.spinlock);
6271 		if (channel->bdev->internal.qos->ch == channel) {
6272 			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
6273 		}
6274 		spdk_spin_unlock(&channel->bdev->internal.spinlock);
6275 	}
6276 
6277 	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
6278 	bdev_abort_all_buf_io(mgmt_channel, channel);
6279 	bdev_abort_all_queued_io(&tmp_queued, channel);
6280 
6281 	spdk_bdev_for_each_channel_continue(i, 0);
6282 }
6283 
6284 static void
6285 bdev_start_reset(void *ctx)
6286 {
6287 	struct spdk_bdev_channel *ch = ctx;
6288 
6289 	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
6290 				   bdev_reset_freeze_channel_done);
6291 }
6292 
6293 static void
6294 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
6295 {
6296 	struct spdk_bdev *bdev = ch->bdev;
6297 
6298 	assert(!TAILQ_EMPTY(&ch->queued_resets));
6299 
6300 	spdk_spin_lock(&bdev->internal.spinlock);
6301 	if (bdev->internal.reset_in_progress == NULL) {
6302 		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
6303 		/*
6304 		 * Take a channel reference for the target bdev for the life of this
6305 		 *  reset.  This guards against the channel getting destroyed while
6306 		 *  spdk_bdev_for_each_channel() calls related to this reset IO are in
6307 		 *  progress.  We will release the reference when this reset is
6308 		 *  completed.
6309 		 */
6310 		bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
6311 		bdev_start_reset(ch);
6312 	}
6313 	spdk_spin_unlock(&bdev->internal.spinlock);
6314 }
6315 
6316 int
6317 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6318 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6319 {
6320 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6321 	struct spdk_bdev_io *bdev_io;
6322 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6323 
6324 	bdev_io = bdev_channel_get_io(channel);
6325 	if (!bdev_io) {
6326 		return -ENOMEM;
6327 	}
6328 
6329 	bdev_io->internal.ch = channel;
6330 	bdev_io->internal.desc = desc;
6331 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6332 	bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
6333 	bdev_io->u.reset.ch_ref = NULL;
6334 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6335 
6336 	spdk_spin_lock(&bdev->internal.spinlock);
6337 	TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
6338 	spdk_spin_unlock(&bdev->internal.spinlock);
6339 
6340 	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
6341 			  internal.ch_link);
6342 
6343 	bdev_channel_start_reset(channel);
6344 
6345 	return 0;
6346 }
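
/*
 * Illustrative sketch (not part of the library): issuing a reset.  The reset
 * is queued per channel and, depending on reset_io_drain_timeout, may first
 * wait for outstanding I/O to drain (see bdev_reset_freeze_channel_done()
 * above).  The reset_done() callback name is an assumption.
 *
 *	static void
 *	reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		SPDK_NOTICELOG("bdev reset %s\n", success ? "succeeded" : "failed");
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_reset(desc, ch, reset_done, NULL);
 */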
6347 
6348 void
6349 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6350 		      struct spdk_bdev_io_stat *stat)
6351 {
6352 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6353 
6354 	bdev_get_io_stat(stat, channel->stat);
6355 }
6356 
6357 static void
6358 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6359 {
6360 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6361 
6362 	bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
6363 			    bdev_iostat_ctx->cb_arg, 0);
6364 	free(bdev_iostat_ctx);
6365 }
6366 
6367 static void
6368 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6369 			   struct spdk_io_channel *ch, void *_ctx)
6370 {
6371 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
6372 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6373 
6374 	spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat);
6375 	spdk_bdev_for_each_channel_continue(i, 0);
6376 }
6377 
6378 void
6379 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
6380 			  spdk_bdev_get_device_stat_cb cb, void *cb_arg)
6381 {
6382 	struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
6383 
6384 	assert(bdev != NULL);
6385 	assert(stat != NULL);
6386 	assert(cb != NULL);
6387 
6388 	bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
6389 	if (bdev_iostat_ctx == NULL) {
6390 		SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
6391 		cb(bdev, stat, cb_arg, -ENOMEM);
6392 		return;
6393 	}
6394 
6395 	bdev_iostat_ctx->stat = stat;
6396 	bdev_iostat_ctx->cb = cb;
6397 	bdev_iostat_ctx->cb_arg = cb_arg;
6398 
6399 	/* Start with the statistics from previously deleted channels. */
6400 	spdk_spin_lock(&bdev->internal.spinlock);
6401 	bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat);
6402 	spdk_spin_unlock(&bdev->internal.spinlock);
6403 
6404 	/* Then iterate and add the statistics from each existing channel. */
6405 	spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx,
6406 				   bdev_get_device_stat_done);
6407 }
6408 
6409 struct bdev_iostat_reset_ctx {
6410 	enum spdk_bdev_reset_stat_mode mode;
6411 	bdev_reset_device_stat_cb cb;
6412 	void *cb_arg;
6413 };
6414 
6415 static void
6416 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
6417 {
6418 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6419 
6420 	ctx->cb(bdev, ctx->cb_arg, 0);
6421 
6422 	free(ctx);
6423 }
6424 
6425 static void
6426 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6427 			     struct spdk_io_channel *ch, void *_ctx)
6428 {
6429 	struct bdev_iostat_reset_ctx *ctx = _ctx;
6430 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6431 
6432 	spdk_bdev_reset_io_stat(channel->stat, ctx->mode);
6433 
6434 	spdk_bdev_for_each_channel_continue(i, 0);
6435 }
6436 
6437 void
6438 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode,
6439 		       bdev_reset_device_stat_cb cb, void *cb_arg)
6440 {
6441 	struct bdev_iostat_reset_ctx *ctx;
6442 
6443 	assert(bdev != NULL);
6444 	assert(cb != NULL);
6445 
6446 	ctx = calloc(1, sizeof(*ctx));
6447 	if (ctx == NULL) {
6448 		SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n");
6449 		cb(bdev, cb_arg, -ENOMEM);
6450 		return;
6451 	}
6452 
6453 	ctx->mode = mode;
6454 	ctx->cb = cb;
6455 	ctx->cb_arg = cb_arg;
6456 
6457 	spdk_spin_lock(&bdev->internal.spinlock);
6458 	spdk_bdev_reset_io_stat(bdev->internal.stat, mode);
6459 	spdk_spin_unlock(&bdev->internal.spinlock);
6460 
6461 	spdk_bdev_for_each_channel(bdev,
6462 				   bdev_reset_each_channel_stat,
6463 				   ctx,
6464 				   bdev_reset_device_stat_done);
6465 }
6466 
6467 int
6468 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6469 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6470 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6471 {
6472 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6473 	struct spdk_bdev_io *bdev_io;
6474 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6475 
6476 	if (!desc->write) {
6477 		return -EBADF;
6478 	}
6479 
6480 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) {
6481 		return -ENOTSUP;
6482 	}
6483 
6484 	bdev_io = bdev_channel_get_io(channel);
6485 	if (!bdev_io) {
6486 		return -ENOMEM;
6487 	}
6488 
6489 	bdev_io->internal.ch = channel;
6490 	bdev_io->internal.desc = desc;
6491 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
6492 	bdev_io->u.nvme_passthru.cmd = *cmd;
6493 	bdev_io->u.nvme_passthru.buf = buf;
6494 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6495 	bdev_io->u.nvme_passthru.md_buf = NULL;
6496 	bdev_io->u.nvme_passthru.md_len = 0;
6497 
6498 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6499 
6500 	bdev_io_submit(bdev_io);
6501 	return 0;
6502 }
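
/*
 * Illustrative sketch (not part of the library): sending an NVMe admin command
 * through the passthru interface, here an Identify Controller.  This only
 * works on bdevs that support SPDK_BDEV_IO_TYPE_NVME_ADMIN (e.g. NVMe bdevs).
 * The 4 KiB buffer size and the admin_done() callback are assumptions.
 *
 *	struct spdk_nvme_cmd cmd = {};
 *	void *buf = spdk_dma_zmalloc(4096, 4096, NULL);
 *
 *	cmd.opc = SPDK_NVME_OPC_IDENTIFY;
 *	cmd.cdw10 = SPDK_NVME_IDENTIFY_CTRLR;
 *
 *	rc = spdk_bdev_nvme_admin_passthru(desc, ch, &cmd, buf, 4096,
 *					   admin_done, buf);
 */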
6503 
6504 int
6505 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6506 			   const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
6507 			   spdk_bdev_io_completion_cb cb, void *cb_arg)
6508 {
6509 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6510 	struct spdk_bdev_io *bdev_io;
6511 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6512 
6513 	if (!desc->write) {
6514 		/*
6515 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6516 		 *  to easily determine if the command is a read or write, but for now just
6517 		 *  do not allow io_passthru with a read-only descriptor.
6518 		 */
6519 		return -EBADF;
6520 	}
6521 
6522 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) {
6523 		return -ENOTSUP;
6524 	}
6525 
6526 	bdev_io = bdev_channel_get_io(channel);
6527 	if (!bdev_io) {
6528 		return -ENOMEM;
6529 	}
6530 
6531 	bdev_io->internal.ch = channel;
6532 	bdev_io->internal.desc = desc;
6533 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
6534 	bdev_io->u.nvme_passthru.cmd = *cmd;
6535 	bdev_io->u.nvme_passthru.buf = buf;
6536 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6537 	bdev_io->u.nvme_passthru.md_buf = NULL;
6538 	bdev_io->u.nvme_passthru.md_len = 0;
6539 
6540 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6541 
6542 	bdev_io_submit(bdev_io);
6543 	return 0;
6544 }
6545 
6546 int
6547 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6548 			      const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
6549 			      spdk_bdev_io_completion_cb cb, void *cb_arg)
6550 {
6551 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6552 	struct spdk_bdev_io *bdev_io;
6553 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6554 
6555 	if (!desc->write) {
6556 		/*
6557 		 * Do not try to parse the NVMe command - we could maybe use bits in the opcode
6558 		 *  to easily determine if the command is a read or write, but for now just
6559 		 *  do not allow io_passthru with a read-only descriptor.
6560 		 */
6561 		return -EBADF;
6562 	}
6563 
6564 	if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) {
6565 		return -ENOTSUP;
6566 	}
6567 
6568 	bdev_io = bdev_channel_get_io(channel);
6569 	if (!bdev_io) {
6570 		return -ENOMEM;
6571 	}
6572 
6573 	bdev_io->internal.ch = channel;
6574 	bdev_io->internal.desc = desc;
6575 	bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
6576 	bdev_io->u.nvme_passthru.cmd = *cmd;
6577 	bdev_io->u.nvme_passthru.buf = buf;
6578 	bdev_io->u.nvme_passthru.nbytes = nbytes;
6579 	bdev_io->u.nvme_passthru.md_buf = md_buf;
6580 	bdev_io->u.nvme_passthru.md_len = md_len;
6581 
6582 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6583 
6584 	bdev_io_submit(bdev_io);
6585 	return 0;
6586 }
6587 
6588 static void bdev_abort_retry(void *ctx);
6589 static void bdev_abort(struct spdk_bdev_io *parent_io);
6590 
6591 static void
6592 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
6593 {
6594 	struct spdk_bdev_channel *channel = bdev_io->internal.ch;
6595 	struct spdk_bdev_io *parent_io = cb_arg;
6596 	struct spdk_bdev_io *bio_to_abort, *tmp_io;
6597 
6598 	bio_to_abort = bdev_io->u.abort.bio_to_abort;
6599 
6600 	spdk_bdev_free_io(bdev_io);
6601 
6602 	if (!success) {
6603 		/* Check if the target I/O completed in the meantime. */
6604 		TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
6605 			if (tmp_io == bio_to_abort) {
6606 				break;
6607 			}
6608 		}
6609 
6610 		/* If the target I/O still exists, set the parent to failed. */
6611 		if (tmp_io != NULL) {
6612 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6613 		}
6614 	}
6615 
6616 	parent_io->u.bdev.split_outstanding--;
6617 	if (parent_io->u.bdev.split_outstanding == 0) {
6618 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6619 			bdev_abort_retry(parent_io);
6620 		} else {
6621 			bdev_io_complete(parent_io);
6622 		}
6623 	}
6624 }
6625 
6626 static int
6627 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
6628 	      struct spdk_bdev_io *bio_to_abort,
6629 	      spdk_bdev_io_completion_cb cb, void *cb_arg)
6630 {
6631 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6632 	struct spdk_bdev_io *bdev_io;
6633 
6634 	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
6635 	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
6636 		/* TODO: Abort reset or abort request. */
6637 		return -ENOTSUP;
6638 	}
6639 
6640 	bdev_io = bdev_channel_get_io(channel);
6641 	if (bdev_io == NULL) {
6642 		return -ENOMEM;
6643 	}
6644 
6645 	bdev_io->internal.ch = channel;
6646 	bdev_io->internal.desc = desc;
6647 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6648 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6649 
6650 	if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) {
6651 		assert(bdev_io_should_split(bio_to_abort));
6652 		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
6653 
6654 		/* The parent abort request is not submitted directly, but to manage
6655 		 * its execution, add it to the submitted list here.
6656 		 */
6657 		bdev_io->internal.submit_tsc = spdk_get_ticks();
6658 		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
6659 
6660 		bdev_abort(bdev_io);
6661 
6662 		return 0;
6663 	}
6664 
6665 	bdev_io->u.abort.bio_to_abort = bio_to_abort;
6666 
6667 	/* Submit the abort request to the underlying bdev module. */
6668 	bdev_io_submit(bdev_io);
6669 
6670 	return 0;
6671 }
6672 
6673 static bool
6674 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
6675 {
6676 	struct spdk_bdev_io *iter;
6677 
6678 	TAILQ_FOREACH(iter, tailq, internal.link) {
6679 		if (iter == bdev_io) {
6680 			return true;
6681 		}
6682 	}
6683 
6684 	return false;
6685 }
6686 
6687 static uint32_t
6688 _bdev_abort(struct spdk_bdev_io *parent_io)
6689 {
6690 	struct spdk_bdev_desc *desc = parent_io->internal.desc;
6691 	struct spdk_bdev_channel *channel = parent_io->internal.ch;
6692 	void *bio_cb_arg;
6693 	struct spdk_bdev_io *bio_to_abort;
6694 	uint32_t matched_ios;
6695 	int rc;
6696 
6697 	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
6698 
6699 	/* matched_ios is returned and kept by the caller as split_outstanding.
6700 	 *
6701 	 * This function is used in two cases: 1) the same cb_arg is used for
6702 	 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
6703 	 * Incrementing split_outstanding directly here could confuse readers,
6704 	 * especially in the first case.
6705 	 *
6706 	 * Completion of an I/O abort is only processed after the stack unwinds,
6707 	 * so letting the caller set split_outstanding afterwards works as expected.
6708 	 */
6709 	matched_ios = 0;
6710 	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
6711 
6712 	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
6713 		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
6714 			continue;
6715 		}
6716 
6717 		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
6718 			/* Any I/O which was submitted after this abort command should be excluded. */
6719 			continue;
6720 		}
6721 
6722 		/* We can't abort a request that's being pushed/pulled or executed by accel */
6723 		if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
6724 		    bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
6725 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6726 			break;
6727 		}
6728 
6729 		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
6730 		if (rc != 0) {
6731 			if (rc == -ENOMEM) {
6732 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
6733 			} else {
6734 				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6735 			}
6736 			break;
6737 		}
6738 		matched_ios++;
6739 	}
6740 
6741 	return matched_ios;
6742 }
6743 
6744 static void
6745 bdev_abort_retry(void *ctx)
6746 {
6747 	struct spdk_bdev_io *parent_io = ctx;
6748 	uint32_t matched_ios;
6749 
6750 	matched_ios = _bdev_abort(parent_io);
6751 
6752 	if (matched_ios == 0) {
6753 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6754 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6755 		} else {
6756 			/* On retry, finding no target I/O is treated as success,
6757 			 * because it means the target I/Os completed in the meantime.
6758 			 */
6759 			bdev_io_complete(parent_io);
6760 		}
6761 		return;
6762 	}
6763 
6764 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6765 	parent_io->u.bdev.split_outstanding = matched_ios;
6766 }
6767 
6768 static void
6769 bdev_abort(struct spdk_bdev_io *parent_io)
6770 {
6771 	uint32_t matched_ios;
6772 
6773 	matched_ios = _bdev_abort(parent_io);
6774 
6775 	if (matched_ios == 0) {
6776 		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
6777 			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
6778 		} else {
6779 			/* If no target I/O was found, the abort is treated as a failure. */
6780 			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
6781 			bdev_io_complete(parent_io);
6782 		}
6783 		return;
6784 	}
6785 
6786 	/* Use split_outstanding to manage the progress of aborting I/Os. */
6787 	parent_io->u.bdev.split_outstanding = matched_ios;
6788 }
6789 
6790 int
6791 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6792 		void *bio_cb_arg,
6793 		spdk_bdev_io_completion_cb cb, void *cb_arg)
6794 {
6795 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6796 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6797 	struct spdk_bdev_io *bdev_io;
6798 
6799 	if (bio_cb_arg == NULL) {
6800 		return -EINVAL;
6801 	}
6802 
6803 	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
6804 		return -ENOTSUP;
6805 	}
6806 
6807 	bdev_io = bdev_channel_get_io(channel);
6808 	if (bdev_io == NULL) {
6809 		return -ENOMEM;
6810 	}
6811 
6812 	bdev_io->internal.ch = channel;
6813 	bdev_io->internal.desc = desc;
6814 	bdev_io->internal.submit_tsc = spdk_get_ticks();
6815 	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
6816 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
6817 
6818 	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
6819 
6820 	/* The parent abort request is not submitted directly, but to manage its
6821 	 * execution, add it to the submitted list here.
6822 	 */
6823 	TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
6824 
6825 	bdev_abort(bdev_io);
6826 
6827 	return 0;
6828 }
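
/*
 * Illustrative sketch (not part of the library): aborting all outstanding I/O
 * that was submitted with a given cb_arg.  The abort completes successfully
 * only if every matched I/O could be aborted; finding no matching I/O is
 * reported as a failure (see bdev_abort() above).  "io_ctx" and abort_done()
 * are assumptions of this example.
 *
 *	// "io_ctx" is the same cb_arg that was passed when the I/O was submitted.
 *	rc = spdk_bdev_abort(desc, ch, io_ctx, abort_done, NULL);
 *	if (rc == -ENOTSUP) {
 *		// The underlying bdev module does not support I/O aborts.
 *	}
 */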
6829 
6830 int
6831 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
6832 			struct spdk_bdev_io_wait_entry *entry)
6833 {
6834 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6835 	struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
6836 
6837 	if (bdev != entry->bdev) {
6838 		SPDK_ERRLOG("bdevs do not match\n");
6839 		return -EINVAL;
6840 	}
6841 
6842 	if (mgmt_ch->per_thread_cache_count > 0) {
6843 		SPDK_ERRLOG("Cannot queue io_wait if a spdk_bdev_io is available in the per-thread cache\n");
6844 		return -EINVAL;
6845 	}
6846 
6847 	TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
6848 	return 0;
6849 }
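
/*
 * Illustrative sketch (not part of the library) of the standard -ENOMEM retry
 * pattern: when a submission fails because the channel has no free
 * spdk_bdev_io, register an spdk_bdev_io_wait_entry and resubmit from its
 * callback.  The entry must remain valid until the callback fires.  The
 * "struct my_request" context, submit_read() and read_done() are hypothetical.
 *
 *	static void
 *	retry_read(void *arg)
 *	{
 *		struct my_request *req = arg;   // hypothetical per-I/O context
 *
 *		submit_read(req);               // re-issues spdk_bdev_read_blocks()
 *	}
 *
 *	rc = spdk_bdev_read_blocks(desc, ch, req->buf, req->offset, req->num_blocks,
 *				   read_done, req);
 *	if (rc == -ENOMEM) {
 *		req->wait_entry.bdev = spdk_bdev_desc_get_bdev(desc);
 *		req->wait_entry.cb_fn = retry_read;
 *		req->wait_entry.cb_arg = req;
 *		spdk_bdev_queue_io_wait(req->wait_entry.bdev, ch, &req->wait_entry);
 *	}
 */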
6850 
6851 static inline void
6852 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff)
6853 {
6854 	enum spdk_bdev_io_status io_status = bdev_io->internal.status;
6855 	struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat;
6856 	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
6857 	uint32_t blocklen = bdev_io->bdev->blocklen;
6858 
6859 	if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
6860 		switch (bdev_io->type) {
6861 		case SPDK_BDEV_IO_TYPE_READ:
6862 			io_stat->bytes_read += num_blocks * blocklen;
6863 			io_stat->num_read_ops++;
6864 			io_stat->read_latency_ticks += tsc_diff;
6865 			if (io_stat->max_read_latency_ticks < tsc_diff) {
6866 				io_stat->max_read_latency_ticks = tsc_diff;
6867 			}
6868 			if (io_stat->min_read_latency_ticks > tsc_diff) {
6869 				io_stat->min_read_latency_ticks = tsc_diff;
6870 			}
6871 			break;
6872 		case SPDK_BDEV_IO_TYPE_WRITE:
6873 			io_stat->bytes_written += num_blocks * blocklen;
6874 			io_stat->num_write_ops++;
6875 			io_stat->write_latency_ticks += tsc_diff;
6876 			if (io_stat->max_write_latency_ticks < tsc_diff) {
6877 				io_stat->max_write_latency_ticks = tsc_diff;
6878 			}
6879 			if (io_stat->min_write_latency_ticks > tsc_diff) {
6880 				io_stat->min_write_latency_ticks = tsc_diff;
6881 			}
6882 			break;
6883 		case SPDK_BDEV_IO_TYPE_UNMAP:
6884 			io_stat->bytes_unmapped += num_blocks * blocklen;
6885 			io_stat->num_unmap_ops++;
6886 			io_stat->unmap_latency_ticks += tsc_diff;
6887 			if (io_stat->max_unmap_latency_ticks < tsc_diff) {
6888 				io_stat->max_unmap_latency_ticks = tsc_diff;
6889 			}
6890 			if (io_stat->min_unmap_latency_ticks > tsc_diff) {
6891 				io_stat->min_unmap_latency_ticks = tsc_diff;
6892 			}
6893 			break;
6894 		case SPDK_BDEV_IO_TYPE_ZCOPY:
6895 			/* Track the data in the start phase only */
6896 			if (bdev_io->u.bdev.zcopy.start) {
6897 				if (bdev_io->u.bdev.zcopy.populate) {
6898 					io_stat->bytes_read += num_blocks * blocklen;
6899 					io_stat->num_read_ops++;
6900 					io_stat->read_latency_ticks += tsc_diff;
6901 					if (io_stat->max_read_latency_ticks < tsc_diff) {
6902 						io_stat->max_read_latency_ticks = tsc_diff;
6903 					}
6904 					if (io_stat->min_read_latency_ticks > tsc_diff) {
6905 						io_stat->min_read_latency_ticks = tsc_diff;
6906 					}
6907 				} else {
6908 					io_stat->bytes_written += num_blocks * blocklen;
6909 					io_stat->num_write_ops++;
6910 					io_stat->write_latency_ticks += tsc_diff;
6911 					if (io_stat->max_write_latency_ticks < tsc_diff) {
6912 						io_stat->max_write_latency_ticks = tsc_diff;
6913 					}
6914 					if (io_stat->min_write_latency_ticks > tsc_diff) {
6915 						io_stat->min_write_latency_ticks = tsc_diff;
6916 					}
6917 				}
6918 			}
6919 			break;
6920 		case SPDK_BDEV_IO_TYPE_COPY:
6921 			io_stat->bytes_copied += num_blocks * blocklen;
6922 			io_stat->num_copy_ops++;
6923 			io_stat->copy_latency_ticks += tsc_diff;
6924 			if (io_stat->max_copy_latency_ticks < tsc_diff) {
6925 				io_stat->max_copy_latency_ticks = tsc_diff;
6926 			}
6927 			if (io_stat->min_copy_latency_ticks > tsc_diff) {
6928 				io_stat->min_copy_latency_ticks = tsc_diff;
6929 			}
6930 			break;
6931 		default:
6932 			break;
6933 		}
6934 	} else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) {
6935 		io_stat = bdev_io->bdev->internal.stat;
6936 		assert(io_stat->io_error != NULL);
6937 
6938 		spdk_spin_lock(&bdev_io->bdev->internal.spinlock);
6939 		io_stat->io_error->error_status[-io_status - 1]++;
6940 		spdk_spin_unlock(&bdev_io->bdev->internal.spinlock);
6941 	}
6942 
6943 #ifdef SPDK_CONFIG_VTUNE
6944 	uint64_t now_tsc = spdk_get_ticks();
6945 	if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
6946 		uint64_t data[5];
6947 		struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat;
6948 
6949 		data[0] = io_stat->num_read_ops - prev_stat->num_read_ops;
6950 		data[1] = io_stat->bytes_read - prev_stat->bytes_read;
6951 		data[2] = io_stat->num_write_ops - prev_stat->num_write_ops;
6952 		data[3] = io_stat->bytes_written - prev_stat->bytes_written;
6953 		data[4] = bdev_io->bdev->fn_table->get_spin_time ?
6954 			  bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
6955 
6956 		__itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
6957 				   __itt_metadata_u64, 5, data);
6958 
6959 		memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat));
6960 		bdev_io->internal.ch->start_tsc = now_tsc;
6961 	}
6962 #endif
6963 }
6964 
6965 static inline void
6966 _bdev_io_complete(void *ctx)
6967 {
6968 	struct spdk_bdev_io *bdev_io = ctx;
6969 
6970 	if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) {
6971 		assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS);
6972 		spdk_accel_sequence_abort(bdev_io->internal.accel_sequence);
6973 	}
6974 
6975 	assert(bdev_io->internal.cb != NULL);
6976 	assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
6977 
6978 	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
6979 			     bdev_io->internal.caller_ctx);
6980 }
6981 
6982 static inline void
6983 bdev_io_complete(void *ctx)
6984 {
6985 	struct spdk_bdev_io *bdev_io = ctx;
6986 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
6987 	uint64_t tsc, tsc_diff;
6988 
6989 	if (spdk_unlikely(bdev_io->internal.in_submit_request)) {
6990 		/*
6991 		 * Defer completion to avoid potential infinite recursion if the
6992 		 * user's completion callback issues a new I/O.
6993 		 */
6994 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
6995 				     bdev_io_complete, bdev_io);
6996 		return;
6997 	}
6998 
6999 	tsc = spdk_get_ticks();
7000 	tsc_diff = tsc - bdev_io->internal.submit_tsc;
7001 	spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io,
7002 			      bdev_io->internal.caller_ctx);
7003 
7004 	TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link);
7005 
7006 	if (bdev_io->internal.ch->histogram) {
7007 		spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff);
7008 	}
7009 
7010 	bdev_io_update_io_stat(bdev_io, tsc_diff);
7011 	_bdev_io_complete(bdev_io);
7012 }
7013 
7014 /* The difference between this function and bdev_io_complete() is that this should be called to
7015  * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the
7016  * io_submitted list and don't have submit_tsc updated.
7017  */
7018 static inline void
7019 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io)
7020 {
7021 	/* Since the IO hasn't been submitted it's bound to be failed */
7022 	/* Since the IO hasn't been submitted, it is bound to have failed */
7023 
7024 	/* At this point we don't know if the IO is completed from submission context or not, but,
7025 	 * since this is an error path, we can always do an spdk_thread_send_msg(). */
7026 	spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
7027 			     _bdev_io_complete, bdev_io);
7028 }
7029 
7030 static void bdev_destroy_cb(void *io_device);
7031 
7032 static void
7033 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status)
7034 {
7035 	struct spdk_bdev_io *bdev_io = _ctx;
7036 
7037 	if (bdev_io->u.reset.ch_ref != NULL) {
7038 		spdk_put_io_channel(bdev_io->u.reset.ch_ref);
7039 		bdev_io->u.reset.ch_ref = NULL;
7040 	}
7041 
7042 	bdev_io_complete(bdev_io);
7043 
7044 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING &&
7045 	    TAILQ_EMPTY(&bdev->internal.open_descs)) {
7046 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7047 	}
7048 }
7049 
7050 static void
7051 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7052 		      struct spdk_io_channel *_ch, void *_ctx)
7053 {
7054 	struct spdk_bdev_io *bdev_io = _ctx;
7055 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
7056 	struct spdk_bdev_io *queued_reset;
7057 
7058 	ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
7059 	while (!TAILQ_EMPTY(&ch->queued_resets)) {
7060 		queued_reset = TAILQ_FIRST(&ch->queued_resets);
7061 		TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
7062 		spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
7063 	}
7064 
7065 	spdk_bdev_for_each_channel_continue(i, 0);
7066 }
7067 
7068 static void
7069 bdev_io_complete_sequence_cb(void *ctx, int status)
7070 {
7071 	struct spdk_bdev_io *bdev_io = ctx;
7072 
7073 	/* u.bdev.accel_sequence should have already been cleared at this point */
7074 	assert(bdev_io->u.bdev.accel_sequence == NULL);
7075 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
7076 	bdev_io->internal.accel_sequence = NULL;
7077 
7078 	if (spdk_unlikely(status != 0)) {
7079 		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
7080 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7081 	}
7082 
7083 	bdev_io_complete(bdev_io);
7084 }
7085 
7086 void
7087 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
7088 {
7089 	struct spdk_bdev *bdev = bdev_io->bdev;
7090 	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
7091 	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
7092 
7093 	if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) {
7094 		SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n",
7095 			    spdk_bdev_get_module_name(bdev),
7096 			    bdev_io_status_get_string(bdev_io->internal.status));
7097 		assert(false);
7098 	}
7099 	bdev_io->internal.status = status;
7100 
7101 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
7102 		bool unlock_channels = false;
7103 
7104 		if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
7105 			SPDK_ERRLOG("NOMEM returned for reset\n");
7106 		}
7107 		spdk_spin_lock(&bdev->internal.spinlock);
7108 		if (bdev_io == bdev->internal.reset_in_progress) {
7109 			bdev->internal.reset_in_progress = NULL;
7110 			unlock_channels = true;
7111 		}
7112 		spdk_spin_unlock(&bdev->internal.spinlock);
7113 
7114 		if (unlock_channels) {
7115 			spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io,
7116 						   bdev_reset_complete);
7117 			return;
7118 		}
7119 	} else {
7120 		bdev_io_decrement_outstanding(bdev_ch, shared_resource);
7121 		if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) {
7122 			if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
7123 				bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb);
7124 				return;
7125 			} else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 &&
7126 						 !bdev_io_use_accel_sequence(bdev_io))) {
7127 				_bdev_io_push_bounce_data_buffer(bdev_io,
7128 								 _bdev_io_complete_push_bounce_done);
7129 				/* bdev IO will be completed in the callback */
7130 				return;
7131 			}
7132 		}
7133 
7134 		if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) {
7135 			return;
7136 		}
7137 	}
7138 
7139 	bdev_io_complete(bdev_io);
7140 }
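
/*
 * Minimal sketch of how a bdev module typically finishes a request (assumed
 * module-side code). The module maps its backend result to a bdev status and
 * hands the spdk_bdev_io back with spdk_bdev_io_complete().
 *
 *	static void
 *	my_backend_done(void *ctx, int backend_rc)
 *	{
 *		struct spdk_bdev_io *bdev_io = ctx;
 *
 *		spdk_bdev_io_complete(bdev_io, backend_rc == 0 ?
 *				      SPDK_BDEV_IO_STATUS_SUCCESS :
 *				      SPDK_BDEV_IO_STATUS_FAILED);
 *	}
 */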
7141 
7142 void
7143 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
7144 				  enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
7145 {
7146 	enum spdk_bdev_io_status status;
7147 
7148 	if (sc == SPDK_SCSI_STATUS_GOOD) {
7149 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7150 	} else {
7151 		status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
7152 		bdev_io->internal.error.scsi.sc = sc;
7153 		bdev_io->internal.error.scsi.sk = sk;
7154 		bdev_io->internal.error.scsi.asc = asc;
7155 		bdev_io->internal.error.scsi.ascq = ascq;
7156 	}
7157 
7158 	spdk_bdev_io_complete(bdev_io, status);
7159 }
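
/*
 * Sketch: completing an I/O with an explicit SCSI check condition
 * (illustrative module-side code using constants from spdk/scsi_spec.h).
 *
 *	spdk_bdev_io_complete_scsi_status(bdev_io, SPDK_SCSI_STATUS_CHECK_CONDITION,
 *					  SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
 *					  SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
 *					  SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 */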
7160 
7161 void
7162 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
7163 			     int *sc, int *sk, int *asc, int *ascq)
7164 {
7165 	assert(sc != NULL);
7166 	assert(sk != NULL);
7167 	assert(asc != NULL);
7168 	assert(ascq != NULL);
7169 
7170 	switch (bdev_io->internal.status) {
7171 	case SPDK_BDEV_IO_STATUS_SUCCESS:
7172 		*sc = SPDK_SCSI_STATUS_GOOD;
7173 		*sk = SPDK_SCSI_SENSE_NO_SENSE;
7174 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7175 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7176 		break;
7177 	case SPDK_BDEV_IO_STATUS_NVME_ERROR:
7178 		spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
7179 		break;
7180 	case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
7181 		*sc = bdev_io->internal.error.scsi.sc;
7182 		*sk = bdev_io->internal.error.scsi.sk;
7183 		*asc = bdev_io->internal.error.scsi.asc;
7184 		*ascq = bdev_io->internal.error.scsi.ascq;
7185 		break;
7186 	default:
7187 		*sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
7188 		*sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
7189 		*asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
7190 		*ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
7191 		break;
7192 	}
7193 }
7194 
7195 void
7196 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result)
7197 {
7198 	enum spdk_bdev_io_status status;
7199 
7200 	if (aio_result == 0) {
7201 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7202 	} else {
7203 		status = SPDK_BDEV_IO_STATUS_AIO_ERROR;
7204 	}
7205 
7206 	bdev_io->internal.error.aio_result = aio_result;
7207 
7208 	spdk_bdev_io_complete(bdev_io, status);
7209 }
7210 
7211 void
7212 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result)
7213 {
7214 	assert(aio_result != NULL);
7215 
7216 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) {
7217 		*aio_result = bdev_io->internal.error.aio_result;
7218 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7219 		*aio_result = 0;
7220 	} else {
7221 		*aio_result = -EIO;
7222 	}
7223 }
7224 
7225 void
7226 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
7227 {
7228 	enum spdk_bdev_io_status status;
7229 
7230 	if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
7231 		status = SPDK_BDEV_IO_STATUS_SUCCESS;
7232 	} else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) {
7233 		status = SPDK_BDEV_IO_STATUS_ABORTED;
7234 	} else {
7235 		status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
7236 	}
7237 
7238 	bdev_io->internal.error.nvme.cdw0 = cdw0;
7239 	bdev_io->internal.error.nvme.sct = sct;
7240 	bdev_io->internal.error.nvme.sc = sc;
7241 
7242 	spdk_bdev_io_complete(bdev_io, status);
7243 }
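
/*
 * Sketch: an NVMe-backed module can pass its raw completion status through
 * (assumed module-side code; "cpl" is a hypothetical struct spdk_nvme_cpl
 * received from the drive).
 *
 *	spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0,
 *					  cpl->status.sct, cpl->status.sc);
 */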
7244 
7245 void
7246 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
7247 {
7248 	assert(sct != NULL);
7249 	assert(sc != NULL);
7250 	assert(cdw0 != NULL);
7251 
7252 	if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
7253 		*sct = SPDK_NVME_SCT_GENERIC;
7254 		*sc = SPDK_NVME_SC_SUCCESS;
7255 		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7256 			*cdw0 = 0;
7257 		} else {
7258 			*cdw0 = 1U;
7259 		}
7260 		return;
7261 	}
7262 
7263 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7264 		*sct = bdev_io->internal.error.nvme.sct;
7265 		*sc = bdev_io->internal.error.nvme.sc;
7266 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7267 		*sct = SPDK_NVME_SCT_GENERIC;
7268 		*sc = SPDK_NVME_SC_SUCCESS;
7269 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7270 		*sct = SPDK_NVME_SCT_GENERIC;
7271 		*sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7272 	} else {
7273 		*sct = SPDK_NVME_SCT_GENERIC;
7274 		*sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7275 	}
7276 
7277 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7278 }
7279 
7280 void
7281 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
7282 				   int *first_sct, int *first_sc, int *second_sct, int *second_sc)
7283 {
7284 	assert(first_sct != NULL);
7285 	assert(first_sc != NULL);
7286 	assert(second_sct != NULL);
7287 	assert(second_sc != NULL);
7288 	assert(cdw0 != NULL);
7289 
7290 	if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
7291 		if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
7292 		    bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
7293 			*first_sct = bdev_io->internal.error.nvme.sct;
7294 			*first_sc = bdev_io->internal.error.nvme.sc;
7295 			*second_sct = SPDK_NVME_SCT_GENERIC;
7296 			*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7297 		} else {
7298 			*first_sct = SPDK_NVME_SCT_GENERIC;
7299 			*first_sc = SPDK_NVME_SC_SUCCESS;
7300 			*second_sct = bdev_io->internal.error.nvme.sct;
7301 			*second_sc = bdev_io->internal.error.nvme.sc;
7302 		}
7303 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
7304 		*first_sct = SPDK_NVME_SCT_GENERIC;
7305 		*first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7306 		*second_sct = SPDK_NVME_SCT_GENERIC;
7307 		*second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
7308 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
7309 		*first_sct = SPDK_NVME_SCT_GENERIC;
7310 		*first_sc = SPDK_NVME_SC_SUCCESS;
7311 		*second_sct = SPDK_NVME_SCT_GENERIC;
7312 		*second_sc = SPDK_NVME_SC_SUCCESS;
7313 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
7314 		*first_sct = SPDK_NVME_SCT_GENERIC;
7315 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7316 		*second_sct = SPDK_NVME_SCT_GENERIC;
7317 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7318 	} else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
7319 		*first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
7320 		*first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
7321 		*second_sct = SPDK_NVME_SCT_GENERIC;
7322 		*second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
7323 	} else {
7324 		*first_sct = SPDK_NVME_SCT_GENERIC;
7325 		*first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7326 		*second_sct = SPDK_NVME_SCT_GENERIC;
7327 		*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
7328 	}
7329 
7330 	*cdw0 = bdev_io->internal.error.nvme.cdw0;
7331 }
7332 
7333 struct spdk_thread *
7334 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
7335 {
7336 	return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
7337 }
7338 
7339 struct spdk_io_channel *
7340 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
7341 {
7342 	return bdev_io->internal.ch->channel;
7343 }
7344 
7345 static int
7346 bdev_register(struct spdk_bdev *bdev)
7347 {
7348 	char *bdev_name;
7349 	char uuid[SPDK_UUID_STRING_LEN];
7350 	struct spdk_iobuf_opts iobuf_opts;
7351 	int ret, i;
7352 
7353 	assert(bdev->module != NULL);
7354 
7355 	if (!bdev->name) {
7356 		SPDK_ERRLOG("Bdev name is NULL\n");
7357 		return -EINVAL;
7358 	}
7359 
7360 	if (!strlen(bdev->name)) {
7361 		SPDK_ERRLOG("Bdev name must not be an empty string\n");
7362 		return -EINVAL;
7363 	}
7364 
7365 	for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
7366 		if (bdev->fn_table->accel_sequence_supported == NULL) {
7367 			continue;
7368 		}
7369 		if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt,
7370 				(enum spdk_bdev_io_type)i)) {
7371 			continue;
7372 		}
7373 
7374 		if (spdk_bdev_is_md_separate(bdev)) {
7375 			SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with "
7376 				    "accel sequence support\n");
7377 			return -EINVAL;
7378 		}
7379 	}
7380 
7381 	/* Users often register their own I/O devices using the bdev name. In
7382 	 * order to avoid conflicts, prepend bdev_. */
7383 	bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
7384 	if (!bdev_name) {
7385 		SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
7386 		return -ENOMEM;
7387 	}
7388 
7389 	bdev->internal.stat = bdev_alloc_io_stat(true);
7390 	if (!bdev->internal.stat) {
7391 		SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n");
7392 		free(bdev_name);
7393 		return -ENOMEM;
7394 	}
7395 
7396 	bdev->internal.status = SPDK_BDEV_STATUS_READY;
7397 	bdev->internal.measured_queue_depth = UINT64_MAX;
7398 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
7399 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
7400 	bdev->internal.qd_poller = NULL;
7401 	bdev->internal.qos = NULL;
7402 
7403 	TAILQ_INIT(&bdev->internal.open_descs);
7404 	TAILQ_INIT(&bdev->internal.locked_ranges);
7405 	TAILQ_INIT(&bdev->internal.pending_locked_ranges);
7406 	TAILQ_INIT(&bdev->aliases);
7407 
7408 	ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name);
7409 	if (ret != 0) {
7410 		bdev_free_io_stat(bdev->internal.stat);
7411 		free(bdev_name);
7412 		return ret;
7413 	}
7414 
7415 	/* The UUID may be specified by the user or defined by the bdev itself.
7416 	 * Otherwise it is generated here, so this field is never empty. */
7417 	if (spdk_uuid_is_null(&bdev->uuid)) {
7418 		spdk_uuid_generate(&bdev->uuid);
7419 	}
7420 
7421 	/* Add the UUID alias only if it is different from the name */
7422 	spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7423 	if (strcmp(bdev->name, uuid) != 0) {
7424 		ret = spdk_bdev_alias_add(bdev, uuid);
7425 		if (ret != 0) {
7426 			SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name);
7427 			bdev_name_del(&bdev->internal.bdev_name);
7428 			bdev_free_io_stat(bdev->internal.stat);
7429 			free(bdev_name);
7430 			return ret;
7431 		}
7432 	}
7433 
7434 	if (spdk_bdev_get_buf_align(bdev) > 1) {
7435 		if (bdev->split_on_optimal_io_boundary) {
7436 			bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary,
7437 							     SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen);
7438 		} else {
7439 			bdev->split_on_optimal_io_boundary = true;
7440 			bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen;
7441 		}
7442 	}
7443 
7444 	/* If the user didn't specify a write unit size, set it to one. */
7445 	if (bdev->write_unit_size == 0) {
7446 		bdev->write_unit_size = 1;
7447 	}
7448 
7449 	/* Set the ACWU value to the write unit size if the bdev module did not set it (i.e. does not support it natively) */
7450 	if (bdev->acwu == 0) {
7451 		bdev->acwu = bdev->write_unit_size;
7452 	}
7453 
7454 	if (bdev->phys_blocklen == 0) {
7455 		bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev);
7456 	}
7457 
7458 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
7459 		spdk_iobuf_get_opts(&iobuf_opts);
7460 		bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize);
7461 	}
7462 
7463 	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
7464 		bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE);
7465 	}
7466 
7467 	bdev->internal.reset_in_progress = NULL;
7468 	bdev->internal.qd_poll_in_progress = false;
7469 	bdev->internal.period = 0;
7470 	bdev->internal.new_period = 0;
7471 
7472 	spdk_io_device_register(__bdev_to_io_dev(bdev),
7473 				bdev_channel_create, bdev_channel_destroy,
7474 				sizeof(struct spdk_bdev_channel),
7475 				bdev_name);
7476 
7477 	free(bdev_name);
7478 
7479 	spdk_spin_init(&bdev->internal.spinlock);
7480 
7481 	SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name);
7482 	TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
7483 
7484 	return 0;
7485 }
7486 
7487 static void
7488 bdev_destroy_cb(void *io_device)
7489 {
7490 	int			rc;
7491 	struct spdk_bdev	*bdev;
7492 	spdk_bdev_unregister_cb	cb_fn;
7493 	void			*cb_arg;
7494 
7495 	bdev = __bdev_from_io_dev(io_device);
7496 
7497 	if (bdev->internal.unregister_td != spdk_get_thread()) {
7498 		spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device);
7499 		return;
7500 	}
7501 
7502 	cb_fn = bdev->internal.unregister_cb;
7503 	cb_arg = bdev->internal.unregister_ctx;
7504 
7505 	spdk_spin_destroy(&bdev->internal.spinlock);
7506 	free(bdev->internal.qos);
7507 	bdev_free_io_stat(bdev->internal.stat);
7508 
7509 	rc = bdev->fn_table->destruct(bdev->ctxt);
7510 	if (rc < 0) {
7511 		SPDK_ERRLOG("destruct failed\n");
7512 	}
7513 	if (rc <= 0 && cb_fn != NULL) {
7514 		cb_fn(cb_arg, rc);
7515 	}
7516 }
7517 
7518 void
7519 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
7520 {
7521 	if (bdev->internal.unregister_cb != NULL) {
7522 		bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
7523 	}
7524 }
7525 
7526 static void
7527 _remove_notify(void *arg)
7528 {
7529 	struct spdk_bdev_desc *desc = arg;
7530 
7531 	_event_notify(desc, SPDK_BDEV_EVENT_REMOVE);
7532 }
7533 
7534 /* returns: 0 - bdev removed and ready to be destructed.
7535  *          -EBUSY - bdev can't be destructed yet.  */
7536 static int
7537 bdev_unregister_unsafe(struct spdk_bdev *bdev)
7538 {
7539 	struct spdk_bdev_desc	*desc, *tmp;
7540 	int			rc = 0;
7541 	char			uuid[SPDK_UUID_STRING_LEN];
7542 
7543 	assert(spdk_spin_held(&g_bdev_mgr.spinlock));
7544 	assert(spdk_spin_held(&bdev->internal.spinlock));
7545 
7546 	/* Notify each descriptor about hot removal */
7547 	TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
7548 		rc = -EBUSY;
7549 		/*
7550 		 * Defer invocation of the event_cb to a separate message that will
7551 		 *  run later on its thread.  This ensures this context unwinds and
7552 		 *  we don't recursively unregister this bdev again if the event_cb
7553 		 *  immediately closes its descriptor.
7554 		 */
7555 		event_notify(desc, _remove_notify);
7556 	}
7557 
7558 	/* If there are no descriptors, proceed removing the bdev */
7559 	if (rc == 0) {
7560 		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
7561 		SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
7562 
7563 		/* Delete the name and the UUID alias */
7564 		spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7565 		bdev_name_del_unsafe(&bdev->internal.bdev_name);
7566 		bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
7567 
7568 		spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
7569 
7570 		if (bdev->internal.reset_in_progress != NULL) {
7571 			/* If reset is in progress, let the completion callback for reset
7572 			 * unregister the bdev.
7573 			 */
7574 			rc = -EBUSY;
7575 		}
7576 	}
7577 
7578 	return rc;
7579 }
7580 
7581 static void
7582 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7583 			      struct spdk_io_channel *io_ch, void *_ctx)
7584 {
7585 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
7586 
7587 	bdev_channel_abort_queued_ios(bdev_ch);
7588 	spdk_bdev_for_each_channel_continue(i, 0);
7589 }
7590 
7591 static void
7592 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
7593 {
7594 	int rc;
7595 
7596 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7597 	spdk_spin_lock(&bdev->internal.spinlock);
7598 	/*
7599 	 * Set the status to REMOVING only after aborting the channels has completed.
7600 	 * Otherwise, the last spdk_bdev_close() may call spdk_io_device_unregister()
7601 	 * while spdk_bdev_for_each_channel() is still executing, and
7602 	 * spdk_io_device_unregister() may fail.
7603 	 */
7604 	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
7605 	rc = bdev_unregister_unsafe(bdev);
7606 	spdk_spin_unlock(&bdev->internal.spinlock);
7607 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7608 
7609 	if (rc == 0) {
7610 		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7611 	}
7612 }
7613 
7614 void
7615 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
7616 {
7617 	struct spdk_thread	*thread;
7618 
7619 	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
7620 
7621 	thread = spdk_get_thread();
7622 	if (!thread) {
7623 		/* The user called this from a non-SPDK thread. */
7624 		if (cb_fn != NULL) {
7625 			cb_fn(cb_arg, -ENOTSUP);
7626 		}
7627 		return;
7628 	}
7629 
7630 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7631 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
7632 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
7633 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
7634 		if (cb_fn) {
7635 			cb_fn(cb_arg, -EBUSY);
7636 		}
7637 		return;
7638 	}
7639 
7640 	spdk_spin_lock(&bdev->internal.spinlock);
7641 	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
7642 	bdev->internal.unregister_cb = cb_fn;
7643 	bdev->internal.unregister_ctx = cb_arg;
7644 	bdev->internal.unregister_td = thread;
7645 	spdk_spin_unlock(&bdev->internal.spinlock);
7646 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7647 
7648 	spdk_bdev_set_qd_sampling_period(bdev, 0);
7649 
7650 	spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
7651 				   bdev_unregister);
7652 }
7653 
7654 int
7655 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
7656 			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
7657 {
7658 	struct spdk_bdev_desc *desc;
7659 	struct spdk_bdev *bdev;
7660 	int rc;
7661 
7662 	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
7663 	if (rc != 0) {
7664 		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
7665 		return rc;
7666 	}
7667 
7668 	bdev = spdk_bdev_desc_get_bdev(desc);
7669 
7670 	if (bdev->module != module) {
7671 		spdk_bdev_close(desc);
7672 		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
7673 			    bdev_name);
7674 		return -ENODEV;
7675 	}
7676 
7677 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
7678 
7679 	spdk_bdev_close(desc);
7680 
7681 	return 0;
7682 }
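
/*
 * Sketch: a module tearing down one of its own bdevs by name (illustrative;
 * "unregister_done" and "my_if" are hypothetical names for the completion
 * callback and the module's spdk_bdev_module).
 *
 *	rc = spdk_bdev_unregister_by_name("MyDisk0", &my_if, unregister_done, NULL);
 */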
7683 
7684 static int
7685 bdev_start_qos(struct spdk_bdev *bdev)
7686 {
7687 	struct set_qos_limit_ctx *ctx;
7688 
7689 	/* Enable QoS */
7690 	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
7691 		ctx = calloc(1, sizeof(*ctx));
7692 		if (ctx == NULL) {
7693 			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
7694 			return -ENOMEM;
7695 		}
7696 		ctx->bdev = bdev;
7697 		spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done);
7698 	}
7699 
7700 	return 0;
7701 }
7702 
7703 static void
7704 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail,
7705 		    struct spdk_bdev *bdev)
7706 {
7707 	enum spdk_bdev_claim_type type;
7708 	const char *typename, *modname;
7709 	extern struct spdk_log_flag SPDK_LOG_bdev;
7710 
7711 	assert(spdk_spin_held(&bdev->internal.spinlock));
7712 
7713 	if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) {
7714 		return;
7715 	}
7716 
7717 	type = bdev->internal.claim_type;
7718 	typename = spdk_bdev_claim_get_name(type);
7719 
7720 	if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) {
7721 		modname = bdev->internal.claim.v1.module->name;
7722 		spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
7723 			 bdev->name, detail, typename, modname);
7724 		return;
7725 	}
7726 
7727 	if (claim_type_is_v2(type)) {
7728 		struct spdk_bdev_module_claim *claim;
7729 
7730 		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
7731 			modname = claim->module->name;
7732 			spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n",
7733 				 bdev->name, detail, typename, modname);
7734 		}
7735 		return;
7736 	}
7737 
7738 	assert(false);
7739 }
7740 
7741 static int
7742 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
7743 {
7744 	struct spdk_thread *thread;
7745 	int rc = 0;
7746 
7747 	thread = spdk_get_thread();
7748 	if (!thread) {
7749 		SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
7750 		return -ENOTSUP;
7751 	}
7752 
7753 	SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
7754 		      spdk_get_thread());
7755 
7756 	desc->bdev = bdev;
7757 	desc->thread = thread;
7758 	desc->write = write;
7759 
7760 	spdk_spin_lock(&bdev->internal.spinlock);
7761 	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
7762 	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
7763 		spdk_spin_unlock(&bdev->internal.spinlock);
7764 		return -ENODEV;
7765 	}
7766 
7767 	if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
7768 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
7769 		spdk_spin_unlock(&bdev->internal.spinlock);
7770 		return -EPERM;
7771 	}
7772 
7773 	rc = bdev_start_qos(bdev);
7774 	if (rc != 0) {
7775 		SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
7776 		spdk_spin_unlock(&bdev->internal.spinlock);
7777 		return rc;
7778 	}
7779 
7780 	TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
7781 
7782 	spdk_spin_unlock(&bdev->internal.spinlock);
7783 
7784 	return 0;
7785 }
7786 
7787 static int
7788 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx,
7789 		struct spdk_bdev_desc **_desc)
7790 {
7791 	struct spdk_bdev_desc *desc;
7792 	unsigned int i;
7793 
7794 	desc = calloc(1, sizeof(*desc));
7795 	if (desc == NULL) {
7796 		SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
7797 		return -ENOMEM;
7798 	}
7799 
7800 	TAILQ_INIT(&desc->pending_media_events);
7801 	TAILQ_INIT(&desc->free_media_events);
7802 
7803 	desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0;
7804 	desc->callback.event_fn = event_cb;
7805 	desc->callback.ctx = event_ctx;
7806 	spdk_spin_init(&desc->spinlock);
7807 
7808 	if (bdev->media_events) {
7809 		desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
7810 						   sizeof(*desc->media_events_buffer));
7811 		if (desc->media_events_buffer == NULL) {
7812 			SPDK_ERRLOG("Failed to initialize media event pool\n");
7813 			bdev_desc_free(desc);
7814 			return -ENOMEM;
7815 		}
7816 
7817 		for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) {
7818 			TAILQ_INSERT_TAIL(&desc->free_media_events,
7819 					  &desc->media_events_buffer[i], tailq);
7820 		}
7821 	}
7822 
7823 	if (bdev->fn_table->accel_sequence_supported != NULL) {
7824 		for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
7825 			desc->accel_sequence_supported[i] =
7826 				bdev->fn_table->accel_sequence_supported(bdev->ctxt,
7827 						(enum spdk_bdev_io_type)i);
7828 		}
7829 	}
7830 
7831 	*_desc = desc;
7832 
7833 	return 0;
7834 }
7835 
7836 static int
7837 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
7838 	      void *event_ctx, struct spdk_bdev_desc **_desc)
7839 {
7840 	struct spdk_bdev_desc *desc;
7841 	struct spdk_bdev *bdev;
7842 	int rc;
7843 
7844 	bdev = bdev_get_by_name(bdev_name);
7845 
7846 	if (bdev == NULL) {
7847 		SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name);
7848 		return -ENODEV;
7849 	}
7850 
7851 	rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc);
7852 	if (rc != 0) {
7853 		return rc;
7854 	}
7855 
7856 	rc = bdev_open(bdev, write, desc);
7857 	if (rc != 0) {
7858 		bdev_desc_free(desc);
7859 		desc = NULL;
7860 	}
7861 
7862 	*_desc = desc;
7863 
7864 	return rc;
7865 }
7866 
7867 int
7868 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
7869 		   void *event_ctx, struct spdk_bdev_desc **_desc)
7870 {
7871 	int rc;
7872 
7873 	if (event_cb == NULL) {
7874 		SPDK_ERRLOG("Missing event callback function\n");
7875 		return -EINVAL;
7876 	}
7877 
7878 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7879 	rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc);
7880 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7881 
7882 	return rc;
7883 }
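
/*
 * Usage sketch for spdk_bdev_open_ext() (illustrative, not part of this
 * file). "bdev_event_cb" is a hypothetical event callback; a real
 * application should at least handle SPDK_BDEV_EVENT_REMOVE by closing the
 * descriptor on the thread that opened it.
 *
 *	static void
 *	bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
 *		      void *event_ctx)
 *	{
 *		if (type == SPDK_BDEV_EVENT_REMOVE) {
 *			// Close the descriptor from its opening thread.
 *		}
 *	}
 *
 *	rc = spdk_bdev_open_ext("Malloc0", true, bdev_event_cb, NULL, &desc);
 *	if (rc == 0) {
 *		io_ch = spdk_bdev_get_io_channel(desc);
 *	}
 */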
7884 
7885 struct spdk_bdev_open_async_ctx {
7886 	char					*bdev_name;
7887 	spdk_bdev_event_cb_t			event_cb;
7888 	void					*event_ctx;
7889 	bool					write;
7890 	int					rc;
7891 	spdk_bdev_open_async_cb_t		cb_fn;
7892 	void					*cb_arg;
7893 	struct spdk_bdev_desc			*desc;
7894 	struct spdk_bdev_open_async_opts	opts;
7895 	uint64_t				start_ticks;
7896 	struct spdk_thread			*orig_thread;
7897 	struct spdk_poller			*poller;
7898 	TAILQ_ENTRY(spdk_bdev_open_async_ctx)	tailq;
7899 };
7900 
7901 static void
7902 bdev_open_async_done(void *arg)
7903 {
7904 	struct spdk_bdev_open_async_ctx *ctx = arg;
7905 
7906 	ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg);
7907 
7908 	free(ctx->bdev_name);
7909 	free(ctx);
7910 }
7911 
7912 static void
7913 bdev_open_async_cancel(void *arg)
7914 {
7915 	struct spdk_bdev_open_async_ctx *ctx = arg;
7916 
7917 	assert(ctx->rc == -ESHUTDOWN);
7918 
7919 	spdk_poller_unregister(&ctx->poller);
7920 
7921 	bdev_open_async_done(ctx);
7922 }
7923 
7924 /* This is called when the bdev library finishes at shutdown. */
7925 static void
7926 bdev_open_async_fini(void)
7927 {
7928 	struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx;
7929 
7930 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7931 	TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) {
7932 		TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
7933 		/*
7934 		 * We have to move to ctx->orig_thread to unregister ctx->poller.
7935 		 * However, there is a chance that ctx->poller is executed before
7936 		 * However, there is a chance that ctx->poller is executed before the
7937 		 * message is executed, which could result in bdev_open_async_done()
7938 		 * being called twice. To avoid such a race condition, set ctx->rc to
7939 		 */
7940 		ctx->rc = -ESHUTDOWN;
7941 		spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx);
7942 	}
7943 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7944 }
7945 
7946 static int bdev_open_async(void *arg);
7947 
7948 static void
7949 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx)
7950 {
7951 	uint64_t timeout_ticks;
7952 
7953 	if (ctx->rc == -ESHUTDOWN) {
7954 		/* This context is being canceled. Do nothing. */
7955 		return;
7956 	}
7957 
7958 	ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx,
7959 				&ctx->desc);
7960 	if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) {
7961 		goto exit;
7962 	}
7963 
7964 	timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull;
7965 	if (spdk_get_ticks() >= timeout_ticks) {
7966 		SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name);
7967 		ctx->rc = -ETIMEDOUT;
7968 		goto exit;
7969 	}
7970 
7971 	return;
7972 
7973 exit:
7974 	spdk_poller_unregister(&ctx->poller);
7975 	TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
7976 
7977 	/* Completion callback is processed after stack unwinding. */
7978 	spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
7979 }
7980 
7981 static int
7982 bdev_open_async(void *arg)
7983 {
7984 	struct spdk_bdev_open_async_ctx *ctx = arg;
7985 
7986 	spdk_spin_lock(&g_bdev_mgr.spinlock);
7987 
7988 	_bdev_open_async(ctx);
7989 
7990 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
7991 
7992 	return SPDK_POLLER_BUSY;
7993 }
7994 
7995 static void
7996 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
7997 			  struct spdk_bdev_open_async_opts *opts_src,
7998 			  size_t size)
7999 {
8000 	assert(opts);
8001 	assert(opts_src);
8002 
8003 	opts->size = size;
8004 
8005 #define SET_FIELD(field) \
8006 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8007 		opts->field = opts_src->field; \
8008 	} \
8009 
8010 	SET_FIELD(timeout_ms);
8011 
8012 	/* Do not remove this statement. Always update it when adding a new field,
8013 	 * and do not forget to add a SET_FIELD statement for the new field. */
8014 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size");
8015 
8016 #undef SET_FIELD
8017 }
8018 
8019 static void
8020 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size)
8021 {
8022 	assert(opts);
8023 
8024 	opts->size = size;
8025 
8026 #define SET_FIELD(field, value) \
8027 	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8028 		opts->field = value; \
8029 	} \
8030 
8031 	SET_FIELD(timeout_ms, 0);
8032 
8033 #undef SET_FIELD
8034 }
8035 
8036 int
8037 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8038 		     void *event_ctx, struct spdk_bdev_open_async_opts *opts,
8039 		     spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg)
8040 {
8041 	struct spdk_bdev_open_async_ctx *ctx;
8042 
8043 	if (event_cb == NULL) {
8044 		SPDK_ERRLOG("Missing event callback function\n");
8045 		return -EINVAL;
8046 	}
8047 
8048 	if (open_cb == NULL) {
8049 		SPDK_ERRLOG("Missing open callback function\n");
8050 		return -EINVAL;
8051 	}
8052 
8053 	if (opts != NULL && opts->size == 0) {
8054 		SPDK_ERRLOG("size in the options structure should not be zero\n");
8055 		return -EINVAL;
8056 	}
8057 
8058 	ctx = calloc(1, sizeof(*ctx));
8059 	if (ctx == NULL) {
8060 		SPDK_ERRLOG("Failed to allocate open context\n");
8061 		return -ENOMEM;
8062 	}
8063 
8064 	ctx->bdev_name = strdup(bdev_name);
8065 	if (ctx->bdev_name == NULL) {
8066 		SPDK_ERRLOG("Failed to duplicate bdev_name\n");
8067 		free(ctx);
8068 		return -ENOMEM;
8069 	}
8070 
8071 	ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000);
8072 	if (ctx->poller == NULL) {
8073 		SPDK_ERRLOG("Failed to register bdev_open_async poller\n");
8074 		free(ctx->bdev_name);
8075 		free(ctx);
8076 		return -ENOMEM;
8077 	}
8078 
8079 	ctx->cb_fn = open_cb;
8080 	ctx->cb_arg = open_cb_arg;
8081 	ctx->write = write;
8082 	ctx->event_cb = event_cb;
8083 	ctx->event_ctx = event_ctx;
8084 	ctx->orig_thread = spdk_get_thread();
8085 	ctx->start_ticks = spdk_get_ticks();
8086 
8087 	bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts));
8088 	if (opts != NULL) {
8089 		bdev_open_async_opts_copy(&ctx->opts, opts, opts->size);
8090 	}
8091 
8092 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8093 
8094 	TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8095 	_bdev_open_async(ctx);
8096 
8097 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8098 
8099 	return 0;
8100 }
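
/*
 * Sketch of an asynchronous open that waits up to one second for the bdev to
 * appear (illustrative only; "open_done" is a hypothetical callback and
 * "bdev_event_cb" is the hypothetical event callback from the sketch above).
 *
 *	static void
 *	open_done(struct spdk_bdev_desc *desc, int rc, void *cb_arg)
 *	{
 *		if (rc == 0) {
 *			// desc is open and ready to use.
 *		}
 *	}
 *
 *	struct spdk_bdev_open_async_opts opts;
 *
 *	opts.size = sizeof(opts);
 *	opts.timeout_ms = 1000;
 *	spdk_bdev_open_async("Nvme0n1", true, bdev_event_cb, NULL, &opts,
 *			     open_done, NULL);
 */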
8101 
8102 static void
8103 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
8104 {
8105 	int rc;
8106 
8107 	spdk_spin_lock(&bdev->internal.spinlock);
8108 	spdk_spin_lock(&desc->spinlock);
8109 
8110 	TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
8111 
8112 	desc->closed = true;
8113 
8114 	if (desc->claim != NULL) {
8115 		bdev_desc_release_claims(desc);
8116 	}
8117 
8118 	if (0 == desc->refs) {
8119 		spdk_spin_unlock(&desc->spinlock);
8120 		bdev_desc_free(desc);
8121 	} else {
8122 		spdk_spin_unlock(&desc->spinlock);
8123 	}
8124 
8125 	/* If no more descriptors, kill QoS channel */
8126 	if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8127 		SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
8128 			      bdev->name, spdk_get_thread());
8129 
8130 		if (bdev_qos_destroy(bdev)) {
8131 			/* There isn't anything we can do to recover here. Just let the
8132 			 * old QoS poller keep running. The QoS handling won't change
8133 			 * cores when the user allocates a new channel, but it won't break. */
8134 			SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
8135 		}
8136 	}
8137 
8138 	if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
8139 		rc = bdev_unregister_unsafe(bdev);
8140 		spdk_spin_unlock(&bdev->internal.spinlock);
8141 
8142 		if (rc == 0) {
8143 			spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
8144 		}
8145 	} else {
8146 		spdk_spin_unlock(&bdev->internal.spinlock);
8147 	}
8148 }
8149 
8150 void
8151 spdk_bdev_close(struct spdk_bdev_desc *desc)
8152 {
8153 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8154 
8155 	SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
8156 		      spdk_get_thread());
8157 
8158 	assert(desc->thread == spdk_get_thread());
8159 
8160 	spdk_poller_unregister(&desc->io_timeout_poller);
8161 
8162 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8163 
8164 	bdev_close(bdev, desc);
8165 
8166 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8167 }
8168 
8169 static void
8170 bdev_register_finished(void *arg)
8171 {
8172 	struct spdk_bdev_desc *desc = arg;
8173 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
8174 
8175 	spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
8176 
8177 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8178 
8179 	bdev_close(bdev, desc);
8180 
8181 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8182 }
8183 
8184 int
8185 spdk_bdev_register(struct spdk_bdev *bdev)
8186 {
8187 	struct spdk_bdev_desc *desc;
8188 	struct spdk_thread *thread = spdk_get_thread();
8189 	int rc;
8190 
8191 	if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) {
8192 		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread,
8193 			    thread ? spdk_thread_get_name(thread) : "null");
8194 		return -EINVAL;
8195 	}
8196 
8197 	rc = bdev_register(bdev);
8198 	if (rc != 0) {
8199 		return rc;
8200 	}
8201 
8202 	/* A descriptor is opened to prevent bdev deletion during examination */
8203 	rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8204 	if (rc != 0) {
8205 		spdk_bdev_unregister(bdev, NULL, NULL);
8206 		return rc;
8207 	}
8208 
8209 	rc = bdev_open(bdev, false, desc);
8210 	if (rc != 0) {
8211 		bdev_desc_free(desc);
8212 		spdk_bdev_unregister(bdev, NULL, NULL);
8213 		return rc;
8214 	}
8215 
8216 	/* Examine configuration before initializing I/O */
8217 	bdev_examine(bdev);
8218 
8219 	rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc);
8220 	if (rc != 0) {
8221 		bdev_close(bdev, desc);
8222 		spdk_bdev_unregister(bdev, NULL, NULL);
8223 	}
8224 
8225 	return rc;
8226 }
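
/*
 * Minimal sketch of a module registering a bdev (assumed module-side code;
 * "my_fn_table", "my_if" and "my_ctx" are hypothetical names for the
 * function table, the spdk_bdev_module and the module context).
 *
 *	bdev->name = strdup("MyDisk0");
 *	bdev->product_name = "my_bdev";
 *	bdev->blocklen = 512;
 *	bdev->blockcnt = 8 * 1024 * 1024;
 *	bdev->fn_table = &my_fn_table;
 *	bdev->module = &my_if;
 *	bdev->ctxt = my_ctx;
 *
 *	rc = spdk_bdev_register(bdev);
 */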
8227 
8228 int
8229 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
8230 			    struct spdk_bdev_module *module)
8231 {
8232 	spdk_spin_lock(&bdev->internal.spinlock);
8233 
8234 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8235 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8236 		spdk_spin_unlock(&bdev->internal.spinlock);
8237 		return -EPERM;
8238 	}
8239 
8240 	if (desc && !desc->write) {
8241 		desc->write = true;
8242 	}
8243 
8244 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE;
8245 	bdev->internal.claim.v1.module = module;
8246 
8247 	spdk_spin_unlock(&bdev->internal.spinlock);
8248 	return 0;
8249 }
8250 
8251 void
8252 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
8253 {
8254 	spdk_spin_lock(&bdev->internal.spinlock);
8255 
8256 	assert(bdev->internal.claim.v1.module != NULL);
8257 	assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE);
8258 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8259 	bdev->internal.claim.v1.module = NULL;
8260 
8261 	spdk_spin_unlock(&bdev->internal.spinlock);
8262 }
8263 
8264 /*
8265  * Start claims v2
8266  */
8267 
8268 const char *
8269 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type)
8270 {
8271 	switch (type) {
8272 	case SPDK_BDEV_CLAIM_NONE:
8273 		return "not_claimed";
8274 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
8275 		return "exclusive_write";
8276 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8277 		return "read_many_write_one";
8278 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8279 		return "read_many_write_none";
8280 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8281 		return "read_many_write_many";
8282 	default:
8283 		break;
8284 	}
8285 	return "invalid_claim";
8286 }
8287 
8288 static bool
8289 claim_type_is_v2(enum spdk_bdev_claim_type type)
8290 {
8291 	switch (type) {
8292 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8293 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8294 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8295 		return true;
8296 	default:
8297 		break;
8298 	}
8299 	return false;
8300 }
8301 
8302 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */
8303 static bool
8304 claim_type_promotes_to_write(enum spdk_bdev_claim_type type)
8305 {
8306 	switch (type) {
8307 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8308 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8309 		return true;
8310 	default:
8311 		break;
8312 	}
8313 	return false;
8314 }
8315 
8316 void
8317 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size)
8318 {
8319 	if (opts == NULL) {
8320 		SPDK_ERRLOG("opts should not be NULL\n");
8321 		assert(opts != NULL);
8322 		return;
8323 	}
8324 	if (size == 0) {
8325 		SPDK_ERRLOG("size should not be zero\n");
8326 		assert(size != 0);
8327 		return;
8328 	}
8329 
8330 	memset(opts, 0, size);
8331 	opts->opts_size = size;
8332 
8333 #define FIELD_OK(field) \
8334         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size
8335 
8336 #define SET_FIELD(field, value) \
8337         if (FIELD_OK(field)) { \
8338                 opts->field = value; \
8339         } \
8340 
8341 	SET_FIELD(shared_claim_key, 0);
8342 
8343 #undef FIELD_OK
8344 #undef SET_FIELD
8345 }
8346 
8347 static int
8348 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst)
8349 {
8350 	if (src->opts_size == 0) {
8351 		SPDK_ERRLOG("size should not be zero\n");
8352 		return -1;
8353 	}
8354 
8355 	memset(dst, 0, sizeof(*dst));
8356 	dst->opts_size = src->opts_size;
8357 
8358 #define FIELD_OK(field) \
8359         offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size
8360 
8361 #define SET_FIELD(field) \
8362         if (FIELD_OK(field)) { \
8363                 dst->field = src->field; \
8364         } \
8365 
8366 	if (FIELD_OK(name)) {
8367 		snprintf(dst->name, sizeof(dst->name), "%s", src->name);
8368 	}
8369 
8370 	SET_FIELD(shared_claim_key);
8371 
8372 	/* Do not remove this statement. Update the assert statement whenever a new
8373 	 * field is added, and also add a corresponding SET_FIELD statement. */
8374 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size");
8375 
8376 #undef FIELD_OK
8377 #undef SET_FIELD
8378 	return 0;
8379 }
8380 
8381 /* Returns 0 if a read-write-once claim can be taken. */
8382 static int
8383 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8384 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8385 {
8386 	struct spdk_bdev *bdev = desc->bdev;
8387 	struct spdk_bdev_desc *open_desc;
8388 
8389 	assert(spdk_spin_held(&bdev->internal.spinlock));
8390 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);
8391 
8392 	if (opts->shared_claim_key != 0) {
8393 		SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
8394 			    bdev->name);
8395 		return -EINVAL;
8396 	}
8397 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
8398 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8399 		return -EPERM;
8400 	}
8401 	if (desc->claim != NULL) {
8402 		SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
8403 			       bdev->name, desc->claim->module->name);
8404 		return -EPERM;
8405 	}
8406 	TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8407 		if (desc != open_desc && open_desc->write) {
8408 			SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
8409 				       "another descriptor is open for writing\n",
8410 				       bdev->name);
8411 			return -EPERM;
8412 		}
8413 	}
8414 
8415 	return 0;
8416 }
8417 
8418 /* Returns 0 if a read-only-many claim can be taken. */
8419 static int
8420 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8421 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8422 {
8423 	struct spdk_bdev *bdev = desc->bdev;
8424 	struct spdk_bdev_desc *open_desc;
8425 
8426 	assert(spdk_spin_held(&bdev->internal.spinlock));
8427 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
8428 	assert(desc->claim == NULL);
8429 
8430 	if (desc->write) {
8431 		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
8432 			    bdev->name);
8433 		return -EINVAL;
8434 	}
8435 	if (opts->shared_claim_key != 0) {
8436 		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
8437 		return -EINVAL;
8438 	}
8439 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8440 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8441 			if (open_desc->write) {
8442 				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
8443 					       "another descriptor is open for writing\n",
8444 					       bdev->name);
8445 				return -EPERM;
8446 			}
8447 		}
8448 	}
8449 
8450 	return 0;
8451 }
8452 
8453 /* Returns 0 if a read-write-many claim can be taken. */
8454 static int
8455 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8456 		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8457 {
8458 	struct spdk_bdev *bdev = desc->bdev;
8459 	struct spdk_bdev_desc *open_desc;
8460 
8461 	assert(spdk_spin_held(&bdev->internal.spinlock));
8462 	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
8463 	assert(desc->claim == NULL);
8464 
8465 	if (opts->shared_claim_key == 0) {
8466 		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
8467 			    bdev->name);
8468 		return -EINVAL;
8469 	}
8470 	switch (bdev->internal.claim_type) {
8471 	case SPDK_BDEV_CLAIM_NONE:
8472 		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8473 			if (open_desc == desc) {
8474 				continue;
8475 			}
8476 			if (open_desc->write) {
8477 				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
8478 					       "another descriptor is open for writing without a "
8479 					       "claim\n", bdev->name);
8480 				return -EPERM;
8481 			}
8482 		}
8483 		break;
8484 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8485 		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
8486 			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
8487 			return -EPERM;
8488 		}
8489 		break;
8490 	default:
8491 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8492 		return -EBUSY;
8493 	}
8494 
8495 	return 0;
8496 }
8497 
8498 /* Updates desc and its bdev with a v2 claim. */
8499 static int
8500 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8501 	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8502 {
8503 	struct spdk_bdev *bdev = desc->bdev;
8504 	struct spdk_bdev_module_claim *claim;
8505 
8506 	assert(spdk_spin_held(&bdev->internal.spinlock));
8507 	assert(claim_type_is_v2(type));
8508 	assert(desc->claim == NULL);
8509 
8510 	claim = calloc(1, sizeof(*desc->claim));
8511 	if (claim == NULL) {
8512 		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
8513 		return -ENOMEM;
8514 	}
8515 	claim->module = module;
8516 	claim->desc = desc;
8517 	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
8518 	memcpy(claim->name, opts->name, sizeof(claim->name));
8519 	desc->claim = claim;
8520 
8521 	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8522 		bdev->internal.claim_type = type;
8523 		TAILQ_INIT(&bdev->internal.claim.v2.claims);
8524 		bdev->internal.claim.v2.key = opts->shared_claim_key;
8525 	}
8526 	assert(type == bdev->internal.claim_type);
8527 
8528 	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);
8529 
8530 	if (!desc->write && claim_type_promotes_to_write(type)) {
8531 		desc->write = true;
8532 	}
8533 
8534 	return 0;
8535 }
8536 
8537 int
8538 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8539 				 struct spdk_bdev_claim_opts *_opts,
8540 				 struct spdk_bdev_module *module)
8541 {
8542 	struct spdk_bdev *bdev;
8543 	struct spdk_bdev_claim_opts opts;
8544 	int rc = 0;
8545 
8546 	if (desc == NULL) {
8547 		SPDK_ERRLOG("descriptor must not be NULL\n");
8548 		return -EINVAL;
8549 	}
8550 
8551 	bdev = desc->bdev;
8552 
8553 	if (_opts == NULL) {
8554 		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
8555 	} else if (claim_opts_copy(_opts, &opts) != 0) {
8556 		return -EINVAL;
8557 	}
8558 
8559 	spdk_spin_lock(&bdev->internal.spinlock);
8560 
8561 	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
8562 	    bdev->internal.claim_type != type) {
8563 		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8564 		spdk_spin_unlock(&bdev->internal.spinlock);
8565 		return -EPERM;
8566 	}
8567 
8568 	if (claim_type_is_v2(type) && desc->claim != NULL) {
8569 		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
8570 			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
8571 		spdk_spin_unlock(&bdev->internal.spinlock);
8572 		return -EPERM;
8573 	}
8574 
8575 	switch (type) {
8576 	case SPDK_BDEV_CLAIM_EXCL_WRITE:
8577 		spdk_spin_unlock(&bdev->internal.spinlock);
8578 		return spdk_bdev_module_claim_bdev(bdev, desc, module);
8579 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
8580 		rc = claim_verify_rwo(desc, type, &opts, module);
8581 		break;
8582 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
8583 		rc = claim_verify_rom(desc, type, &opts, module);
8584 		break;
8585 	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8586 		rc = claim_verify_rwm(desc, type, &opts, module);
8587 		break;
8588 	default:
8589 		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
8590 		rc = -ENOTSUP;
8591 	}
8592 
8593 	if (rc == 0) {
8594 		rc = claim_bdev(desc, type, &opts, module);
8595 	}
8596 
8597 	spdk_spin_unlock(&bdev->internal.spinlock);
8598 	return rc;
8599 }
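
/*
 * Sketch of taking a v2 claim on an open descriptor (illustrative; "desc" is
 * assumed to come from spdk_bdev_open_ext() and "my_if" is the claiming
 * module's spdk_bdev_module).
 *
 *	struct spdk_bdev_claim_opts opts;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	snprintf(opts.name, sizeof(opts.name), "%s", "my_consumer");
 *	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *					      &opts, &my_if);
 */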
8600 
8601 static void
8602 claim_reset(struct spdk_bdev *bdev)
8603 {
8604 	assert(spdk_spin_held(&bdev->internal.spinlock));
8605 	assert(claim_type_is_v2(bdev->internal.claim_type));
8606 	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));
8607 
8608 	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
8609 	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
8610 }
8611 
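/*
 * Releases the v2 claim held by this descriptor.  Normally the claim is removed
 * from the bdev's claim list and freed immediately; if an examine is in progress,
 * the claim is only detached (module and desc pointers cleared) and left for the
 * examine completion path to clean up.
 */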
8612 static void
8613 bdev_desc_release_claims(struct spdk_bdev_desc *desc)
8614 {
8615 	struct spdk_bdev *bdev = desc->bdev;
8616 
8617 	assert(spdk_spin_held(&bdev->internal.spinlock));
8618 	assert(claim_type_is_v2(bdev->internal.claim_type));
8619 
8620 	if (bdev->internal.examine_in_progress == 0) {
8621 		TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
8622 		free(desc->claim);
8623 		if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
8624 			claim_reset(bdev);
8625 		}
8626 	} else {
8627 		/* This is a dead claim that will be cleaned up when bdev_examine() is done. */
8628 		desc->claim->module = NULL;
8629 		desc->claim->desc = NULL;
8630 	}
8631 	desc->claim = NULL;
8632 }
8633 
8634 /*
8635  * End claims v2
8636  */
8637 
8638 struct spdk_bdev *
8639 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
8640 {
8641 	assert(desc != NULL);
8642 	return desc->bdev;
8643 }
8644 
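/*
 * Iterate over all registered bdevs, calling fn() once for each.  Every bdev is
 * temporarily opened read-only around the callback so it cannot be unregistered
 * while fn() runs, and the callback is invoked with the global bdev manager lock
 * released.  Bdevs whose open fails with -ENODEV are skipped; a non-zero return
 * from fn() stops the iteration and is propagated to the caller.
 *
 * Minimal usage sketch (print_bdev_name is a hypothetical caller-defined callback):
 *
 *   static int
 *   print_bdev_name(void *ctx, struct spdk_bdev *bdev)
 *   {
 *           printf("%s\n", spdk_bdev_get_name(bdev));
 *           return 0;
 *   }
 *
 *   spdk_for_each_bdev(NULL, print_bdev_name);
 */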
8645 int
8646 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
8647 {
8648 	struct spdk_bdev *bdev, *tmp;
8649 	struct spdk_bdev_desc *desc;
8650 	int rc = 0;
8651 
8652 	assert(fn != NULL);
8653 
8654 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8655 	bdev = spdk_bdev_first();
8656 	while (bdev != NULL) {
8657 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8658 		if (rc != 0) {
8659 			break;
8660 		}
8661 		rc = bdev_open(bdev, false, desc);
8662 		if (rc != 0) {
8663 			bdev_desc_free(desc);
8664 			if (rc == -ENODEV) {
8665 				/* Ignore the error and move to the next bdev. */
8666 				rc = 0;
8667 				bdev = spdk_bdev_next(bdev);
8668 				continue;
8669 			}
8670 			break;
8671 		}
8672 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8673 
8674 		rc = fn(ctx, bdev);
8675 
8676 		spdk_spin_lock(&g_bdev_mgr.spinlock);
8677 		tmp = spdk_bdev_next(bdev);
8678 		bdev_close(bdev, desc);
8679 		if (rc != 0) {
8680 			break;
8681 		}
8682 		bdev = tmp;
8683 	}
8684 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8685 
8686 	return rc;
8687 }
8688 
8689 int
8690 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
8691 {
8692 	struct spdk_bdev *bdev, *tmp;
8693 	struct spdk_bdev_desc *desc;
8694 	int rc = 0;
8695 
8696 	assert(fn != NULL);
8697 
8698 	spdk_spin_lock(&g_bdev_mgr.spinlock);
8699 	bdev = spdk_bdev_first_leaf();
8700 	while (bdev != NULL) {
8701 		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
8702 		if (rc != 0) {
8703 			break;
8704 		}
8705 		rc = bdev_open(bdev, false, desc);
8706 		if (rc != 0) {
8707 			bdev_desc_free(desc);
8708 			if (rc == -ENODEV) {
8709 				/* Ignore the error and move to the next bdev. */
8710 				rc = 0;
8711 				bdev = spdk_bdev_next_leaf(bdev);
8712 				continue;
8713 			}
8714 			break;
8715 		}
8716 		spdk_spin_unlock(&g_bdev_mgr.spinlock);
8717 
8718 		rc = fn(ctx, bdev);
8719 
8720 		spdk_spin_lock(&g_bdev_mgr.spinlock);
8721 		tmp = spdk_bdev_next_leaf(bdev);
8722 		bdev_close(bdev, desc);
8723 		if (rc != 0) {
8724 			break;
8725 		}
8726 		bdev = tmp;
8727 	}
8728 	spdk_spin_unlock(&g_bdev_mgr.spinlock);
8729 
8730 	return rc;
8731 }
8732 
8733 void
8734 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
8735 {
8736 	struct iovec *iovs;
8737 	int iovcnt;
8738 
8739 	if (bdev_io == NULL) {
8740 		return;
8741 	}
8742 
8743 	switch (bdev_io->type) {
8744 	case SPDK_BDEV_IO_TYPE_READ:
8745 	case SPDK_BDEV_IO_TYPE_WRITE:
8746 	case SPDK_BDEV_IO_TYPE_ZCOPY:
8747 		iovs = bdev_io->u.bdev.iovs;
8748 		iovcnt = bdev_io->u.bdev.iovcnt;
8749 		break;
8750 	default:
8751 		iovs = NULL;
8752 		iovcnt = 0;
8753 		break;
8754 	}
8755 
8756 	if (iovp) {
8757 		*iovp = iovs;
8758 	}
8759 	if (iovcntp) {
8760 		*iovcntp = iovcnt;
8761 	}
8762 }
8763 
8764 void *
8765 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
8766 {
8767 	if (bdev_io == NULL) {
8768 		return NULL;
8769 	}
8770 
8771 	if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
8772 		return NULL;
8773 	}
8774 
8775 	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
8776 	    bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
8777 		return bdev_io->u.bdev.md_buf;
8778 	}
8779 
8780 	return NULL;
8781 }
8782 
8783 void *
8784 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
8785 {
8786 	if (bdev_io == NULL) {
8787 		assert(false);
8788 		return NULL;
8789 	}
8790 
8791 	return bdev_io->internal.caller_ctx;
8792 }
8793 
8794 void
8795 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
8796 {
8797 
8798 	if (spdk_bdev_module_list_find(bdev_module->name)) {
8799 		SPDK_ERRLOG("module '%s' already registered.\n", bdev_module->name);
8800 		assert(false);
8801 	}
8802 
8803 	spdk_spin_init(&bdev_module->internal.spinlock);
8804 	TAILQ_INIT(&bdev_module->internal.quiesced_ranges);
8805 
8806 	/*
8807 	 * Modules with examine callbacks must be initialized first, so they are
8808 	 *  ready to handle examine callbacks from later modules that will
8809 	 *  register physical bdevs.
8810 	 */
8811 	if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
8812 		TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
8813 	} else {
8814 		TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
8815 	}
8816 }
8817 
8818 struct spdk_bdev_module *
8819 spdk_bdev_module_list_find(const char *name)
8820 {
8821 	struct spdk_bdev_module *bdev_module;
8822 
8823 	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
8824 		if (strcmp(name, bdev_module->name) == 0) {
8825 			break;
8826 		}
8827 	}
8828 
8829 	return bdev_module;
8830 }
8831 
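/*
 * Emulation helper for WRITE ZEROES: issues a regular write sourced from the
 * shared zero buffer.  When the bdev uses separate metadata, the metadata pointer
 * is placed just past the data region of the same zero buffer, so the metadata
 * written is zeroed as well.
 */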
8832 static int
8833 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io)
8834 {
8835 	uint64_t num_blocks;
8836 	void *md_buf = NULL;
8837 
8838 	num_blocks = bdev_io->u.bdev.num_blocks;
8839 
8840 	if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
8841 		md_buf = (char *)g_bdev_mgr.zero_buffer +
8842 			 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
8843 	}
8844 
8845 	return bdev_write_blocks_with_md(bdev_io->internal.desc,
8846 					 spdk_io_channel_from_ctx(bdev_io->internal.ch),
8847 					 g_bdev_mgr.zero_buffer, md_buf,
8848 					 bdev_io->u.bdev.offset_blocks, num_blocks,
8849 					 bdev_write_zero_buffer_done, bdev_io);
8850 }
8851 
8852 static void
8853 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
8854 {
8855 	struct spdk_bdev_io *parent_io = cb_arg;
8856 
8857 	spdk_bdev_free_io(bdev_io);
8858 
8859 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
8860 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
8861 }
8862 
8863 static void
8864 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
8865 {
8866 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
8867 	ctx->bdev->internal.qos_mod_in_progress = false;
8868 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
8869 
8870 	if (ctx->cb_fn) {
8871 		ctx->cb_fn(ctx->cb_arg, status);
8872 	}
8873 	free(ctx);
8874 }
8875 
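/*
 * Final step of disabling QoS; runs on the QoS thread, or directly if QoS never
 * had a thread assigned.  Detaches the QoS object from the bdev, sends any I/O
 * still queued by QoS back to their original threads for resubmission, releases
 * the QoS channel and poller, and completes the limit-change operation.
 */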
8876 static void
8877 bdev_disable_qos_done(void *cb_arg)
8878 {
8879 	struct set_qos_limit_ctx *ctx = cb_arg;
8880 	struct spdk_bdev *bdev = ctx->bdev;
8881 	struct spdk_bdev_io *bdev_io;
8882 	struct spdk_bdev_qos *qos;
8883 
8884 	spdk_spin_lock(&bdev->internal.spinlock);
8885 	qos = bdev->internal.qos;
8886 	bdev->internal.qos = NULL;
8887 	spdk_spin_unlock(&bdev->internal.spinlock);
8888 
8889 	while (!TAILQ_EMPTY(&qos->queued)) {
8890 		/* Send queued I/O back to their original thread for resubmission. */
8891 		bdev_io = TAILQ_FIRST(&qos->queued);
8892 		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
8893 
8894 		if (bdev_io->internal.io_submit_ch) {
8895 			/*
8896 			 * Channel was changed when sending it to the QoS thread - change it back
8897 			 *  before sending it back to the original thread.
8898 			 */
8899 			bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
8900 			bdev_io->internal.io_submit_ch = NULL;
8901 		}
8902 
8903 		spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
8904 				     _bdev_io_submit, bdev_io);
8905 	}
8906 
8907 	if (qos->thread != NULL) {
8908 		spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
8909 		spdk_poller_unregister(&qos->poller);
8910 	}
8911 
8912 	free(qos);
8913 
8914 	bdev_set_qos_limit_done(ctx, 0);
8915 }
8916 
8917 static void
8918 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status)
8919 {
8920 	struct set_qos_limit_ctx *ctx = _ctx;
8921 	struct spdk_thread *thread;
8922 
8923 	spdk_spin_lock(&bdev->internal.spinlock);
8924 	thread = bdev->internal.qos->thread;
8925 	spdk_spin_unlock(&bdev->internal.spinlock);
8926 
8927 	if (thread != NULL) {
8928 		spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
8929 	} else {
8930 		bdev_disable_qos_done(ctx);
8931 	}
8932 }
8933 
8934 static void
8935 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8936 		     struct spdk_io_channel *ch, void *_ctx)
8937 {
8938 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
8939 
8940 	bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
8941 
8942 	spdk_bdev_for_each_channel_continue(i, 0);
8943 }
8944 
8945 static void
8946 bdev_update_qos_rate_limit_msg(void *cb_arg)
8947 {
8948 	struct set_qos_limit_ctx *ctx = cb_arg;
8949 	struct spdk_bdev *bdev = ctx->bdev;
8950 
8951 	spdk_spin_lock(&bdev->internal.spinlock);
8952 	bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
8953 	spdk_spin_unlock(&bdev->internal.spinlock);
8954 
8955 	bdev_set_qos_limit_done(ctx, 0);
8956 }
8957 
8958 static void
8959 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
8960 		    struct spdk_io_channel *ch, void *_ctx)
8961 {
8962 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
8963 
8964 	spdk_spin_lock(&bdev->internal.spinlock);
8965 	bdev_enable_qos(bdev, bdev_ch);
8966 	spdk_spin_unlock(&bdev->internal.spinlock);
8967 	spdk_bdev_for_each_channel_continue(i, 0);
8968 }
8969 
8970 static void
8971 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status)
8972 {
8973 	struct set_qos_limit_ctx *ctx = _ctx;
8974 
8975 	bdev_set_qos_limit_done(ctx, status);
8976 }
8977 
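/*
 * Copies the caller-supplied limits into the bdev's QoS object.  A limit of 0
 * means "disable this limit" and is stored as SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
 * limits passed as SPDK_BDEV_QOS_LIMIT_NOT_DEFINED are left unchanged.
 */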
8978 static void
8979 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
8980 {
8981 	int i;
8982 
8983 	assert(bdev->internal.qos != NULL);
8984 
8985 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
8986 		if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
8987 			bdev->internal.qos->rate_limits[i].limit = limits[i];
8988 
8989 			if (limits[i] == 0) {
8990 				bdev->internal.qos->rate_limits[i].limit =
8991 					SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
8992 			}
8993 		}
8994 	}
8995 }
8996 
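/*
 * Applies a new set of QoS rate limits to the bdev.  Byte limits are supplied in
 * MB/s and converted to bytes/s here, and any limit that is not a multiple of the
 * minimum granularity is rounded up.  Depending on the current state this either
 * enables QoS on every channel, updates the limits on the existing QoS thread, or
 * disables QoS entirely.
 */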
8997 void
8998 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
8999 			      void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
9000 {
9001 	struct set_qos_limit_ctx	*ctx;
9002 	uint32_t			limit_set_complement;
9003 	uint64_t			min_limit_per_sec;
9004 	int				i;
9005 	bool				disable_rate_limit = true;
9006 
9007 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9008 		if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
9009 			continue;
9010 		}
9011 
9012 		if (limits[i] > 0) {
9013 			disable_rate_limit = false;
9014 		}
9015 
9016 		if (bdev_qos_is_iops_rate_limit(i) == true) {
9017 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
9018 		} else {
9019 			/* Change from megabyte to byte rate limit */
9020 			limits[i] = limits[i] * 1024 * 1024;
9021 			min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
9022 		}
9023 
9024 		limit_set_complement = limits[i] % min_limit_per_sec;
9025 		if (limit_set_complement) {
9026 			SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
9027 				    limits[i], min_limit_per_sec);
9028 			limits[i] += min_limit_per_sec - limit_set_complement;
9029 			SPDK_ERRLOG("Rounding up the rate limit to %" PRIu64 "\n", limits[i]);
9030 		}
9031 	}
9032 
9033 	ctx = calloc(1, sizeof(*ctx));
9034 	if (ctx == NULL) {
9035 		cb_fn(cb_arg, -ENOMEM);
9036 		return;
9037 	}
9038 
9039 	ctx->cb_fn = cb_fn;
9040 	ctx->cb_arg = cb_arg;
9041 	ctx->bdev = bdev;
9042 
9043 	spdk_spin_lock(&bdev->internal.spinlock);
9044 	if (bdev->internal.qos_mod_in_progress) {
9045 		spdk_spin_unlock(&bdev->internal.spinlock);
9046 		free(ctx);
9047 		cb_fn(cb_arg, -EAGAIN);
9048 		return;
9049 	}
9050 	bdev->internal.qos_mod_in_progress = true;
9051 
9052 	if (disable_rate_limit == true && bdev->internal.qos) {
9053 		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
9054 			if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
9055 			    (bdev->internal.qos->rate_limits[i].limit > 0 &&
9056 			     bdev->internal.qos->rate_limits[i].limit !=
9057 			     SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
9058 				disable_rate_limit = false;
9059 				break;
9060 			}
9061 		}
9062 	}
9063 
9064 	if (disable_rate_limit == false) {
9065 		if (bdev->internal.qos == NULL) {
9066 			bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
9067 			if (!bdev->internal.qos) {
9068 				spdk_spin_unlock(&bdev->internal.spinlock);
9069 				SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
9070 				bdev_set_qos_limit_done(ctx, -ENOMEM);
9071 				return;
9072 			}
9073 		}
9074 
9075 		if (bdev->internal.qos->thread == NULL) {
9076 			/* Enabling */
9077 			bdev_set_qos_rate_limits(bdev, limits);
9078 
9079 			spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx,
9080 						   bdev_enable_qos_done);
9081 		} else {
9082 			/* Updating */
9083 			bdev_set_qos_rate_limits(bdev, limits);
9084 
9085 			spdk_thread_send_msg(bdev->internal.qos->thread,
9086 					     bdev_update_qos_rate_limit_msg, ctx);
9087 		}
9088 	} else {
9089 		if (bdev->internal.qos != NULL) {
9090 			bdev_set_qos_rate_limits(bdev, limits);
9091 
9092 			/* Disabling */
9093 			spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx,
9094 						   bdev_disable_qos_msg_done);
9095 		} else {
9096 			spdk_spin_unlock(&bdev->internal.spinlock);
9097 			bdev_set_qos_limit_done(ctx, 0);
9098 			return;
9099 		}
9100 	}
9101 
9102 	spdk_spin_unlock(&bdev->internal.spinlock);
9103 }
9104 
9105 struct spdk_bdev_histogram_ctx {
9106 	spdk_bdev_histogram_status_cb cb_fn;
9107 	void *cb_arg;
9108 	struct spdk_bdev *bdev;
9109 	int status;
9110 };
9111 
9112 static void
9113 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9114 {
9115 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
9116 
9117 	spdk_spin_lock(&ctx->bdev->internal.spinlock);
9118 	ctx->bdev->internal.histogram_in_progress = false;
9119 	spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9120 	ctx->cb_fn(ctx->cb_arg, ctx->status);
9121 	free(ctx);
9122 }
9123 
9124 static void
9125 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9126 			       struct spdk_io_channel *_ch, void *_ctx)
9127 {
9128 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9129 
9130 	if (ch->histogram != NULL) {
9131 		spdk_histogram_data_free(ch->histogram);
9132 		ch->histogram = NULL;
9133 	}
9134 	spdk_bdev_for_each_channel_continue(i, 0);
9135 }
9136 
9137 static void
9138 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9139 {
9140 	struct spdk_bdev_histogram_ctx *ctx = _ctx;
9141 
9142 	if (status != 0) {
9143 		ctx->status = status;
9144 		ctx->bdev->internal.histogram_enabled = false;
9145 		spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx,
9146 					   bdev_histogram_disable_channel_cb);
9147 	} else {
9148 		spdk_spin_lock(&ctx->bdev->internal.spinlock);
9149 		ctx->bdev->internal.histogram_in_progress = false;
9150 		spdk_spin_unlock(&ctx->bdev->internal.spinlock);
9151 		ctx->cb_fn(ctx->cb_arg, ctx->status);
9152 		free(ctx);
9153 	}
9154 }
9155 
9156 static void
9157 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9158 			      struct spdk_io_channel *_ch, void *_ctx)
9159 {
9160 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9161 	int status = 0;
9162 
9163 	if (ch->histogram == NULL) {
9164 		ch->histogram = spdk_histogram_data_alloc();
9165 		if (ch->histogram == NULL) {
9166 			status = -ENOMEM;
9167 		}
9168 	}
9169 
9170 	spdk_bdev_for_each_channel_continue(i, status);
9171 }
9172 
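/*
 * Enables or disables latency histograms on the bdev.  Enabling allocates a
 * histogram on every channel; if any allocation fails, the operation rolls back
 * by freeing the histograms on all channels and reports the failure to the
 * caller.  Only one enable/disable operation may be in progress at a time
 * (-EAGAIN otherwise).
 */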
9173 void
9174 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
9175 			   void *cb_arg, bool enable)
9176 {
9177 	struct spdk_bdev_histogram_ctx *ctx;
9178 
9179 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
9180 	if (ctx == NULL) {
9181 		cb_fn(cb_arg, -ENOMEM);
9182 		return;
9183 	}
9184 
9185 	ctx->bdev = bdev;
9186 	ctx->status = 0;
9187 	ctx->cb_fn = cb_fn;
9188 	ctx->cb_arg = cb_arg;
9189 
9190 	spdk_spin_lock(&bdev->internal.spinlock);
9191 	if (bdev->internal.histogram_in_progress) {
9192 		spdk_spin_unlock(&bdev->internal.spinlock);
9193 		free(ctx);
9194 		cb_fn(cb_arg, -EAGAIN);
9195 		return;
9196 	}
9197 
9198 	bdev->internal.histogram_in_progress = true;
9199 	spdk_spin_unlock(&bdev->internal.spinlock);
9200 
9201 	bdev->internal.histogram_enabled = enable;
9202 
9203 	if (enable) {
9204 		/* Allocate histogram for each channel */
9205 		spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx,
9206 					   bdev_histogram_enable_channel_cb);
9207 	} else {
9208 		spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx,
9209 					   bdev_histogram_disable_channel_cb);
9210 	}
9211 }
9212 
9213 struct spdk_bdev_histogram_data_ctx {
9214 	spdk_bdev_histogram_data_cb cb_fn;
9215 	void *cb_arg;
9216 	struct spdk_bdev *bdev;
9217 	/** merged histogram data from all channels */
9218 	struct spdk_histogram_data	*histogram;
9219 };
9220 
9221 static void
9222 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9223 {
9224 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
9225 
9226 	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
9227 	free(ctx);
9228 }
9229 
9230 static void
9231 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9232 			   struct spdk_io_channel *_ch, void *_ctx)
9233 {
9234 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9235 	struct spdk_bdev_histogram_data_ctx *ctx = _ctx;
9236 	int status = 0;
9237 
9238 	if (ch->histogram == NULL) {
9239 		status = -EFAULT;
9240 	} else {
9241 		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
9242 	}
9243 
9244 	spdk_bdev_for_each_channel_continue(i, status);
9245 }
9246 
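/*
 * Collects histogram data from the bdev by merging each channel's histogram into
 * the caller-provided structure.  If any channel has no histogram allocated
 * (e.g. histograms were never enabled), the callback is invoked with -EFAULT.
 */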
9247 void
9248 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
9249 			spdk_bdev_histogram_data_cb cb_fn,
9250 			void *cb_arg)
9251 {
9252 	struct spdk_bdev_histogram_data_ctx *ctx;
9253 
9254 	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
9255 	if (ctx == NULL) {
9256 		cb_fn(cb_arg, -ENOMEM, NULL);
9257 		return;
9258 	}
9259 
9260 	ctx->bdev = bdev;
9261 	ctx->cb_fn = cb_fn;
9262 	ctx->cb_arg = cb_arg;
9263 
9264 	ctx->histogram = histogram;
9265 
9266 	spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx,
9267 				   bdev_histogram_get_channel_cb);
9268 }
9269 
9270 void
9271 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn,
9272 				void *cb_arg)
9273 {
9274 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);
9275 	int status = 0;
9276 
9277 	assert(cb_fn != NULL);
9278 
9279 	if (bdev_ch->histogram == NULL) {
9280 		status = -EFAULT;
9281 	}
9282 	cb_fn(cb_arg, status, bdev_ch->histogram);
9283 }
9284 
9285 size_t
9286 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
9287 			   size_t max_events)
9288 {
9289 	struct media_event_entry *entry;
9290 	size_t num_events = 0;
9291 
9292 	for (; num_events < max_events; ++num_events) {
9293 		entry = TAILQ_FIRST(&desc->pending_media_events);
9294 		if (entry == NULL) {
9295 			break;
9296 		}
9297 
9298 		events[num_events] = entry->event;
9299 		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
9300 		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
9301 	}
9302 
9303 	return num_events;
9304 }
9305 
9306 int
9307 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
9308 			    size_t num_events)
9309 {
9310 	struct spdk_bdev_desc *desc;
9311 	struct media_event_entry *entry;
9312 	size_t event_id;
9313 	int rc = 0;
9314 
9315 	assert(bdev->media_events);
9316 
9317 	spdk_spin_lock(&bdev->internal.spinlock);
9318 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9319 		if (desc->write) {
9320 			break;
9321 		}
9322 	}
9323 
9324 	if (desc == NULL || desc->media_events_buffer == NULL) {
9325 		rc = -ENODEV;
9326 		goto out;
9327 	}
9328 
9329 	for (event_id = 0; event_id < num_events; ++event_id) {
9330 		entry = TAILQ_FIRST(&desc->free_media_events);
9331 		if (entry == NULL) {
9332 			break;
9333 		}
9334 
9335 		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
9336 		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
9337 		entry->event = events[event_id];
9338 	}
9339 
9340 	rc = event_id;
9341 out:
9342 	spdk_spin_unlock(&bdev->internal.spinlock);
9343 	return rc;
9344 }
9345 
9346 static void
9347 _media_management_notify(void *arg)
9348 {
9349 	struct spdk_bdev_desc *desc = arg;
9350 
9351 	_event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT);
9352 }
9353 
9354 void
9355 spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
9356 {
9357 	struct spdk_bdev_desc *desc;
9358 
9359 	spdk_spin_lock(&bdev->internal.spinlock);
9360 	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
9361 		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
9362 			event_notify(desc, _media_management_notify);
9363 		}
9364 	}
9365 	spdk_spin_unlock(&bdev->internal.spinlock);
9366 }
9367 
9368 struct locked_lba_range_ctx {
9369 	struct lba_range		range;
9370 	struct lba_range		*current_range;
9371 	struct lba_range		*owner_range;
9372 	struct spdk_poller		*poller;
9373 	lock_range_cb			cb_fn;
9374 	void				*cb_arg;
9375 };
9376 
9377 static void
9378 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9379 {
9380 	struct locked_lba_range_ctx *ctx = _ctx;
9381 
9382 	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
9383 	free(ctx);
9384 }
9385 
9386 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
9387 		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);
9388 
9389 static void
9390 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9391 {
9392 	struct locked_lba_range_ctx *ctx = _ctx;
9393 
9394 	if (status == -ENOMEM) {
9395 		/* One of the channels could not allocate a range object.
9396 		 * So we have to go back and clean up any ranges that were
9397 		 * allocated successfully before we return error status to
9398 		 * the caller.  We can reuse the unlock function to do that
9399 		 * clean up.
9400 		 */
9401 		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9402 					   bdev_lock_error_cleanup_cb);
9403 		return;
9404 	}
9405 
9406 	/* All channels have locked this range and no I/O overlapping the range
9407 	 * is outstanding.  Set the owner_ch for the range object for the
9408 	 * locking channel, so that this channel will know that it is allowed
9409 	 * to write to this range.
9410 	 */
9411 	if (ctx->owner_range != NULL) {
9412 		ctx->owner_range->owner_ch = ctx->range.owner_ch;
9413 	}
9414 
9415 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9416 
9417 	/* Don't free the ctx here.  Its range is in the bdev's global list of
9418 	 * locked ranges still, and will be removed and freed when this range
9419 	 * is later unlocked.
9420 	 */
9421 }
9422 
9423 static int
9424 bdev_lock_lba_range_check_io(void *_i)
9425 {
9426 	struct spdk_bdev_channel_iter *i = _i;
9427 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
9428 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9429 	struct locked_lba_range_ctx *ctx = i->ctx;
9430 	struct lba_range *range = ctx->current_range;
9431 	struct spdk_bdev_io *bdev_io;
9432 
9433 	spdk_poller_unregister(&ctx->poller);
9434 
9435 	/* The range is now in the locked_ranges, so no new I/O can be submitted to this
9436 	 * range.  But we need to wait until all outstanding I/O overlapping with this range
9437 	 * have completed.
9438 	 */
9439 	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
9440 		if (bdev_io_range_is_locked(bdev_io, range)) {
9441 			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
9442 			return SPDK_POLLER_BUSY;
9443 		}
9444 	}
9445 
9446 	spdk_bdev_for_each_channel_continue(i, 0);
9447 	return SPDK_POLLER_BUSY;
9448 }
9449 
9450 static void
9451 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9452 				struct spdk_io_channel *_ch, void *_ctx)
9453 {
9454 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9455 	struct locked_lba_range_ctx *ctx = _ctx;
9456 	struct lba_range *range;
9457 
9458 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9459 		if (range->length == ctx->range.length &&
9460 		    range->offset == ctx->range.offset &&
9461 		    range->locked_ctx == ctx->range.locked_ctx) {
9462 			/* This range already exists on this channel, so don't add
9463 			 * it again.  This can happen when a new channel is created
9464 			 * while the for_each_channel operation is in progress.
9465 			 * Do not check for outstanding I/O in that case, since the
9466 			 * range was locked before any I/O could be submitted to the
9467 			 * new channel.
9468 			 */
9469 			spdk_bdev_for_each_channel_continue(i, 0);
9470 			return;
9471 		}
9472 	}
9473 
9474 	range = calloc(1, sizeof(*range));
9475 	if (range == NULL) {
9476 		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
9477 		return;
9478 	}
9479 
9480 	range->length = ctx->range.length;
9481 	range->offset = ctx->range.offset;
9482 	range->locked_ctx = ctx->range.locked_ctx;
9483 	ctx->current_range = range;
9484 	if (ctx->range.owner_ch == ch) {
9485 		/* This is the range object for the channel that will hold
9486 		 * the lock.  Store it in the ctx object so that we can easily
9487 		 * set its owner_ch after the lock is finally acquired.
9488 		 */
9489 		ctx->owner_range = range;
9490 	}
9491 	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
9492 	bdev_lock_lba_range_check_io(i);
9493 }
9494 
9495 static void
9496 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
9497 {
9498 	assert(spdk_get_thread() == ctx->range.owner_thread);
9499 	assert(ctx->range.owner_ch == NULL ||
9500 	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);
9501 
9502 	/* We will add a copy of this range to each channel now. */
9503 	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
9504 				   bdev_lock_lba_range_cb);
9505 }
9506 
9507 static bool
9508 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
9509 {
9510 	struct lba_range *r;
9511 
9512 	TAILQ_FOREACH(r, tailq, tailq) {
9513 		if (bdev_lba_range_overlapped(range, r)) {
9514 			return true;
9515 		}
9516 	}
9517 	return false;
9518 }
9519 
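/*
 * Starts locking an LBA range.  If the requested range overlaps a range that is
 * already locked, the request is parked on the bdev's pending list and retried
 * from bdev_unlock_lba_range_cb() once the conflicting range is unlocked;
 * otherwise the range is added to locked_ranges and propagated to every channel.
 */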
9520 static int
9521 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
9522 		     uint64_t offset, uint64_t length,
9523 		     lock_range_cb cb_fn, void *cb_arg)
9524 {
9525 	struct locked_lba_range_ctx *ctx;
9526 
9527 	ctx = calloc(1, sizeof(*ctx));
9528 	if (ctx == NULL) {
9529 		return -ENOMEM;
9530 	}
9531 
9532 	ctx->range.offset = offset;
9533 	ctx->range.length = length;
9534 	ctx->range.owner_thread = spdk_get_thread();
9535 	ctx->range.owner_ch = ch;
9536 	ctx->range.locked_ctx = cb_arg;
9537 	ctx->range.bdev = bdev;
9538 	ctx->cb_fn = cb_fn;
9539 	ctx->cb_arg = cb_arg;
9540 
9541 	spdk_spin_lock(&bdev->internal.spinlock);
9542 	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
9543 		/* There is an active lock overlapping with this range.
9544 		 * Put it on the pending list until this range no
9545 		 * longer overlaps with another.
9546 		 */
9547 		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
9548 	} else {
9549 		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
9550 		bdev_lock_lba_range_ctx(bdev, ctx);
9551 	}
9552 	spdk_spin_unlock(&bdev->internal.spinlock);
9553 	return 0;
9554 }
9555 
9556 static int
9557 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
9558 		    uint64_t offset, uint64_t length,
9559 		    lock_range_cb cb_fn, void *cb_arg)
9560 {
9561 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9562 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9563 
9564 	if (cb_arg == NULL) {
9565 		SPDK_ERRLOG("cb_arg must not be NULL\n");
9566 		return -EINVAL;
9567 	}
9568 
9569 	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
9570 }
9571 
9572 static void
9573 bdev_lock_lba_range_ctx_msg(void *_ctx)
9574 {
9575 	struct locked_lba_range_ctx *ctx = _ctx;
9576 
9577 	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
9578 }
9579 
9580 static void
9581 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
9582 {
9583 	struct locked_lba_range_ctx *ctx = _ctx;
9584 	struct locked_lba_range_ctx *pending_ctx;
9585 	struct lba_range *range, *tmp;
9586 
9587 	spdk_spin_lock(&bdev->internal.spinlock);
9588 	/* Check if there are any pending locked ranges that overlap with this range
9589 	 * that was just unlocked.  If there are, check that each one doesn't overlap with any
9590 	 * other locked range before calling bdev_lock_lba_range_ctx(), which will start
9591 	 * the lock process.
9592 	 */
9593 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
9594 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
9595 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
9596 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
9597 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9598 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
9599 			spdk_thread_send_msg(pending_ctx->range.owner_thread,
9600 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
9601 		}
9602 	}
9603 	spdk_spin_unlock(&bdev->internal.spinlock);
9604 
9605 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9606 	free(ctx);
9607 }
9608 
9609 static void
9610 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9611 				  struct spdk_io_channel *_ch, void *_ctx)
9612 {
9613 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9614 	struct locked_lba_range_ctx *ctx = _ctx;
9615 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
9616 	struct spdk_bdev_io *bdev_io;
9617 	struct lba_range *range;
9618 
9619 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9620 		if (ctx->range.offset == range->offset &&
9621 		    ctx->range.length == range->length &&
9622 		    ctx->range.locked_ctx == range->locked_ctx) {
9623 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
9624 			free(range);
9625 			break;
9626 		}
9627 	}
9628 
9629 	/* Note: we should almost always be able to assert that the range specified
9630 	 * was found.  But there are some very rare corner cases where a new channel
9631 	 * gets created simultaneously with a range unlock, where this function
9632 	 * would execute on that new channel and wouldn't have the range.
9633 	 * We also use this to clean up range allocations when a later allocation
9634 	 * fails in the locking path.
9635 	 * So we can't actually assert() here.
9636 	 */
9637 
9638 	/* Swap the locked IO into a temporary list, and then try to submit them again.
9639 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
9640 	 * with the range that was just unlocked, but this isn't a performance path so
9641 	 * we go for simplicity here.
9642 	 */
9643 	TAILQ_INIT(&io_locked);
9644 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
9645 	while (!TAILQ_EMPTY(&io_locked)) {
9646 		bdev_io = TAILQ_FIRST(&io_locked);
9647 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
9648 		bdev_io_submit(bdev_io);
9649 	}
9650 
9651 	spdk_bdev_for_each_channel_continue(i, 0);
9652 }
9653 
9654 static int
9655 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
9656 		       lock_range_cb cb_fn, void *cb_arg)
9657 {
9658 	struct locked_lba_range_ctx *ctx;
9659 	struct lba_range *range;
9660 
9661 	spdk_spin_lock(&bdev->internal.spinlock);
9662 	/* To start the unlock process, we find the range in the bdev's locked_ranges
9663 	 * and remove it. This ensures new channels don't inherit the locked range.
9664 	 * Then we will send a message to each channel to remove the range from its
9665 	 * per-channel list.
9666 	 */
9667 	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
9668 		if (range->offset == offset && range->length == length &&
9669 		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
9670 			break;
9671 		}
9672 	}
9673 	if (range == NULL) {
9674 		assert(false);
9675 		spdk_spin_unlock(&bdev->internal.spinlock);
9676 		return -EINVAL;
9677 	}
9678 	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
9679 	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9680 	spdk_spin_unlock(&bdev->internal.spinlock);
9681 
9682 	ctx->cb_fn = cb_fn;
9683 	ctx->cb_arg = cb_arg;
9684 
9685 	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9686 				   bdev_unlock_lba_range_cb);
9687 	return 0;
9688 }
9689 
9690 static int
9691 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
9692 		      uint64_t offset, uint64_t length,
9693 		      lock_range_cb cb_fn, void *cb_arg)
9694 {
9695 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9696 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9697 	struct lba_range *range;
9698 	bool range_found = false;
9699 
9700 	/* Let's make sure the specified channel actually has a lock on
9701 	 * the specified range.  Note that the range must match exactly.
9702 	 */
9703 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9704 		if (range->offset == offset && range->length == length &&
9705 		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
9706 			range_found = true;
9707 			break;
9708 		}
9709 	}
9710 
9711 	if (!range_found) {
9712 		return -EINVAL;
9713 	}
9714 
9715 	return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg);
9716 }
9717 
9718 struct bdev_quiesce_ctx {
9719 	spdk_bdev_quiesce_cb cb_fn;
9720 	void *cb_arg;
9721 };
9722 
9723 static void
9724 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status)
9725 {
9726 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
9727 
9728 	if (quiesce_ctx->cb_fn != NULL) {
9729 		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9730 	}
9731 
9732 	free(quiesce_ctx);
9733 }
9734 
9735 static void
9736 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status)
9737 {
9738 	struct bdev_quiesce_ctx *quiesce_ctx = ctx;
9739 	struct spdk_bdev_module *module = range->bdev->module;
9740 
9741 	if (status != 0) {
9742 		if (quiesce_ctx->cb_fn != NULL) {
9743 			quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9744 		}
9745 		free(quiesce_ctx);
9746 		return;
9747 	}
9748 
9749 	spdk_spin_lock(&module->internal.spinlock);
9750 	TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module);
9751 	spdk_spin_unlock(&module->internal.spinlock);
9752 
9753 	if (quiesce_ctx->cb_fn != NULL) {
9754 		quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status);
9755 		quiesce_ctx->cb_fn = NULL;
9756 		quiesce_ctx->cb_arg = NULL;
9757 	}
9758 	/* quiesce_ctx will be freed on unquiesce */
9759 }
9760 
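/*
 * Common implementation for quiesce/unquiesce.  A quiesce is an LBA range lock
 * with no owning channel: once the lock is granted, the range is added to the
 * module's quiesced_ranges list.  Unquiesce looks up the exact matching range in
 * that list, removes it and releases the underlying range lock.
 */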
9761 static int
9762 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9763 		   uint64_t offset, uint64_t length,
9764 		   spdk_bdev_quiesce_cb cb_fn, void *cb_arg,
9765 		   bool unquiesce)
9766 {
9767 	struct bdev_quiesce_ctx *quiesce_ctx;
9768 	int rc;
9769 
9770 	if (module != bdev->module) {
9771 		SPDK_ERRLOG("Bdev does not belong to specified module.\n");
9772 		return -EINVAL;
9773 	}
9774 
9775 	if (!bdev_io_valid_blocks(bdev, offset, length)) {
9776 		return -EINVAL;
9777 	}
9778 
9779 	if (unquiesce) {
9780 		struct lba_range *range;
9781 
9782 		/* Make sure the specified range is actually quiesced in the specified module and
9783 		 * then remove it from the list. Note that the range must match exactly.
9784 		 */
9785 		spdk_spin_lock(&module->internal.spinlock);
9786 		TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) {
9787 			if (range->bdev == bdev && range->offset == offset && range->length == length) {
9788 				TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module);
9789 				break;
9790 			}
9791 		}
9792 		spdk_spin_unlock(&module->internal.spinlock);
9793 
9794 		if (range == NULL) {
9795 			SPDK_ERRLOG("The range to unquiesce was not found.\n");
9796 			return -EINVAL;
9797 		}
9798 
9799 		quiesce_ctx = range->locked_ctx;
9800 		quiesce_ctx->cb_fn = cb_fn;
9801 		quiesce_ctx->cb_arg = cb_arg;
9802 
9803 		rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx);
9804 	} else {
9805 		quiesce_ctx = malloc(sizeof(*quiesce_ctx));
9806 		if (quiesce_ctx == NULL) {
9807 			return -ENOMEM;
9808 		}
9809 
9810 		quiesce_ctx->cb_fn = cb_fn;
9811 		quiesce_ctx->cb_arg = cb_arg;
9812 
9813 		rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx);
9814 		if (rc != 0) {
9815 			free(quiesce_ctx);
9816 		}
9817 	}
9818 
9819 	return rc;
9820 }
9821 
9822 int
9823 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9824 		  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9825 {
9826 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false);
9827 }
9828 
9829 int
9830 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9831 		    spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9832 {
9833 	return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true);
9834 }
9835 
9836 int
9837 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9838 			uint64_t offset, uint64_t length,
9839 			spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9840 {
9841 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false);
9842 }
9843 
9844 int
9845 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module,
9846 			  uint64_t offset, uint64_t length,
9847 			  spdk_bdev_quiesce_cb cb_fn, void *cb_arg)
9848 {
9849 	return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true);
9850 }
9851 
9852 int
9853 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
9854 			     int array_size)
9855 {
9856 	if (!bdev) {
9857 		return -EINVAL;
9858 	}
9859 
9860 	if (bdev->fn_table->get_memory_domains) {
9861 		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
9862 	}
9863 
9864 	return 0;
9865 }
9866 
9867 struct spdk_bdev_for_each_io_ctx {
9868 	void *ctx;
9869 	spdk_bdev_io_fn fn;
9870 	spdk_bdev_for_each_io_cb cb;
9871 };
9872 
9873 static void
9874 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9875 			 struct spdk_io_channel *io_ch, void *_ctx)
9876 {
9877 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
9878 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
9879 	struct spdk_bdev_io *bdev_io;
9880 	int rc = 0;
9881 
9882 	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
9883 		rc = ctx->fn(ctx->ctx, bdev_io);
9884 		if (rc != 0) {
9885 			break;
9886 		}
9887 	}
9888 
9889 	spdk_bdev_for_each_channel_continue(i, rc);
9890 }
9891 
9892 static void
9893 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
9894 {
9895 	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
9896 
9897 	ctx->cb(ctx->ctx, status);
9898 
9899 	free(ctx);
9900 }
9901 
9902 void
9903 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
9904 			   spdk_bdev_for_each_io_cb cb)
9905 {
9906 	struct spdk_bdev_for_each_io_ctx *ctx;
9907 
9908 	assert(fn != NULL && cb != NULL);
9909 
9910 	ctx = calloc(1, sizeof(*ctx));
9911 	if (ctx == NULL) {
9912 		SPDK_ERRLOG("Failed to allocate context.\n");
9913 		cb(_ctx, -ENOMEM);
9914 		return;
9915 	}
9916 
9917 	ctx->ctx = _ctx;
9918 	ctx->fn = fn;
9919 	ctx->cb = cb;
9920 
9921 	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
9922 				   bdev_for_each_io_done);
9923 }
9924 
9925 void
9926 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
9927 {
9928 	spdk_for_each_channel_continue(iter->i, status);
9929 }
9930 
9931 static struct spdk_bdev *
9932 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
9933 {
9934 	void *io_device = spdk_io_channel_iter_get_io_device(i);
9935 
9936 	return __bdev_from_io_dev(io_device);
9937 }
9938 
9939 static void
9940 bdev_each_channel_msg(struct spdk_io_channel_iter *i)
9941 {
9942 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
9943 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
9944 	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
9945 
9946 	iter->i = i;
9947 	iter->fn(iter, bdev, ch, iter->ctx);
9948 }
9949 
9950 static void
9951 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
9952 {
9953 	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
9954 	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
9955 
9956 	iter->i = i;
9957 	iter->cpl(bdev, iter->ctx, status);
9958 
9959 	free(iter);
9960 }
9961 
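/*
 * Thin wrapper around spdk_for_each_channel() that presents bdev-typed arguments
 * to the per-channel callback.  The iterator is heap-allocated here and freed in
 * bdev_each_channel_cpl() after the completion callback has run.
 */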
9962 void
9963 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
9964 			   void *ctx, spdk_bdev_for_each_channel_done cpl)
9965 {
9966 	struct spdk_bdev_channel_iter *iter;
9967 
9968 	assert(bdev != NULL && fn != NULL && ctx != NULL);
9969 
9970 	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
9971 	if (iter == NULL) {
9972 		SPDK_ERRLOG("Unable to allocate iterator\n");
9973 		assert(false);
9974 		return;
9975 	}
9976 
9977 	iter->fn = fn;
9978 	iter->cpl = cpl;
9979 	iter->ctx = ctx;
9980 
9981 	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
9982 			      iter, bdev_each_channel_cpl);
9983 }
9984 
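/*
 * Copy emulation path used when the bdev module does not support
 * SPDK_BDEV_IO_TYPE_COPY: the source range is read into a bounce buffer and then
 * written to the destination.  Submissions that fail with -ENOMEM are retried via
 * bdev_queue_io_wait_with_cb().
 */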
9985 static void
9986 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
9987 {
9988 	struct spdk_bdev_io *parent_io = cb_arg;
9989 
9990 	spdk_bdev_free_io(bdev_io);
9991 
9992 	/* Check return status of write */
9993 	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
9994 	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
9995 }
9996 
9997 static void
9998 bdev_copy_do_write(void *_bdev_io)
9999 {
10000 	struct spdk_bdev_io *bdev_io = _bdev_io;
10001 	int rc;
10002 
10003 	/* Write blocks */
10004 	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
10005 					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
10006 					    bdev_io->u.bdev.iovs[0].iov_base,
10007 					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
10008 					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);
10009 
10010 	if (rc == -ENOMEM) {
10011 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
10012 	} else if (rc != 0) {
10013 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10014 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10015 	}
10016 }
10017 
10018 static void
10019 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
10020 {
10021 	struct spdk_bdev_io *parent_io = cb_arg;
10022 
10023 	spdk_bdev_free_io(bdev_io);
10024 
10025 	/* Check return status of read */
10026 	if (!success) {
10027 		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10028 		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
10029 		return;
10030 	}
10031 
10032 	/* Do write */
10033 	bdev_copy_do_write(parent_io);
10034 }
10035 
10036 static void
10037 bdev_copy_do_read(void *_bdev_io)
10038 {
10039 	struct spdk_bdev_io *bdev_io = _bdev_io;
10040 	int rc;
10041 
10042 	/* Read blocks */
10043 	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
10044 					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
10045 					   bdev_io->u.bdev.iovs[0].iov_base,
10046 					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
10047 					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);
10048 
10049 	if (rc == -ENOMEM) {
10050 		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
10051 	} else if (rc != 0) {
10052 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10053 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10054 	}
10055 }
10056 
10057 static void
10058 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
10059 {
10060 	if (!success) {
10061 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
10062 		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
10063 		return;
10064 	}
10065 
10066 	bdev_copy_do_read(bdev_io);
10067 }
10068 
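/*
 * Submits a copy of num_blocks blocks from src_offset_blocks to dst_offset_blocks
 * on the same bdev.  A copy with identical source and destination offsets
 * completes immediately.  If the module supports SPDK_BDEV_IO_TYPE_COPY, or the
 * request must be split, the copy I/O is submitted directly; otherwise it is
 * emulated with the read/write path above after a bounce buffer is obtained via
 * spdk_bdev_io_get_buf().
 */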
10069 int
10070 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
10071 		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
10072 		      spdk_bdev_io_completion_cb cb, void *cb_arg)
10073 {
10074 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
10075 	struct spdk_bdev_io *bdev_io;
10076 	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
10077 
10078 	if (!desc->write) {
10079 		return -EBADF;
10080 	}
10081 
10082 	if (num_blocks == 0) {
10083 		SPDK_ERRLOG("Can't copy 0 blocks\n");
10084 		return -EINVAL;
10085 	}
10086 
10087 	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
10088 	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
10089 		SPDK_DEBUGLOG(bdev,
10090 			      "Invalid offset or number of blocks: dst %" PRIu64 ", src %" PRIu64 ", count %" PRIu64 "\n",
10091 			      dst_offset_blocks, src_offset_blocks, num_blocks);
10092 		return -EINVAL;
10093 	}
10094 
10095 	bdev_io = bdev_channel_get_io(channel);
10096 	if (!bdev_io) {
10097 		return -ENOMEM;
10098 	}
10099 
10100 	bdev_io->internal.ch = channel;
10101 	bdev_io->internal.desc = desc;
10102 	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
10103 
10104 	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
10105 	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
10106 	bdev_io->u.bdev.num_blocks = num_blocks;
10107 	bdev_io->u.bdev.memory_domain = NULL;
10108 	bdev_io->u.bdev.memory_domain_ctx = NULL;
10109 	bdev_io->u.bdev.iovs = NULL;
10110 	bdev_io->u.bdev.iovcnt = 0;
10111 	bdev_io->u.bdev.md_buf = NULL;
10112 	bdev_io->u.bdev.accel_sequence = NULL;
10113 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
10114 
10115 	if (dst_offset_blocks == src_offset_blocks) {
10116 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
10117 		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
10118 
10119 		return 0;
10120 	}
10121 
10122 
10123 	/* If the copy size is large and should be split, use the generic split logic
10124 	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
10125 	 *
10126 	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
10127 	 * emulate it using regular read and write requests otherwise.
10128 	 */
10129 	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
10130 	    bdev_io->internal.split) {
10131 		bdev_io_submit(bdev_io);
10132 		return 0;
10133 	}
10134 
10135 	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));
10136 
10137 	return 0;
10138 }
10139 
10140 SPDK_LOG_REGISTER_COMPONENT(bdev)
10141 
10142 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
10143 {
10144 	struct spdk_trace_tpoint_opts opts[] = {
10145 		{
10146 			"BDEV_IO_START", TRACE_BDEV_IO_START,
10147 			OWNER_BDEV, OBJECT_BDEV_IO, 1,
10148 			{
10149 				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
10150 				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
10151 				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
10152 				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
10153 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
10154 			}
10155 		},
10156 		{
10157 			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
10158 			OWNER_BDEV, OBJECT_BDEV_IO, 0,
10159 			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
10160 		},
10161 		{
10162 			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
10163 			OWNER_BDEV, OBJECT_NONE, 1,
10164 			{
10165 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
10166 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
10167 			}
10168 		},
10169 		{
10170 			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
10171 			OWNER_BDEV, OBJECT_NONE, 0,
10172 			{
10173 				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
10174 				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
10175 			}
10176 		},
10177 	};
10178 
10179 
10180 	spdk_trace_register_owner(OWNER_BDEV, 'b');
10181 	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
10182 	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
10183 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
10184 	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
10185 }
10186